In [2]:
import pandas as pd
import numpy as py
from preprocess_helper import PorterStemmer
import os
import re
from bs4 import BeautifulSoup
from string import punctuation
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wangs\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\wangs\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [60]:
def hasAlphanumeric(term):
    """Return True if at least one character of ``term`` is a letter or digit.

    An empty ``term`` yields False.
    """
    return any(letter.isalnum() for letter in term)

In [79]:
class Dataset_Preprocess:
    """Text-cleaning pipeline that turns raw issue text into cleaned tokens plus POS tags.

    The entry point is :meth:`call`. For every input text it strips HTML, drops
    non-ASCII characters, lower-cases and normalises a few technology names,
    removes most punctuation, and finally tokenises / POS-tags the text with
    NLTK while filtering stopwords.
    """

    # Default input locations used by call(); relative to the notebook's working directory.
    TAG_PATH = "../StackExchange/final_data/tag_dict.csv"
    RAW_DATA_DIR = "../StackExchange/final_data/rawdata/"
    SELECTED_ID_PATH = "../StackExchange/final_data/selected_id.txt"

    def __init__(self):
        # All three are populated lazily by create_reservedkeywords / create_stopwords / call.
        self.stopwords_list = None
        self.tag_list = None
        self.reservedkeywords = None

    def create_reservedkeywords(self, path, num_words = 100):
        """Load the tag table at ``path`` and build the reserved-keyword set.

        The CSV must provide ``Tag`` and ``Count`` columns. ``self.tag_list``
        receives every tag ordered by descending count; ``self.reservedkeywords``
        receives the ``num_words`` most frequent tags plus a fixed list of
        technology names containing special characters.
        """
        tags = pd.read_csv(path).sort_values(by=['Count'], ascending=False, ignore_index=True)
        self.tag_list = list(tags["Tag"].values)
        reservedkeywords_extra = ["c#","f#","c++","node.js","nodejs",".json",".js",".net","objective-c",
                                  "asp.net","ruby-on-rails","angular.js"]
        self.reservedkeywords = set(self.tag_list[:num_words])
        self.reservedkeywords.update(reservedkeywords_extra)

    def create_stopwords(self):
        """Build ``self.stopwords_list`` as a set.

        Starts from NLTK's English stopwords plus a few extra filler words, then
        removes every word that is also a tag so that tags are never filtered out.
        Works even if ``self.tag_list`` has not been initialised yet.
        """
        stop_words_extra = ["i'd","sometime","sometimes","something","someone","somebody","anything","anyone","anybody",
                            "everytime","everything","everyone","everybody","e.g.","e.g","e.g.,","i.e.","i.e","i.e.,","love",
                            "know","'s","wonder"]
        candidates = set(stopwords.words('english'))
        candidates.update(stop_words_extra)
        # tag_list may still be None when this is called before the tags are loaded.
        self.stopwords_list = candidates.difference(self.tag_list or ())

    def remove_non_ascii(self,sentence):
        """Return ``sentence`` with every character of code point >= 128 dropped."""
        return ''.join(char for char in sentence if ord(char) < 128)

    def html_Filter(self, sentence):
        """Return the text content of ``sentence`` after parsing it as HTML with lxml."""
        return BeautifulSoup(sentence, "lxml").text

    def keywords_transform(self, sentence):
        """Lower-case ``sentence`` and rewrite spaced technology names to their keyword form.

        The targets match the entries of the reserved-keyword list
        (e.g. "node js" -> "node.js", "angular js" -> "angular.js").
        """
        sentence = sentence.lower()
        replacements = (
            ("node js", "node.js"),
            ("objective c", "objective-c"),
            ("ruby on rails", "ruby-on-rails"),
            # Previously mapped to "angular-js", which is not a reserved keyword.
            ("angular js", "angular.js"),
        )
        for spaced, keyword in replacements:
            sentence = sentence.replace(spaced, keyword)

        return sentence

    def _tokenize_keep_sharp(self, sentence):
        """Word-tokenise one sentence, re-joining "c" / "f" with a following "#" token."""
        tokens = word_tokenize(sentence)
        merged = []
        skip_next = False

        for i, word in enumerate(tokens):
            if skip_next:
                # This "#" was already glued onto the previous token.
                skip_next = False
                continue

            if word in ("c", "f") and i + 1 < len(tokens) and tokens[i + 1] == "#":
                merged.append(word + "#")
                skip_next = True
            else:
                merged.append(word)

        return merged

    def POSTag_generation(self, sentence):
        """Tokenise and POS-tag ``sentence``, dropping stopwords and symbol-only tokens.

        Returns a pair ``(words, tags)``: two space-joined strings that are
        aligned token by token. Tags come from ``nltk.pos_tag``.
        """
        sentence_clean = []
        pos = []
        # An uninitialised stopword list filters nothing instead of raising.
        stop_words = self.stopwords_list or ()

        # Split text into individual sentences for better POS tagging.
        for single_sentence in sent_tokenize(sentence):
            for word, tag in nltk.pos_tag(self._tokenize_keep_sharp(single_sentence)):
                if hasAlphanumeric(word) and word not in stop_words:
                    sentence_clean.append(word)
                    pos.append(tag)

        return ' '.join(sentence_clean),' '.join(pos)

    def POSTag_generation1(self, sentence):
        """Tokenise and POS-tag ``sentence`` and return a single ``word_TAG`` string.

        Every token is kept. A token whose tag equals the token itself
        is emitted once rather than as ``x_x``.
        """
        sentence_clean = []

        # Split text into individual sentences for better POS tagging.
        for single_sentence in sent_tokenize(sentence):
            for word, tag in nltk.pos_tag(self._tokenize_keep_sharp(single_sentence)):
                if word == tag:
                    sentence_clean.append(word)
                else:
                    sentence_clean.append(word+"_"+tag)

        return ' '.join(sentence_clean)

    def remove_specialchar(self, sentence, char_to_keep = {'#','+','.','-','\'','"',':','?','!',',','_'}):
        """Replace every ASCII punctuation character not in ``char_to_keep`` with a space."""
        punct_set = set(punctuation).difference(char_to_keep)
        # One translate pass instead of one str.replace per punctuation character.
        return sentence.translate(str.maketrans({char: " " for char in punct_set}))

    def sentence_stem(self, sentence):
        """Stem the alphanumeric tokens of ``sentence``.

        Tokens are obtained by splitting on single spaces. Purely alphanumeric
        tokens are stemmed with ``PorterStemmer``; other tokens are kept
        verbatim only if they are reserved keywords and are dropped otherwise.
        """
        stemmer = PorterStemmer()
        # Without reserved keywords loaded, non-alphanumeric tokens are simply dropped.
        reserved = self.reservedkeywords or ()
        kept = []

        for token in sentence.split(' '):
            if token.isalnum():
                kept.append(stemmer.stem(token, 0,len(token)-1))
            elif token in reserved:
                kept.append(token)

        return ' '.join(kept).strip()

    def load_dataset(self, path, data_selectedId=None):
        """Read the text files named ``<id + 1>`` under ``path``.

        Args:
            path: directory containing the files (trailing separator optional).
            data_selectedId: iterable of integer ids; the file opened for an id
                is ``str(id + 1)``. Defaults to ``0 .. number_of_entries - 1``.

        Returns:
            One string per readable file: its non-blank lines, stripped and
            joined with single spaces. Files that cannot be opened or decoded
            are reported on stdout and skipped.
        """
        print("Start loading data...")

        if data_selectedId is None:
            data_selectedId = range(len(os.listdir(path)))

        questions = []

        for file_index in data_selectedId:
            name = str(file_index+1)
            try:
                with open(os.path.join(path, name), 'r', encoding="utf-8") as f:
                    stripped_lines = (line.strip() for line in f)
                    questions.append(" ".join(line for line in stripped_lines if line))
            except OSError:
                print(name,"is missing!")
            except UnicodeError as err:
                print(name,"could not be decoded:",err)

        print("Loaded",len(questions))
        return questions

    def call(self, dataset = None, keywords = None):
        """Run the full cleaning pipeline.

        Args:
            dataset: iterable of raw texts. When omitted, the ids listed in
                ``SELECTED_ID_PATH`` are loaded from ``RAW_DATA_DIR``.
            keywords: reserved keywords to use. When omitted, they are derived
                from the tag table at ``TAG_PATH``.

        Returns:
            ``(data_clean, pos)``: two parallel lists holding, per input text,
            the cleaned token string and its POS-tag string.
        """
        if dataset is None:
            with open(self.SELECTED_ID_PATH, 'r') as f:
                # Blank lines are ignored; int() tolerates the trailing newline.
                data_selectedId = sorted(int(line) for line in f if line.strip())

            dataset = self.load_dataset(self.RAW_DATA_DIR, data_selectedId)

        # Initialize tag_list (needed by create_stopwords) and the reserved keywords.
        if keywords is None:
            self.create_reservedkeywords(self.TAG_PATH)
        else:
            self.tag_list = list(pd.read_csv(self.TAG_PATH)["Tag"].values)
            self.reservedkeywords = keywords

        self.create_stopwords()

        data_clean = []
        pos = []

        for sentence in dataset:
            sentence = self.html_Filter(sentence)
            sentence = self.remove_non_ascii(sentence)
            sentence = self.keywords_transform(sentence)
            sentence = self.remove_specialchar(sentence)
            # Stemming (sentence_stem) is currently not applied to the output.
            sentence,posTag = self.POSTag_generation(sentence)

            data_clean.append(sentence.strip())
            pos.append(posTag.strip())

        return data_clean,pos

In [3]:
# Smoke test: run the full pipeline on a handful of hand-written sentences.
# NOTE(review): the saved output of this cell still contains stopwords and parentheses,
# so it appears to come from an earlier version of the class — re-run to refresh it.
data_preprocess1 = Dataset_Preprocess()
test1 = ["im trying to do","I'm trying to set a property in my component after it's been loaded","I'm trying to import a model from ssd","I was trying to build my dockerfile locally with an intention to push it into my private registry running on port 5000","Hey, is there native compiler for typescript? I see that on github compiler is written in typescript, and it takes a lot of time to distribute (and I guess native one would be faster?). By native I mean something written in c++ or c# compiled to dll or exe."]
data_preprocess1.call(dataset=test1)

(['im trying to do',
  "i 'm trying to set a property in my component after it 's been loaded",
  "i 'm trying to import a model from ssd",
  'i was trying to build my dockerfile locally with an intention to push it into my private registry running on port 5000',
  'hey , is there native compiler for typescript ? i see that on github compiler is written in typescript , and it takes a lot of time to distribute ( and i guess native one would be faster ? ) . by native i mean something written in c++ or c# compiled to dll or exe .'],
 ['NN VBG TO VB',
  'NN VBP VBG TO VB DT NN IN PRP$ NN IN PRP VBZ VBN JJ',
  'NN VBP VBG TO VB DT NN IN NN',
  'NN VBD VBG TO VB PRP$ NN RB IN DT NN TO VB PRP IN PRP$ JJ NN VBG IN NN CD',
  'NN , VBZ RB JJ NN IN NN . NN VBP IN IN NN NN VBZ VBN IN NN , CC PRP VBZ DT NN IN NN TO VB ( CC VB NN JJ CD MD VB JJR . ) . IN JJ NNS VBP NN VBN IN NN CC NN VBN TO VB CC VB .'])

## Import data

In [80]:
# Load the annotated chat issues; missing cells become the sentinel string "N".
df = pd.read_csv("../chat_pattern/chat_annotation_1000_pos.csv").fillna("N")

In [81]:
df.head()

Unnamed: 0,issue,POS,ETD_sent,Pattern for ETD,predicted_ETD,Comment_ETD,y_ETD,y'_ETD,accuracy_ETD,PS_sent,Pattern for PS,predicted_PS,Comment_PS,y_PS,y'_PS,accuracy_PS,TS_sent
0,"Hey guys, I'm trying to set a property in my c...","Hey-UH guys-NNS , I-PRP 'm-VBP trying-VBG to-T...",T1: I'm trying to set a property in my compone...,ETD_TRYING_TO,"ETD_TRYING_TO,",N,1,1,1,But everywhere I try to do it messes with chan...,PS_PROBLEM,"PS_PROBLEM,",N,N,N,N,N
1,"Hi, I'm following tutorials from angular.io an...","Hi-UH , I-PRP 'm-VBP following-VBG tutorials-N...",Can I configure building anyhow that I don't h...,ETD_CAN_QUESTION,"ETD_CAN_QUESTION,",N,1,1,1,T1: When I move it somewhere else I have probl...,PS_PROBLEM,"PS_NEG_ADV_ADJ,PS_NEG_AUX_VERB,PS_VERB_ERROR,P...",N,N,N,N,N
2,Trying to come up with a generic pattern valid...,Trying-VBG to-TO come-VB up-RP with-IN a-DT ge...,T1: Trying to come up with a generic pattern v...,ETD_TRYING_TO,"ETD_INSTEAD_OF_EXP_BEHAVIOR,ETD_TRYING_TO,",N,1,1,1,T2: but running into an issue of only getting ...,PS_PROBLEM,"PS_PROBLEM,PS_NO_NOUN,PS_ONLY,",N,N,N,N,N
3,I'm sure this has been asked plenty before but...,I-PRP 'm-VBP sure-JJ this-DT has-VBZ been-VBN ...,T1: now looking to move to Angular2,ETD_LOOKING_TO,"ETD_LOOKING_TO,",N,1,1,1,N,N,N,N,N,N,N,N
4,Does anyone know why there is anngSubmitEventE...,Does-VBZ anyone-NN know-VB why-WRB there-EX is...,N,N,"ETD_LOOKING_TO,",N,0,1,0,T1: But I don't see any reason why there is a ...,"PS_NEG_AUX_VERB,PS_ONLY","PS_NEG_AUX_VERB,",N,N,N,N,N


In [82]:
# Load the held-out testing issues; missing cells become the sentinel string "N".
df_test = pd.read_csv("../chat_pattern/chat_testing_200.csv").fillna("N")
df_test.head()

Unnamed: 0,issue,PS_sent,predicted_PS,ETD_sent,predicted_ETD,y_PS,y'_PS,Acc_PS,y_ETD,y'_ETD,Acc_ETD
0,"Hi, when I am trying to apply class dynamicall...",T2: it is messing up my material css i.e I am ...,"PS_NEG_VERB,PS_NEG_AUX_ADV_ADJ,PS_NEG_AUX_VERB,",T1: when I am trying to apply class dynamicall...,"ETD_TRYING_TO,",1,1,1,1,1,1
1,is it possible to get a RouteConfig matched ag...,N,N,T1: I'm trying to create a Breadcrumb componen...,"ETD_BE_POSSIBLE_TO,ETD_TRYING_TO,ETD_WOULD_LIKE,",0,0,1,1,1,1
2,I have angular running from a .Net Core server...,T1: The server won't even load the app (by des...,"PS_NEG_AUX_VERB,",N,N,1,1,1,0,0,1
3,"Hello everyone, I want to do sub menu with sea...",T2: but i have problem in sub menu,"PS_PROBLEM,",T1: I want to do sub menu with searh like in g...,"ETD_WANT_TO,ETD_CAN_QUESTION,",1,1,1,1,1,1
4,I'm struggling with getting this logic\n \t```...,T1: I'm struggling with getting this logic,"PS_STRUGGLING,PS_VERB_NO,",T2: I wanted to set a default value to the dro...,"ETD_WANT_TO,",1,1,1,1,1,1


In [83]:
# Annotated issues first, testing issues appended after them, in one flat list.
all_data = [*df["issue"].values, *df_test["issue"].values]

In [84]:
len(all_data)

1200

In [85]:
# Clean every issue; issues_clean[i] and pos_list[i] are the token string and the
# POS-tag string produced for all_data[i].
data_preprocess = Dataset_Preprocess()
issues_clean,pos_list = data_preprocess.call(dataset=all_data)



In [86]:
issues_clean[:10]

["hey guys 'm trying set property component loaded everywhere try messes change detection any tips gist code quick look",
 "hi 'm following tutorials angular.io maybe weird question configure building anyhow n't .js .ts files folder next each causing quite mess move somewhere else problems systemjs tho",
 'trying come generic pattern validation directive since none built running issue getting string name variable passed directive instead variable any ideas',
 "'m sure this asked plenty bit javascript angular 1.x novice looking move angular2 go typescript javascript taking steep learning curve v1 v2 course",
 "anngsubmiteventemitter looking at source ofngform listens submit event emits thengsubmitemitter n't see any reason separate event",
 "hi 'm having trouble cryptic error messages router could help app two components wich contain child routes one components routing works fine navigate detail component router throws error error component contentdetail route config painstakingly going

In [87]:
pos_list[:10]

['NN NNS VBP VBG VB NN NN VBN RB NN VBZ NN NN DT NNS NN NN JJ NN',
 'NN VBP VBG NNS NN RB JJ NN NN VBG NN RB NNS JJ NNS NN IN DT VBG RB NN VBP RB RB NNS JJ NN',
 'VBG VB JJ NN NN JJ IN NN VBN VBG NN VBG NN NN JJ VBN JJ RB JJ DT NNS',
 'VBP JJ DT VBN JJ NN NN JJ CD NN VBG VB VB VB NN NN VBG JJ NN NN JJ NN NN',
 'NN VBG IN NN NN VBZ NN NN VBZ NN RB VB DT NN JJ NN',
 'NN VBP VBG NN JJ NN NNS NN MD VB NN CD NNS JJ VBP JJ NNS CD NNS VBG NNS JJ VBP NN NN NN VBZ NN NN NN NN NN NN RB VBG NN CD NNS JJ NNS VB DT NN NN VB DT NN IN CD NNS NN NNS VBN VB VBG',
 'NN VBP VBG VB NN VBZ NN VBN IN NN VB NN NN VB JJ NN VBG NN',
 'NN NN NN NNS JJ NNS VBN NN WRB NN VBN RB NN VBG NN NN VB VB NN NN JJ NNS',
 'NN MD VB JJR NN VB NN VB JJ NN VBP VB VBG NN NN VBG JJ RB NN',
 'VBP VBG NN NN NNS CD NN NNS VBG NN NN NN VBP NN VBP NN JJ JJ NN NN NN NN JJ NN NN VB NN NNP JJ POS VBP DT JJ NNS VBG NN NNS NN NN NN NN']

In [35]:
# NOTE(review): df_samples is not defined in any visible cell, so this cell fails on a
# fresh kernel. The SettingWithCopyWarning pandas emitted here suggests it was a filtered
# slice of another frame — presumably it needs an explicit definition with .copy(); confirm.
df_samples["issue_clean"] = issues_clean
df_samples.to_csv("../../ISPY_data/ispy_annotation_clean.csv",index = None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_samples["issue_clean"] = issues_clean


## N-gram generation

In [77]:
from sklearn.feature_extraction.text import CountVectorizer

In [92]:
#{1,2,3}-grams
# Binary presence features over word 1- to 3-grams of the cleaned issues.
# NOTE(review): the vocabulary is fitted on all of issues_clean, i.e. on the annotated and
# the testing issues together (they are split at row 1000 further down) — confirm this is intended.
# NOTE(review): CountVectorizer's default token_pattern keeps only runs of 2+ word
# characters, so tokens such as "c#", "c++" or ".js" preserved by the preprocessing are
# presumably split or dropped here — verify against the feature names.
count_vect = CountVectorizer(ngram_range = (1,3),binary = True)
X_train_counts = count_vect.fit_transform(issues_clean)
X_train_counts.shape

(1200, 85947)

In [93]:
count_vect.get_feature_names_out()

array(['00', '00 00', '00 00 quote_timings', ..., 'zyisp',
       'zyisp conditionop', 'zyisp conditionop greaterorequal'],
      dtype=object)

In [101]:
# Slice the sparse matrix first so only the first 1000 rows are densified for display.
X_train_counts[:1000].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [102]:
# First 1000 rows = n-gram features of the annotated issues. Slicing the sparse matrix
# before toarray() avoids densifying all rows just to discard the tail.
df_train = pd.DataFrame(data = X_train_counts[:1000].toarray(), columns = count_vect.get_feature_names_out())

In [103]:
df_train

Unnamed: 0,00,00 00,00 00 quote_timings,00 10,00 10 cron,00 21,00 21 20,00 one,00 one please,00 quote_timings,...,zoom out,zoom out like,zoom_0,zoom_0 gif,zpfxellose,zpfxellose val,zpfxellose val nn,zyisp,zyisp conditionop,zyisp conditionop greaterorequal
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [106]:
# Rows from 1000 onward = n-gram features of the testing issues. Slicing the sparse
# matrix before toarray() avoids densifying the first 1000 rows needlessly.
# NOTE(review): this rebinds df_test, which earlier held the raw testing CSV; re-running
# the cell that builds all_data after this one will no longer read the raw issues.
df_test = pd.DataFrame(data = X_train_counts[1000:].toarray(), columns = count_vect.get_feature_names_out())

In [107]:
df_test

Unnamed: 0,00,00 00,00 00 quote_timings,00 10,00 10 cron,00 21,00 21 20,00 one,00 one please,00 quote_timings,...,zoom out,zoom out like,zoom_0,zoom_0 gif,zpfxellose,zpfxellose val,zpfxellose val nn,zyisp,zyisp conditionop,zyisp conditionop greaterorequal
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
# Persist the n-gram feature matrices without the row index.
df_train.to_csv("../chat_pattern/training/ngram_training.csv", index=False)
df_test.to_csv("../chat_pattern/testing/ngram_testing.csv", index=False)

## POS generation

In [109]:
#{1,2,3}-grams
# Binary presence features over 1- to 3-grams of the POS-tag sequences.
# NOTE(review): the vocabulary is fitted on all of pos_list (annotated + testing issues).
# NOTE(review): with the default settings the tags are lower-cased (the feature names
# come out as 'cc', 'wrb', ...), and the default token_pattern presumably strips the "$"
# of tags like PRP$ / WP$, merging them with PRP / WP — confirm whether that is acceptable.
count_vect_pos = CountVectorizer(ngram_range = (1,3),binary = True)
X_train_pos = count_vect_pos.fit_transform(pos_list)
X_train_pos.shape

(1200, 3452)

In [110]:
count_vect_pos.get_feature_names_out()

array(['cc', 'cc jj', 'cc jj nn', ..., 'wrb vbz rb', 'wrb wrb',
       'wrb wrb rb'], dtype=object)

In [111]:
X_train_pos.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [112]:
# Split the POS features at row 1000 (annotated issues first, testing issues after).
# The feature names are fetched once and each part is densified separately instead of
# converting the whole matrix twice.
# NOTE(review): df_train / df_test are rebound here and no longer hold the n-gram features.
pos_feature_names = count_vect_pos.get_feature_names_out()
df_train = pd.DataFrame(data = X_train_pos[:1000].toarray(), columns = pos_feature_names)
df_test = pd.DataFrame(data = X_train_pos[1000:].toarray(), columns = pos_feature_names)

In [113]:
# Persist the POS feature matrices without the row index.
df_train.to_csv("../chat_pattern/training/pos_training.csv", index=False)
df_test.to_csv("../chat_pattern/testing/pos_testing.csv", index=False)

## Appendix

In [2]:
# Appendix: load the issue/solution pairs used to assemble the annotation pool.
# NOTE(review): absolute, user-specific Windows path — not reproducible on another machine.
# NOTE(review): rebinds df, which the main section uses for the annotated chat issues.
df = pd.read_csv("C:\\Users\\wangs\\Desktop\\lucene\\angular_ispy.csv")
df

Unnamed: 0,Index,issue,solution
0,0,there isn't a.just()in Rx5 note that that's ...,Check out this . This is the documentation for...
1,1,Shouldn'tDynamicComponentLoader#loadIntoLocati...,gionkunz: so how could i add it inside the div...
2,2,Will there be a material stuff for angular 2?,Currently I am using . I wonder if I am going...
3,3,Has anyone run into a situation where ng2 inst...,justindujardin: no.. are you sure it doesn't m...
4,4,Who hereplans on writing all their angular 2 a...,seanpar203: the biggest issue in using pure JS...
...,...,...,...
5942,5942,that's a new one :),
5943,5943,I'm running angular 8.2.14 wanting to migrate ...,"I saw no example of that, even I saw lot of un..."
5944,5944,East coast here too. No celebrating just yet XD,
5945,5945,so your api is on port 8080? your http call sh...,I set up a docker environment and nginx proxy ...


In [31]:
# Keep only issues longer than 200 characters. Vectorised instead of one .loc lookup per
# row; missing issues (NaN) are skipped rather than raising on len().
long_issue_mask = df["issue"].str.len() > 200
issue_selected = df.loc[long_issue_mask, "issue"].tolist()

In [32]:
len(issue_selected)

185

In [None]:
issue_selected

In [33]:
# Load the chat messages exported to Angular_issue.json (columns shown in the output below).
df_all = pd.read_json("../Gitter_Channels/Angular/Angular_issue.json")
df_all

Unnamed: 0,Id,asker,time,content,lines
0,1,54c89b1cdb8155e6700f2ac3,2015-03-07 21:41,Awesome!\n,1
1,2,54e6ee3b15522ed4b3dc3c24,2015-03-07 21:53,"sweet, now we can ask angular2 questions in re...",1
2,3,530cd1eb5e986b0712efb4d2,2015-03-07 22:04,Yes!\n,1
3,4,54e6ee3b15522ed4b3dc3c24,2015-03-07 22:06,we're missing the [View Source] [Improve this ...,1
4,5,530cd1eb5e986b0712efb4d2,2015-03-07 23:18,Good idea can you file an issue with repo?\n,1
...,...,...,...,...,...
422969,422970,572a669fc43b8c6019712dd4,2022-08-16 18:50,"Hey guys, do the custom webpack.config.json us...",2
422970,422971,5ea477add73408ce4fe1af23,2022-08-18 14:26,Hi there. A short question. Is there any benef...,1
422971,422972,62fe56da6da03739849b8211,2022-08-18 15:13,[![MicrosoftTeams-image (1).png](https://files...,2
422972,422973,matrix-sharonmary:matrix.org,2022-08-19 03:23,Good news Put an end to your financial proble...,1


In [34]:
# Messages timestamped after 2020-12-31 23:59 (string comparison on the time column),
# renumbered from 0.
df_select = df_all[df_all.time > "2020-12-31 23:59"].reset_index(drop=True)

In [35]:
df_select

Unnamed: 0,Id,asker,time,content,lines
0,419821,5ea477add73408ce4fe1af23,2021-01-01 16:14,"Happy New Year to everyone!\nGot a question, t...",2
1,419822,5d8e3450d73408ce4fcc4164,2021-01-01 16:15,Happy New Year\n,1
2,419823,5ea477add73408ce4fe1af23,2021-01-01 16:16,So I have lazy preloaded feature module which ...,2
3,419824,579228bd40f3a6eec05c03aa,2021-01-01 16:19,"@artart37 Yep, you need to import all necessar...",2
4,419825,5ea477add73408ce4fe1af23,2021-01-01 16:21,@mlc-mlapis Thank you! Thats what I was worrie...,11
...,...,...,...,...,...
3149,422970,572a669fc43b8c6019712dd4,2022-08-16 18:50,"Hey guys, do the custom webpack.config.json us...",2
3150,422971,5ea477add73408ce4fe1af23,2022-08-18 14:26,Hi there. A short question. Is there any benef...,1
3151,422972,62fe56da6da03739849b8211,2022-08-18 15:13,[![MicrosoftTeams-image (1).png](https://files...,2
3152,422973,matrix-sharonmary:matrix.org,2022-08-19 03:23,Good news Put an end to your financial proble...,1


In [36]:
# Keep only messages longer than 200 characters. Vectorised instead of one .loc lookup per
# row; missing content (NaN) is skipped rather than raising on len().
long_content_mask = df_select["content"].str.len() > 200
issue_selected2 = df_select.loc[long_content_mask, "content"].tolist()

In [37]:
len(issue_selected2)

1456

In [38]:
# Append the newer messages to the earlier selection.
# NOTE(review): mutates issue_selected in place, so running this cell twice appends the
# messages twice.
issue_selected.extend(issue_selected2)

In [39]:
len(issue_selected)

1641

In [40]:
# Save the combined selection as a single-column ("issue") CSV without the row index.
df_data = pd.DataFrame({"issue": issue_selected})
df_data.to_csv("../../openai/finaldata/angular_2016_2022.csv", index=False)

In [2]:
# Reload the combined selection written above.
# NOTE(review): rebinds df once more; later cells read df["issue"] from this file.
df = pd.read_csv("../../openai/finaldata/angular_2016_2022.csv")
df

Unnamed: 0,issue
0,"Hey guys, I'm trying to set a property in my c..."
1,"Hi, I'm following tutorials from angular.io an..."
2,Trying to come up with a generic pattern valid...
3,I'm sure this has been asked plenty before but...
4,The specific layout you choose isn't something...
...,...
1636,"Hey guys, do the custom webpack.config.json us..."
1637,Hi there. A short question. Is there any benef...
1638,[![MicrosoftTeams-image (1).png](https://files...
1639,Good news Put an end to your financial proble...


In [5]:
# Write one pair of files per issue: <NNNN>.txt holds the issue text and <NNNN>.ann is
# created empty. NNNN is the row position zero-padded to four digits.
# Files are written as UTF-8 explicitly: the platform default encoding can fail on
# characters it cannot represent.
# NOTE(review): absolute, user-specific Windows path — not reproducible on another machine.
OUTPUT_DIR = "C:\\Users\\wangs\\Desktop\\test1\\data\\"

for i,issue in enumerate(df["issue"]):
    filename = str(i).zfill(4)
    with open(OUTPUT_DIR+filename+".txt", 'w', encoding="utf-8") as fp:
        fp.write(issue)
    with open(OUTPUT_DIR+filename+".ann", 'w', encoding="utf-8") as fp:
        fp.write("")
print('Done')

Done
