In [1]:
import pandas as pd
import numpy as py
from preprocess_helper import PorterStemmer
import os
import re
from bs4 import BeautifulSoup
from string import punctuation
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wangs\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\wangs\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Import dataset

In [84]:
# Directory that holds one raw question file per id.
path = "../StackExchange/final_data/rawdata/"
# NOTE(review): not referenced anywhere else in the visible notebook.
htmlFilterred = []

# List the directory once to report how many raw files are available.
allfiles = os.listdir(path)
print(f"{len(allfiles)} files")

2247623 files


In [14]:
# Read the ids of the selected questions, one integer per line.
with open("../StackExchange/final_data/selected_id.txt", 'r') as f:
    all_ids = list(f)

# Strip the trailing newline from every entry and keep the ids in ascending order.
data_selectedId = sorted(map(lambda tid: int(tid.replace("\n", "")), all_ids))
print(len(data_selectedId))

588644


In [86]:
# Load every selected question into memory as one whitespace-normalised string.
# Ids whose raw file cannot be read are collected in `id_missing`.
questions = []
id_missing = []
for filename in data_selectedId:
    # Retrieve qa_text: the raw file is named after the selected id plus one.
    try:
        with open(path+str(filename+1), 'r', encoding="utf-8") as f:
            # Drop blank lines and join the remaining stripped lines with single spaces.
            text = " ".join(line.strip() for line in f if line.strip())
    except (OSError, UnicodeError):
        # Only file-system and decoding problems count as "missing"; anything else
        # (including a manual interrupt of this long loop) must propagate.
        id_missing.append(filename)
        print(str(filename+1),"is missing!")
    else:
        questions.append(text)

144552 is missing!
180636 is missing!
365328 is missing!
460582 is missing!
495266 is missing!
857720 is missing!
875208 is missing!
967918 is missing!
1035619 is missing!
1186719 is missing!


In [87]:
# Number of questions that were read successfully (selected ids minus the missing files).
len(questions)

588634

In [124]:
# Preview the first ten raw questions (title followed by the HTML body).
questions[:10]

['What is the difference between Intel and PPC? <p>What is the hardware and software differences between Intel and PPC Macs?</p>',
 'Turn on Back To My Mac via a Script or Command Line <p>The VPN software I use for work (<a href="http://www.lobotomo.com/products/IPSecuritas/">IPSecuritas</a>) requires me to turn off Back To My Mac to start it\'s connection, so I frequently turn off Back To My Mac in order to use my VPN connection (the program does this for me). I forget to turn it back on however and I\'d love to know if there was something I could run (script, command) to turn it back on.</p>',
 "Why doesn't Microsoft Office/2008(& later) support RTL languages? <p>I have Microsoft Office/2008 on my MacBook Pro. Office doesn't support RTL languages like Farsi and Arabic, and I know that Office/2010 (for Windows) also has the same problem.</p> <p>Do you think the lack of support is because of business competition, or some other reason?</p>",
 "Repair Disk - Start up disk options <p>I ha

In [2]:
# Reload the raw questions from the previously saved, filtered CSV
# instead of re-reading the individual raw files.
df = pd.read_csv("../StackExchange/final_data/stackexchange_filtered.csv")
questions = [question for question in df["Question"].values]

In [3]:
# Preview the first ten questions as reloaded from the CSV.
questions[:10]

['What is the difference between Intel and PPC? <p>What is the hardware and software differences between Intel and PPC Macs?</p>',
 'Turn on Back To My Mac via a Script or Command Line <p>The VPN software I use for work (<a href="http://www.lobotomo.com/products/IPSecuritas/">IPSecuritas</a>) requires me to turn off Back To My Mac to start it\'s connection, so I frequently turn off Back To My Mac in order to use my VPN connection (the program does this for me). I forget to turn it back on however and I\'d love to know if there was something I could run (script, command) to turn it back on.</p>',
 "Why doesn't Microsoft Office/2008(& later) support RTL languages? <p>I have Microsoft Office/2008 on my MacBook Pro. Office doesn't support RTL languages like Farsi and Arabic, and I know that Office/2010 (for Windows) also has the same problem.</p> <p>Do you think the lack of support is because of business competition, or some other reason?</p>",
 "Repair Disk - Start up disk options <p>I ha

## Preprocessing

In [4]:
class Dataset_Preprocess:
    """Text-cleaning pipeline for question texts.

    ``call`` is the entry point. It strips HTML, drops non-ASCII characters,
    normalises a few multi-word technology names, replaces most punctuation
    with spaces and stems the remaining tokens. Tokens listed in
    ``reservedkeywords`` (e.g. ``c#``, ``.net``) survive stemming untouched.
    """

    def __init__(self):
        # All three attributes are filled in by ``call`` (or by calling the
        # ``create_*`` helpers directly).
        self.stopwords_list = None
        self.tag_list = None
        self.reservedkeywords = None

    def create_reservedkeywords(self, path, num_words = 100):
        """Build ``tag_list`` and ``reservedkeywords`` from a tag CSV.

        Args:
            path: CSV file with at least the columns ``Tag`` and ``Count``.
            num_words: how many of the most frequent tags become reserved keywords.

        Side effects:
            ``self.tag_list`` holds every tag ordered by descending ``Count``;
            ``self.reservedkeywords`` is a set with the ``num_words`` most
            frequent tags plus a fixed list of technology names.
        """
        df = pd.read_csv(path)
        df = df.sort_values(by=['Count'], ascending=False, ignore_index=True)
        self.tag_list = list(df["Tag"].values)
        reservedkeywords_extra = ["c#","f#","c++","node.js","nodejs",".json",".js",".net","objective-c",
                                  "asp.net","ruby-on-rails","angular.js"]
        self.reservedkeywords = set(self.tag_list[:num_words])
        self.reservedkeywords.update(reservedkeywords_extra)

    def create_stopwords(self):
        """Build ``stopwords_list``: English stop words plus extras, minus known tags.

        A stop word that is also a tag in ``self.tag_list`` is kept out of the
        stop-word set so that it is never filtered away.
        """
        try:
            base_stopwords = stopwords.words('english')
        except LookupError:
            # The corpus is not installed yet (fresh environment): fetch it once.
            nltk.download('stopwords')
            base_stopwords = stopwords.words('english')

        stop_words_extra = ["i'd","sometime","sometimes","something","someone","somebody","anything","anyone","anybody",
                            "everytime","everything","everyone","everybody","e.g.","e.g","e.g.,","i.e.","i.e","i.e.,","love",
                            "know","'s","wonder"]
        candidates = set(base_stopwords)
        candidates.update(stop_words_extra)
        # Tolerate a missing tag list instead of failing inside set().
        known_tags = self.tag_list if self.tag_list is not None else ()
        self.stopwords_list = candidates.difference(known_tags)

    def remove_non_ascii(self,sentence):
        """Return ``sentence`` without any character whose code point is >= 128."""
        return ''.join(char for char in sentence if ord(char) < 128)

    def html_Filter(self, sentence):
        """Return the visible text of ``sentence`` with HTML markup removed (lxml parser)."""
        return BeautifulSoup(sentence, "lxml").text

    def keywords_transform(self, sentence):
        """Lower-case ``sentence`` and rewrite spaced technology names to their keyword form."""
        sentence = sentence.lower()
        # "angular js" maps to "angular.js" so that it matches the reserved keyword
        # of the same spelling; the former "angular-js" matched nothing and was
        # dropped by sentence_stem.
        replacements = (
            ("node js", "node.js"),
            ("objective c", "objective-c"),
            ("ruby on rails", "ruby-on-rails"),
            ("angular js", "angular.js"),
        )
        for spaced, keyword in replacements:
            sentence = sentence.replace(spaced, keyword)

        return sentence

    def POSTag_Removal(self, sentence, remainPOS = ["NN","NNP","NNS","NNPS","VB","VBG","VBD","VBN","VBP","VBZ"]):
        """Keep only reserved keywords and non-stop-words whose POS tag is in ``remainPOS``.

        Requires ``self.reservedkeywords`` and ``self.stopwords_list`` to be set.

        Args:
            sentence: text to filter; it is split into sentences before tagging.
            remainPOS: POS tags (as returned by ``nltk.pos_tag``) that are kept.

        Returns:
            The surviving tokens joined by single spaces.
        """
        sentence_clean = []

        # Tag sentence by sentence for better POS tagging.
        for single_sentence in sent_tokenize(sentence):
            pos_words = nltk.pos_tag(word_tokenize(single_sentence))

            for i, (word, pos) in enumerate(pos_words):
                followed_by_hash = i + 1 < len(pos_words) and pos_words[i + 1][0] == "#"
                if word in ("c", "f") and followed_by_hash:
                    # Re-join "c" / "f" with the "#" token that follows it ("c#", "f#").
                    sentence_clean.append(word + "#")
                elif (word in self.reservedkeywords) or (word not in self.stopwords_list and pos in remainPOS):
                    # A bare "c" or "f" is also judged by this rule instead of being
                    # dropped unconditionally.
                    sentence_clean.append(word)

        return ' '.join(sentence_clean)

    def remove_specialchar(self, sentence, char_to_keep = {'#','+','.','-'}):
        """Replace every punctuation character not in ``char_to_keep`` with a space."""
        punct_set = set(punctuation).difference(char_to_keep)
        return sentence.translate({ord(char): " " for char in punct_set})

    def sentence_stem(self, sentence):
        """Stem alphanumeric tokens and keep reserved keywords; drop every other token.

        Requires ``self.reservedkeywords`` to be set.
        """
        p = PorterStemmer()
        kept = []

        for token in sentence.split(' '):
            if token.isalnum():
                kept.append(p.stem(token, 0,len(token)-1))
            elif token in self.reservedkeywords:
                kept.append(token)

        return ' '.join(kept).strip()

    def load_dataset(self, path, data_selectedId=None):
        """Read raw question files and return them as single-line strings.

        Args:
            path: directory prefix (including the trailing separator); the file
                for id ``i`` is ``path + str(i + 1)``.
            data_selectedId: iterable of integer ids to load. When omitted, the
                ids ``0 .. n-1`` are used, ``n`` being the number of entries in
                ``path``.

        Returns:
            One string per readable file: its non-blank lines, stripped and
            joined with single spaces. Unreadable files are reported and skipped.
        """
        print("Start loading data...")

        if data_selectedId is None:
            # Only list the directory when it is actually needed.
            data_selectedId = range(len(os.listdir(path)))

        questions = []

        for filename in data_selectedId:
            file_id = str(filename + 1)
            try:
                with open(path + file_id, 'r', encoding="utf-8") as f:
                    lines = [line.strip() for line in f]
            except (OSError, UnicodeError):
                # Missing or unreadable file: report it and move on.
                print(file_id,"is missing!")
                continue

            questions.append(" ".join(line for line in lines if line))

        # Report once at the end rather than once per file.
        print("Loaded",len(questions))
        return questions

    def call(self, dataset = None, keywords = None):
        """Run the cleaning pipeline over ``dataset`` and return the cleaned texts.

        Args:
            dataset: iterable of raw texts. When omitted, the selected raw
                question files are loaded from the default data directory.
            keywords: collection of reserved keywords. When omitted, they are
                derived from the tag CSV via ``create_reservedkeywords``.

        Returns:
            A list with one cleaned, stemmed string per input text.
        """
        data_clean = []
        tag_path = "../StackExchange/final_data/tag_dict.csv"

        if dataset is None:
            file_path = "../StackExchange/final_data/rawdata/"
            with open("../StackExchange/final_data/selected_id.txt", 'r') as f:
                # One id per line; blank lines are ignored.
                data_selectedId = sorted(int(tid) for tid in f if tid.strip())

            dataset = self.load_dataset(file_path, data_selectedId)

        # Initialise the tag list and the reserved keywords.
        if keywords is None:
            self.create_reservedkeywords(tag_path)
        else:
            df = pd.read_csv(tag_path)
            self.tag_list = list(df["Tag"].values)
            self.reservedkeywords = keywords

        # Initialise the stop words (needs the tag list set above).
        self.create_stopwords()

        for sentence in dataset:
            sentence = self.html_Filter(sentence)
            sentence = self.remove_non_ascii(sentence)
            sentence = self.keywords_transform(sentence)
            # POS-tag based filtering is currently disabled in this pipeline.
            #sentence = self.POSTag_Removal(sentence)
            sentence = self.remove_specialchar(sentence)
            sentence = self.sentence_stem(sentence)

            data_clean.append(sentence.strip())

        return data_clean

In [62]:
# Smoke test of the pipeline on a small sample.
data_preprocess1 = Dataset_Preprocess()
# Hand-written keyword edge cases; defined here but not passed to the pipeline below.
test1 = ["f#, c#, c9 c++ are the optional languages.","how to use form node js feature in .net platform","is objective c open-source?"]
# First ten real questions; this is the sample that actually gets processed.
test2 = questions[:10]
data_preprocess1.call(dataset=test2)

['differ intel ppc hardwar softwar differ intel ppc mac',
 'turn back mac script command line vpn softwar us work ipsecurita requir turn mac start connect turn mac order us vpn connect program forget turn wa run script command turn',
 'microsoft offic 2008 support rtl languag microsoft offic 2008 macbook pro offic support languag farsi arab offic 2010 window problem think lack support busi competit reason',
 'repair start disk option power failur reboot notic drive need repair util run leopard cd start order perform fix option run repair util startup disk',
 'disabl get startup sound mac make turn macbook make start nois annoi volum abil turn want sound plai disabl startup sound',
 'look screensav dashboard widget configur text file word displai',
 'load screen mbp batteri di reboot macbook year batteri toast april appl replac board repair power cabl come mbp shut expect repair power cabl wa unplug mbp shut plug turn boot state screen load bar bottom found relat web board concern mbp r

In [5]:
# Run the preprocessing pipeline over every loaded question.
data_preprocess = Dataset_Preprocess()
dataset_clean = data_preprocess.call(dataset=questions)

In [6]:
# One cleaned text per input question.
len(dataset_clean)

588634

In [7]:
# Spot-check ten cleaned questions from deep inside the dataset.
dataset_clean[500000:500010]

['why doe super inherit the wrong class i am look at the diamond problem and got a question class a def init self print thi is class a class b a def init self print thi is class b super init class c a def init self print thi is class c super init class d b c def init self print thi is class d super init i d thi is class d thi is class b thi is class c thi is class a it work as intend and that s nice but i would like to know why the super init in class b doesn t go to class a and instead c is if a class ha a super and it inherit from a parent class it should go if i remov it on b the code won t get to c nor i know of the mro and how it is actual go as expect mro class main class main class main class main class object but i don t know it s veri weird than the implement of thi code ha the same mro yet a is print twice class a def init self print thi is class a class b a def init self print thi is class b init self class c a def init self print thi is class c init self class d b c def ini

## (Question, target set) generation

In [10]:
# Load the full StackExchange table; later cells address its rows by position (iloc).
df = pd.read_csv("../StackExchange/final_data/stackexchange.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [11]:
# Selected ids without a readable raw file: each is one less than a file name
# reported as missing by the loading loop.
id_missing = [144551,180635,365327,460581,495265,857719,875207,967917,1035618,1186718]

In [15]:
# Number of selected ids before removing the missing ones.
len(data_selectedId)

588644

In [16]:
# Drop the ids without a raw file; note that data_selectedId becomes an (unordered) set here.
data_selectedId = set(data_selectedId) - set(id_missing)

In [17]:
# Number of selected ids left after removing the missing ones.
len(data_selectedId)

588634

In [18]:
# Question bodies of the rows whose raw file is missing (positional lookup in the "Body" column).
body_missing = [df["Body"].iloc[sid] for sid in id_missing]

In [19]:
# Tag strings of the rows whose raw file is missing (positional lookup in the "Tags" column).
tag_missing = [df["Tags"].iloc[sid] for sid in id_missing]

In [20]:
# Tag strings of all remaining selected rows, in ascending id order.
# A single positional lookup on the "Tags" column avoids building a full row
# Series for each of the selected ids.
tag_selected = df["Tags"].iloc[sorted(data_selectedId)].tolist()

In [21]:
# Should equal the number of remaining selected ids.
len(tag_selected)

588634

In [82]:
# Combine the raw question, its cleaned form and its tag string into one table.
df_filter = pd.DataFrame(
    {
        "Question": questions,
        "Q_preprocessed": dataset_clean,
        "Tags": tag_selected,
    }
)

In [83]:
# Save the filtered table and show its first rows.
# NOTE(review): an earlier cell reads `questions` back from this same file.
df_filter.to_csv("../StackExchange/final_data/stackexchange_filtered.csv",index = None)
df_filter.head()

Unnamed: 0,Question,Q_preprocessed,Tags
0,What is the difference between Intel and PPC? ...,differ intel ppc hardwar softwar differ intel ...,<hardware><mac><powerpc><macos>
1,Turn on Back To My Mac via a Script or Command...,turn back mac script command line vpn softwar ...,<macos><mobileme><terminal><back-to-my-mac><sc...
2,Why doesn't Microsoft Office/2008(& later) sup...,microsoft offic 2008 support rtl languag micro...,<software><ms-office>
3,Repair Disk - Start up disk options <p>I had a...,repair start disk option power failur reboot n...,<macos><snow-leopard>
4,How do I disable or get rid of the startup sou...,disabl get startup sound mac make turn macbook...,<mac><audio><startup>


In [8]:
# Load the tag dictionary and keep only the rows that have a "Corrected" entry.
df_tag = pd.read_csv("../StackExchange/final_data/tag_dict.csv")
print(len(df_tag))
# Missing corrections are marked with the sentinel -1 and then filtered out.
corrected_filled = df_tag['Corrected'].fillna(-1)
df_tag = df_tag.assign(Corrected=corrected_filled)[corrected_filled != -1]

28143


In [9]:
# Lookup table: original tag -> its "Corrected" value.
tag_corrected = dict(zip(df_tag["Tag"], df_tag["Corrected"]))

In [22]:
# Build one "<question>\t<tags>" training line per question; both sides are
# wrapped in <start> ... <end> markers.
pairs = []
for text, tag_string in zip(dataset_clean, tag_selected):
    # Retrieve the tag set: "<a><b>" becomes the tokens "a", "b" (plus a trailing empty one).
    raw_tags = tag_string.replace("<", "").replace(">", " ").split(" ")

    resolved_tags = []
    for tag in raw_tags:
        # Use the corrected spelling unless there is none or it is marked "no".
        correction = tag_corrected.get(tag, "no")
        if correction != "no":
            tag = correction
        resolved_tags.append(tag.strip())

    tag_str = " ".join(resolved_tags).strip().lower()
    pairs.append(f"<start> {text} <end>\t<start> {tag_str} <end>")

In [23]:
# One training line per cleaned question.
len(pairs)

588634

In [24]:
# Preview the first ten "<question>\t<tags>" lines.
pairs[:10]

['<start> what is the differ between intel and ppc what is the hardwar and softwar differ between intel and ppc mac <end>\t<start> hardware mac powerpc macos <end>',
 '<start> turn on back to my mac via a script or command line the vpn softwar i us for work ipsecurita requir me to turn off back to my mac to start it s connect so i frequent turn off back to my mac in order to us my vpn connect the program doe thi for me i forget to turn it back on howev and i d love to know if there wa someth i could run script command to turn it back <end>\t<start> macos mobileme terminal back-to-my-mac script <end>',
 '<start> why doesn t microsoft offic 2008 later support rtl languag i have microsoft offic 2008 on my macbook offic doesn t support rtl languag like farsi and arab and i know that offic 2010 for window also ha the same do you think the lack of support is becaus of busi competit or some other reason <end>\t<start> software microsoft-office <end>',
 '<start> repair disk start up disk optio

In [25]:
# Persist the training pairs, one per line.
with open("../StackExchange/final_data/training/pairs_full.txt", 'w', encoding="utf-8") as f:
    f.writelines(line + "\n" for line in pairs)

## Validation set generation

### Stack Overflow

In [4]:
# Load the evaluation set; the preview below shows the columns "Input" and "Tag_True".
df_val = pd.read_csv("../StackExchange/final_data/testing/eval_108489.csv")

In [5]:
# Preview the first rows of the evaluation set.
df_val.head()

Unnamed: 0,Input,Tag_True
0,iPhone App for Displaying Email on a Locked Sc...,iphone software-recommendation
1,How can I tell when it's a good time to buy a ...,macbook-pro
2,"Password keeper for iPhone, Mac and Windows? <...",iphone software-recommendation
3,Good Newsgroup Client for OS X <p>I've just sw...,macos software-recommendation snow-leopard
4,"is there a slipcase for 17"" MacBook Pros which...",macbook-pro


In [6]:
# Clean the evaluation inputs with the same pipeline used for the training data.
data_preprocess = Dataset_Preprocess()
val_clean = data_preprocess.call(dataset=list(df_val["Input"].values))

In [7]:
# Preview the first ten cleaned evaluation inputs.
val_clean[:10]

['iphon app displai email screen exchang support app mail notifi send notif phone get email support exchang app support exchang app notif reoccur cost',
 'tell time bui macbook want date instanc date go bui macbook pro wait month model come figur comput bui becom',
 'password keeper iphon mac window solut sync access encrypt password mac pc iphon',
 'newsgroup client x ve switch os x struggl find us reader suggest',
 'slipcas macbook pro open side carri mbp pannier fit portrait orient like protect slipcas want take slipcas pannier get laptop slipcas market open side',
 'applic iphon read bar code need read bar code us iphon applic read qrcode m look read pdf417',
 'output video tv us macbook model',
 'screen login lock system close clam make us clumsi close clam mode oper close macbook bring work plug keyboard monitor time time thing proce follow unlock box appear monitor enter password system unlock go screen mous cursor look unlock environ cours miss unlock dialog box ve seen system 

In [8]:
# Attach the cleaned text as a new column and save the validation set for the model.
df_val["processed_new"] = val_clean
df_val.to_csv("../../Model/evaluation/validation_stack.csv",index = None)

### Livechat

In [91]:
# Load the Angular validation results; the raw text is read from the "Issue" column below.
# Note that this rebinds df_val, which previously held the evaluation set loaded above.
df_val = pd.read_csv("../../Model/evaluation/validation_Angular_result.csv")

In [92]:
# Clean the "Issue" texts with the same preprocessing pipeline.
data_preprocess = Dataset_Preprocess()
val_clean = data_preprocess.call(dataset=list(df_val["Issue"].values))

In [94]:
# Preview the first ten cleaned issue texts.
val_clean[:10]

['dart depend inject issu',
 'ok flutter dart ionic part',
 'question googl creat languag dart us tighter js',
 'hello us router load compon pass properti bind workaround',
 'wai pass input compon insert',
 'project version angular version rxj project method figur',
 'see get http respons see get http respons',
 'hi afternoon get json field respons throw error code',
 'get respons need',
 'hi benefit respons data subscrib request']

In [95]:
# Attach the cleaned text as a new column.
df_val["processed_new"] = val_clean

In [97]:
# Save the table, including the new column, under a new file name.
df_val.to_csv("../../Model/evaluation/validation_Angular_new.csv",index = None)