In [2]:
import pandas as pd
import numpy as py
from preprocess_helper import PorterStemmer
import os
import re
from bs4 import BeautifulSoup
from string import punctuation
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wangs\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\wangs\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
text = word_tokenize("c# is a language")

In [5]:
text[0]

'c'

In [31]:
class Dataset_Preprocess:
    def __init__(self):
        self.stopwords_list = None
        self.tag_list = None
        self.reservedkeywords = None
    
    def create_reservedkeywords(self, path, num_words = 100):
        df = pd.read_csv(path)
        df.sort_values(by=['Count'], ascending=False, inplace=True, ignore_index=True)
        self.tag_list = list(df["Tag"].values)
        reservedkeywords_extra = ["c#","f#","c++","node.js","nodejs",".json",".js",".net","objective-c",
                                  "asp.net","ruby-on-rails","angular.js"]
        self.reservedkeywords = list(df["Tag"].values)[:num_words]
        self.reservedkeywords.extend(reservedkeywords_extra)
        self.reservedkeywords = set(self.reservedkeywords)
        #print(self.reservedkeywords)
    
    def create_stopwords(self):
        self.stopwords_list = stopwords.words('english')
        stop_words_extra = ["i'd","sometime","sometimes","something","someone","somebody","anything","anyone","anybody",
                            "everytime","everything","everyone","everybody","e.g.","e.g","e.g.,","i.e.","i.e","i.e.,","love",
                            "know","'s","wonder"]
        self.stopwords_list.extend(stop_words_extra)
        stopwords_unsure_list = set(self.stopwords_list).intersection(set(self.tag_list))
        self.stopwords_list = set(self.stopwords_list).difference(stopwords_unsure_list)
        #print(len(self.stopwords_list),"stopwords",self.stopwords_list)
    
    def remove_non_ascii(self,sentence):
        return ''.join(char for char in sentence if ord(char) < 128)
    
    def html_Filter(self, sentence):
        sentence = BeautifulSoup(sentence, "lxml").text
        #print("after html_Filter",sentence)
    
        return sentence
    
    def keywords_transform(self, sentence):
        sentence = sentence.lower()
        sentence = sentence.replace("node js","node.js")
        sentence = sentence.replace("objective c","objective-c")
        sentence = sentence.replace("ruby on rails","ruby-on-rails")
        sentence = sentence.replace("angular js","angular-js")
        
        return sentence
    
    def POSTag_generation(self, sentence):
        sentence_clean = []
        
        #split text into indiviual sentences for better pos tagging
        sentence_list = sent_tokenize(sentence)

        for sentence in sentence_list:
            text = word_tokenize(sentence)
            text_new = []
            skip_num = False
            
            for i,word in enumerate(text):
                #keep "c#" and "F#"
                
                if skip_num:
                    skip_num = False
                    continue
                
                if word == "c" or word == "f":
                    if i+1 <len(text) and text[i+1] == "#":
                        text_new.append(word+"#")
                        skip_num = True
                    else:
                        text_new.append(word)
                else:
                    text_new.append(word)
            
            #print(text_new)
            pos_words = nltk.pos_tag(text_new)
         
            for i,pos_word in enumerate(pos_words):
                if pos_words[i][0] == pos_words[i][1]:
                    sentence_clean.append(pos_words[i][0])
                else:
                    sentence_clean.append(pos_words[i][0]+"_"+pos_words[i][1])
         
        #return re.sub(r'\W+', ' ', ' '.join(sentence_clean))
        
        #print("after POSTag_Removal",' '.join(sentence_clean))
        return ' '.join(sentence_clean)
    
    def remove_specialchar(self, sentence, char_to_keep = {'#','+','.','-'}):
        punct_set = set(punctuation).difference(char_to_keep)

        for i in punct_set:
            # Replace the special character with an empty string
            sentence=sentence.replace(i," ")
        
        return sentence
        
    def sentence_stem(self, sentence):
        p = PorterStemmer()
        output = ""

        for token in sentence.split(' '):
            if token.isalnum():
                output += p.stem(token, 0,len(token)-1)+' '
            elif token in self.reservedkeywords:
                output += token+' '
        
        #print("after sentence_stem", output.strip())
        return output.strip()
    
    def load_dataset(self, path, data_selectedId=None):
        # creating cleaned input, output pairs
        print("Start loading data...")
        allfiles = os.listdir(path)
        
        if data_selectedId == None:
            data_selectedId = [i for i in range(len(allfiles))]
        
        questions = []
        
        for filename in data_selectedId:
            text = ""
            #retreive qa_text
            try:
                with open(path+str(filename+1), 'r', encoding="utf-8") as f:
                    for line in f.readlines():
                        line = line.replace("\n","")
                        line = line.strip()
                        if line == "":
                            continue

                        text += line+" "
    
                questions.append(text.strip())
            except:
                print(str(filename+1),"is missing!")
            
            print("Loaded",len(questions),)
        return questions

    def call(self, dataset = None, keywords = None):
        data_clean = []
        tag_path = "../StackExchange/final_data/tag_dict.csv"
        
        if dataset == None:
            file_path = "../StackExchange/final_data/rawdata/"
            with open("../StackExchange/final_data/selected_id.txt", 'r') as f:
                all_ids = f.readlines()

            data_selectedId = sorted([int(tid.replace("\n","")) for tid in all_ids])
            
            dataset = load_dataset(file_path, data_selectedId)
        
        #initialize tag_list
        if keywords == None:
            self.create_reservedkeywords(tag_path)
        else:
            df = pd.read_csv(tag_path)
            self.tag_list = list(df["Tag"].values)
            self.reservedkeywords = keywords
        
        #initialize stopwords
        self.create_stopwords()
        
        for sentence in dataset:
            sentence = self.html_Filter(sentence)
            sentence = self.remove_non_ascii(sentence)
            sentence = self.keywords_transform(sentence)
            sentence = self.POSTag_generation(sentence)
            #sentence = self.remove_specialchar(sentence)
            #sentence = self.sentence_stem(sentence)
            
            data_clean.append(sentence.strip())
        
        return data_clean

In [13]:
data_preprocess1 = Dataset_Preprocess()
test1 = ["f#, c#, c++ are the optional languages.","how to use form node js feature in .net platform","is objective c open-source?"]
data_preprocess1.call(dataset=test1)

['f#', ',', 'c#', ',', 'c++', 'are', 'the', 'optional', 'languages', '.']
['how', 'to', 'use', 'form', 'node.js', 'feature', 'in', '.net', 'platform']
['is', 'objective-c', 'open-source', '?']


['f#_NN , c#_NN , c++_NN are_VBP the_DT optional_JJ languages_NNS .',
 'how_WRB to_TO use_VB form_JJ node.js_JJ feature_NN in_IN .net_NN platform_NN',
 'is_VBZ objective-c_JJ open-source_NN ?_.']

## import data

In [23]:
df = pd.read_csv("../../ISPY_data/ispy_annotation.csv")
df.fillna("N",inplace=True)

In [24]:
df_samples = df[df["issue?"] == "Y"]
df_samples.reset_index(drop=True, inplace=True)

In [25]:
len(df_samples)

737

In [26]:
df_samples.head()

Unnamed: 0,issue?,Issues,PS,ETD,TS,AR,Q,Complete?,Complete(PS+ETD+Q)?
0,Y,Hello! I was facing an issue yesterday only wh...,Y,N,N,N,N,N,N
1,Y,"Hi, I tried to add the artifactorg.nd4j:nd4j-n...",Y,N,N,N,Y,N,N
2,Y,"Hi All,i am facing one issue. Our developers u...",Y,N,N,N,Y,N,N
3,Y,"hi all, i am facing some issue with typescript...",Y,N,Y,N,N,N,N
4,Y,"Hi, I am using Typescript in a react project, ...",Y,N,N,N,Y,N,N


In [33]:
data_preprocess = Dataset_Preprocess()
issues_clean = data_preprocess.call(dataset=list(df_samples["Issues"].values))

In [34]:
issues_clean

["hello_NN !_. i_NN was_VBD facing_VBG an_DT issue_NN yesterday_NN only_RB when_WRB i_JJ run_VBP the_DT tests_NNS in_IN android_JJ locally_RB . the_DT tests_NNS are_VBP running_VBG in_IN pipeline_NN with_IN the_DT same_JJ setup_NN . the_DT issue_NN is_VBZ : [_JJ <_NNP -code-_NNP >_NNP ]_NNP i_NN do_VBP n't_RB use_VB appium_JJ desktop_NN . my_PRP$ wdio.shared.conf.js_NN is_VBZ [_JJ <_NNP -code-_NNP >_NNP ]_NNP and_CC the_DT setup_NN for_IN android_JJ local_JJ is_VBZ [_JJ <_NNP -code-_NNP >_NNP ]_NNP [_NNP <_NNP -code-_NNP >_NNP ]_NNP [_NNP <_NNP -code-_NNP >_NNP ]_NN",
 "hi_NN , i_RB tried_VBD to_TO add_VB the_DT artifactorg.nd4j_NN : nd4j-native-platform:1.0.0-alphato_JJ my_PRP$ libraries_NNS on_IN azure_NN databricks_NNS but_CC i_VBP always_RB get_VB this_DT error_NN , does_VBZ anyone_NN have_VB a_DT clue_NN about_IN how_WRB to_TO fix_VB this_DT ?_. [_JJ <_NNP -code-_NNP >_NNP ]_NNP alexdblack_NN : all_DT right_NN , it_PRP accepts_VBZ to_TO add_VB thend4j-native_JJ . on_IN dl4j_NN 's_

In [35]:
df_samples["issue_clean"] = issues_clean
df_samples.to_csv("../../ISPY_data/ispy_annotation_clean.csv",index = None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_samples["issue_clean"] = issues_clean
