### Data Loading

In [1]:
#import library
import pandas as pd
import glob, os, json
import re

#user input file path
path = 'C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/team1/'

In [2]:
def data_loading(path,df=None,date=None):
    '''
    Load only files that follow agreed filename format, merge files as single dataframe.
    Can support incremental aggregation of dataset, by setting arg df as the existing dataframe
    Returns a single dataframe.
    
    params:
    path [string]: path of the files, without filename
    df [dataframe] (optional,default is None): input existing dataframe to merge with new files
    date ["string"](optional,default is None): user can choose to load only files from specific date in YYYY-MM-DD format
    '''
    filenames = os.listdir(path)
    file_list=[]
    dfs = []

    if df is None: #no existing dataframe
        
        for file in filenames:
            # search agreed file format pattern in the filename
            if date == None:
                pattern = r"^\(\d{4}-\d{2}-\d{1,2}\)\d+\_\D+\_\d+\.json$"
                
            else:
#              
                pattern = r"\("+date+r"\)\d+\_\D+\_\d+\.json"
    
            match = re.search(pattern,file)
            #if match is found
            if match:
                pattern = os.path.join(path, file) #join path with file name
                file_list.append(pattern) #list of json files that follow the agreed filename

                for file in file_list:
                    with open(file) as f:
                        #flatten json into pd dataframe
                        json_data = pd.json_normalize(json.loads(f.read()))
                        #label which file each row is from 
                        json_data['file'] = file.rsplit("/", 1)[-1]

                    dfs.append(json_data)
                df = pd.concat(dfs)
                
    else: #existing dataframe exists and want to append new files to existing dataframe
             
        for file in filenames:

            if file not in df["file"].unique(): #check if file is new - to support merging of new dataset with previously read ones

                # search agreed file format pattern in the filename
                
                if date == None:
                    pattern = r"^\(\d{4}-\d{2}-\d{1,2}\)\d+\_\D+\_\d+\.json$"

                else:
                    pattern = r"\("+date+r"\)\d+\_\D+\_\d+\.json"
                     
                match = re.search(pattern,file)

                #if match is found
                if match:
                    json_pattern = os.path.join(path, file) #join path with file name
                    file_list.append(json_pattern) #list of json files 

                    for file in file_list:
                        with open(file) as f:
                            #flatten json into pd dataframe
                            json_data = pd.json_normalize(json.loads(f.read()))
                            #label which file each row is from 
                            json_data['file'] = file.rsplit("/", 1)[-1]

                        dfs.append(json_data)
                    new_df = pd.concat(dfs)           
                    df=pd.concat([df,new_df])
    
    return df

In [3]:
os.listdir(path)

['(2021-08-25)1_firstSet_1.json',
 '(2021-08-25)3_secondSet_1.json',
 '(2021-10-11)3_secondSet_1.json',
 'data_3cols.csv']

In [4]:
#load only files that follow the agreed format, does not choose file by date
df = data_loading(path,df=None,date = None)
df

Unnamed: 0,id,title,description,comments,updated_date,hierarchy_id,rev,tenant,subject,is_current,hierarchy_path,parent_id,record_type,row_num,file
0,1308651592,provide method to update GIO fields from git r...,Please provide a way to update GIO fields from...,"++++1562123662 fbakhda\nHi @Panceac, Cornel Eu...",2021-07-21 12:30:31.387,,8,iot_platf,support,1,/1201559436/1208431055/1308651592/,1208431055,parent,1,(2021-08-25)1_firstSet_1.json
1,1308671310,Test suite execution terminates before executi...,<p>Test suite execution finished before execut...,++++1361513318 cmoala\nsys_tsdval@GL-IAF1-V-S0...,2021-05-04 09:30:00.320,,11,iot_platf,support,1,/1201559436/1208431055/1308671310/,1208431055,parent,2,(2021-08-25)1_firstSet_1.json
2,1308673361,Cloning defects from another test cycle is not...,<p>I am trying to clone defects from another t...,++++1361514315 cmoala\nObserved that only impl...,2021-05-20 11:47:18.927,,9,iot_platf,support,1,/1201559436/1208431055/1308673361/,1208431055,parent,3,(2021-08-25)1_firstSet_1.json
3,1507656633,[Testing Only] this is enhancement only,Retest some function again.,,2020-03-13 10:16:18.703,,31,iot_platf,support,1,/1201559436/1208431055/1507656633/,1208431055,parent,4,(2021-08-25)1_firstSet_1.json
4,1507656638,[Testing Only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput\nHSDES testing. Please ...,2020-06-01 09:49:55.913,,19,iot_platf,support,1,/1201559436/1208431055/1507656638/,1208431055,parent,5,(2021-08-25)1_firstSet_1.json
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899,22012641037,,"<div><span style=""font-size: 12.18px;"">Hello,&...",,2021-03-26 13:19:20.430,,11,iot_platf,support,1,/1201559436/1208431055/22012641037/,1208431055,parent,900,(2021-10-11)3_secondSet_1.json
900,22012645565,,"<p>Hi Gio Team,</p><p><br /></p><p>Thank you f...",,2021-05-20 13:03:09.327,,11,iot_platf,support,1,/1201559436/1208431055/22012645565/,1208431055,parent,901,(2021-10-11)3_secondSet_1.json
901,22012704243,,<div>The schedule test suite allow for the use...,,2021-04-26 10:04:12.410,,9,iot_platf,support,1,/1201559436/1208431055/22012704243/,1208431055,parent,902,(2021-10-11)3_secondSet_1.json
902,22012765885,,"<p>Hi Gio Team,</p><p><br /></p><p>Thank you f...",,2021-06-30 00:35:58.927,,14,iot_platf,support,1,/1201559436/1208431055/22012765885/,1208431055,parent,903,(2021-10-11)3_secondSet_1.json


In [None]:
#load only files that follow the agreed format, choose file by date
# date= "2021-08-25"
# df = data_loading(path,df=None,date = date)
# df

### Data Pre-processing

### a) Dataframe manipulation

In [5]:
def df_manipulation(df,how,keep,cols_tokeep=None,cols_todrop=None,impute_value=None,subset=None):
    """
    1) Column selection: Keep or drop columns in dataframe
    2) Data impute: Impute or drop NA rows 
    3) Data duplication cleaning: Drop all duplicates or drop all duplicates except for the first/last occurrence
    params:
    df [dataframe]: input dataframe 
    cols_tokeep [list/None]: list of columns to keep, if there is no list use None
    cols_todrop [list/None]: list of columns to drop, if there is no list use None
    impute_value [string/None]: value to be imputed (i.e "" for empty string). If no value to be imputed but there are 
                        rows to be dropped use None
    how[string]: Drop rows when we have at least one NA or all NA. Choose
                      # - "all": Drop row with all NA
                      # - "any": Drop row with at least one NA
                  
    subset[list/None]: Subset of columns for dropping NA and identifying duplicates, use None if no column to select
    keep[string/False]: Choose to drop all duplicates or drop all duplicates except for the first/last occurrence
                        # - "first" : Drop duplicates except for the first occurrence. 
                        # - "last" : Drop duplicates except for the last occurrence. 
                        # - False : Drop all duplicates.
    """
    
    print("Shape of df before manipulation:",df.shape)

    #Column selection - user can select columns or drop unwanted columns
    if cols_tokeep != None:
        df = df[cols_tokeep]
    if cols_todrop != None:
        df = df.drop(cols_todrop,axis=1)
    print("Shape of df after selecting columns:",df.shape)

    #---Data impute - user can impute or drop rows with NA,freq of null values before & after manipulation returned---#
    print("Number of null values in df:\n",df.isnull().sum())
  

    # impute NA values with user's choice of imputation value
    if impute_value != None:
        df = df.fillna(impute_value)
        print("Number of null values in df after NA imputation:\n",df.isnull().sum())
        
    else: # drop rows with NA values
        df= df.dropna(axis=0, how=how,subset=subset)
        print("Number of null values in df after dropping NA rows:\n",df.isnull().sum())
        print("Shape of df after dropping NA rows:",df.shape)

    #---------Data duplication cleaning--------#
    print("Number of duplicates in the df:", df.duplicated().sum())

    #drop duplicates
    df = df.drop_duplicates(subset=subset, keep=keep)

    print("Shape of df after manipulation:",df.shape)

    return df


In [6]:
# df = df_manipulation(df,how="any",keep="first",cols_tokeep=["title","description","comments"],cols_todrop=None,impute_value="",subset=None)
df = df_manipulation(df,how="any",keep="first",cols_tokeep=["title"],cols_todrop=None,impute_value=None,subset=["title"])
df

Shape of df before manipulation: (5424, 15)
Shape of df after selecting columns: (5424, 1)
Number of null values in df:
 title    2712
dtype: int64
Number of null values in df after dropping NA rows:
 title    0
dtype: int64
Shape of df after dropping NA rows: (2712, 1)
Number of duplicates in the df: 1815
Shape of df after manipulation: (897, 1)


Unnamed: 0,title
0,provide method to update GIO fields from git r...
1,Test suite execution terminates before executi...
2,Cloning defects from another test cycle is not...
3,[Testing Only] this is enhancement only
4,[Testing Only] this is consultation only
...,...
899,Import GC Time Global Domain Artifact in GIO f...
900,[KPI_Metric] Phase-2: Extract kpi metric trend...
901,Ability to clone Schedule Test Suites from oth...
902,[KPI_Metric] Enhance KPI feature to plot graph...



### b) Text Normalization

### 2) Expand contractions 

In [None]:
import contractions

def word_contractions(text):
    """
    Expand word contractions (i.e. "isn't" to "is not")
    params:
    text[string]: input string 
    """
    return " ".join([contractions.fix(word) for word in text.split()])   


In [None]:
df["title_cont"] = [word_contractions(text) for text in df["title"]]
df["desc_cont"]=  [word_contractions(text) for text in df["description"]]
df["comments_cont"]=  [word_contractions(text) for text in df["comments"]]
df.head()

In [None]:
df.iloc[149,1]

In [None]:
df.iloc[149,4]

### 3) Convert all characters into lowercase 

In [None]:
def lowercase(text):
    """
    Convert all characters to lower case
    param:
    text[string]: input string 
    """
    return text.lower() if type(text) == str else text
    

In [None]:
df["title_lower"] = [lowercase(text) for text in df["title_cont"]]
df["desc_lower"]= [lowercase(text) for text in df["desc_cont"]]
df["comments_lower"]= [lowercase(text) for text in df["comments_cont"]]
df.head()

In [None]:
df = df[["title_lower","desc_lower","comments_lower"]]


### 4) Stemming/Lemmatization

### Stemming

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

def stem_words(text,stemmer_type):
    """
    Stemming words, 2 options available: Porter Stemmer or Lancaster Stemmer 
    params:
    text[string]: input string 
    stemmer_type[string]: input stemming method ("Porter" or "Lancaster")
    """
    if stemmer_type == "Porter":
        stemmer = PorterStemmer()
    if stemmer_type == "Lancaster":
        stemmer=LancasterStemmer()
    return " ".join([stemmer.stem(word) for word in text.split()])
    

In [None]:
df1 = df.copy()

In [None]:
df1["title_stem_lan"] = [stem_words(text,stemmer_type = "Lancaster") for text in df1["title_lower"]]
df1["desc_stem_lan"] = [stem_words(text,stemmer_type = "Lancaster") for text in df1["desc_lower"]]
df1["comments_stem_lan"]= [stem_words(text,stemmer_type = "Lancaster") for text in df1["comments_lower"]]
df1.head()

In [None]:
df1["title_stem_por"] = [stem_words(text,stemmer_type = "Porter") for text in df1["title_lower"]]
df1["desc_stem_por"] = [stem_words(text,stemmer_type = "Porter") for text in df1["desc_lower"]]
df1["comments_stem_por"]= [stem_words(text,stemmer_type = "Porter") for text in df1["comments_lower"]]
df1.head()

### Lemmatization

In [None]:
df2 = df.copy()
df2.head()

In [None]:
import spacy
import nltk
from nltk.stem import WordNetLemmatizer

def lemmatize_words(column,lemma_type):
    """
    Lemmatize words, 2 options available: WordNetLemmatizer or Spacy 
    params:
    column[series]: input series/column to be lemmatized
    lemma_type[string]: input lemmatization method ("WordNet" or "Spacy")
    """
    if lemma_type == "WordNet":
        lemmatizer = WordNetLemmatizer()
        return column.apply(lambda text: " ".join([lemmatizer.lemmatize(word) for word in text.split()]))
    
    
    if lemma_type == "Spacy":
        nlp = spacy.load("en_core_web_sm")
        column = column.apply(lambda text: " ".join([w.lemma_ for w in nlp(text)]))
        #convert to lower case as spacy will convert pronouns to upper case
        column = column.apply(lambda text: text.lower() if type(text) == str else text )
        
        return column
        


In [None]:
df2["title_lemma_spacy"] = lemmatize_words(column= df2["title_rem"],lemma_type="Spacy")
df2["desc_lemma_spacy"] = lemmatize_words(column= df2["desc_rem"],lemma_type="Spacy")
df2["comments_lemma_spacy"] = lemmatize_words(column= df2["comments_rem"],lemma_type="Spacy")
df2

In [None]:
df2["title_lemma_word"] = lemmatize_words(column= df2["title_rem"],lemma_type="WordNet")
df2["desc_lemma_word"] = lemmatize_words(column= df2["desc_rem"],lemma_type="WordNet")
df2["comments_lemma_word"] = lemmatize_words(column= df2["comments_rem"],lemma_type="WordNet")
df2

### b) Noise filtering



### 1) Remove html tag and url

In [None]:
from bs4 import BeautifulSoup
import re
def remove_htmltag_url(text):
    """
    Remove html tag and url
    params:
    text [string]: input string
    
    """
    import pandas as pd
    pd.options.mode.chained_assignment = None 
    #remove html tag
    text = BeautifulSoup(text, 'html.parser').get_text(separator= " ",strip=True) 
    #remove url
    text_clean = re.sub('https?[://%]*\S+', ' ',text) 
    return text_clean 

In [None]:
df["title_tag"] = [remove_htmltag_url(text) for text in df["title_lower"]]
df["desc_tag"]= [remove_htmltag_url(text) for text in df["desc_lower"]]
df["comments_tag"]= [remove_htmltag_url(text) for text in df["comments_lower"]]
df.head()

In [None]:
df.iloc[10,1]

In [None]:
df.iloc[10,4]

### 3) Remove irrelevant characters, punctuation, special characters

In [None]:
df = df[["title_tag","desc_tag","comments_tag"]]

In [None]:
import re
def remove_irrchar_punc(text,char=None):
    """
    Remove irrelevant characters and punctuation
    params:
    
    text[string]: input string 
    characters[string]: input regex of characters to be removed
    """
    if char != None:
        #Remove special characters given by user
        text = re.sub(char, ' ',text) 
    
    # Remove utf-8 literals (i.e. \\xe2\\x80\\x8)
    text = re.sub(r'\\+x[\d\D][\d\D]', ' ',text) 
    
    #Remove special characters and punctuation
    text = re.sub('[^\w\s]', ' ',text) 
    text = re.sub(r'_', ' ',text) 
#     df = df.replace('[^\w\s]',' ', regex=True)
#     df = df.replace(r"_", " ", regex=True)
    
    return text


In [None]:
df.to_excel('C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/'+'data.xlsx')

In [None]:
#char=None
df["title_rem"] = [remove_irrchar_punc(text,char=None) for text in df["title_tag"]]
df["desc_rem"]= [remove_irrchar_punc(text,char=None) for text in df["desc_tag"]]
df["comments_rem"]= [remove_irrchar_punc(text,char=None) for text in df["comments_tag"]]
df.head()

In [None]:
df.iloc[10,1] #desc before rem

In [None]:
df.iloc[10,4] #desc rem

In [None]:
#special character removal added by user
char = '\++\d+'
df["title_rem"] = [remove_irrchar_punc(text,char=char) for text in df["title_tag"]]
df["desc_rem"]= [remove_irrchar_punc(text,char=char) for text in df["desc_tag"]]
df["comments_rem"]= [remove_irrchar_punc(text,char=char) for text in df["comments_tag"]]
df.head()

### 3) Remove numeric data

In [None]:
df = df[["title_rem","desc_rem","comments_rem"]]

In [None]:
def remove_num(text):
    """
    Remove numeric data
    params:
    text[string]: input string 
    
    """
    text = re.sub('\d+', ' ',text) 

    return text

In [None]:
df["title_num"] = [remove_num(text) for text in df["title_rem"]]
df["desc_num"]= [remove_num(text) for text in df["desc_rem"]]
df["comments_num"]= [remove_num(text) for text in df["comments_rem"]]
df.head()

### 4) Remove multiple whitespaces

In [None]:
df = df[["title_num","desc_num","comments_num"]]

In [None]:
def remove_multwhitespace(text):
    """
    Remove multiple white spaces
    params:
    text[string]: input string 
    
    """
    text = re.sub(' +', ' ',text) 
    
    return text

In [None]:
df["title_white"] = [remove_multwhitespace(text) for text in df["title_num"]]
df["desc_white"]= [remove_multwhitespace(text) for text in df["desc_num"]]
df["comments_white"]= [remove_multwhitespace(text) for text in df["comments_num"]]
df.head()

In [None]:
df.iloc[10,1]

In [None]:
df.iloc[10,4]

### 4) Remove stopwords

In [None]:
# print(stopwords.words('english'))

In [None]:
df = df[["title_white","desc_white","comments_white"]]

In [None]:
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

def remove_stopwords(text,extra_sw=None,remove_sw=None):
    """
    Removes English stopwords. Optional: user can add own stopwords or remove words from English stopwords  
    params:
    text[string]: input string
    extra_sw [list] (optional): list of words/phrase to be added to the stop words 
    remove_sw [list] (optional): list of words to be removed from the stop words 
    """
    all_stopwords = stopwords.words('english')
    
    #default list of stopwords
    if extra_sw == None and remove_sw==None:
        all_stopwords = all_stopwords
        
    # add more stopwords
    elif remove_sw == None:
        all_stopwords.extend(extra_sw) #add to existing stop words list
        
    # remove stopwords from existing sw list
    elif extra_sw == None:
        all_stopwords = [e for e in all_stopwords if e not in remove_sw] #remove from existing stop words list
        
    # remove and add stopwords to existing sw list
    else:
        all_stopwords.extend(extra_sw) #add to existing stop words list
        all_stopwords = [e for e in all_stopwords if e not in remove_sw] #remove from existing stop words list
         
  
    for w in all_stopwords:
        pattern = r'\b'+w+r'\b'
        text = re.sub(pattern,' ', text)
                   
    return text 

In [None]:
print(stopwords.words('english'))

In [None]:
#list of words/phrase to be added to the stop words 
extra_sw = ['hsdes',"testing"]
#list of words/phrase to be removed from stop words
remove_sw = ["i","am"]
arg1 = extra_sw
arg2 = remove_sw

df["title_stop"]=  [remove_stopwords(text,extra_sw=arg1,remove_sw=arg2) for text in df["title_white"]]
df["desc_stop"]=  [remove_stopwords(text,extra_sw=arg1,remove_sw=arg2) for text in df["desc_white"]]
df["comments_stop"]=  [remove_stopwords(text,extra_sw=arg1,remove_sw=arg2) for text in df["comments_white"]]
df.head()

### 5) Remove frequent words

In [None]:
df = df[["title_stop","desc_stop","comments_stop"]]

In [None]:
def remove_freqwords(column,n):
    """
    Remove n frequent words
    params:
    column[series]: input column to remove frequent words
    n [integer]: input number of frequent words to be removed
    """
    from collections import Counter
    cnt = Counter()
    
    for text in column.values:
        for word in text.split():
            cnt[word] += 1
           
    #custom function to remove the frequent words             
    FREQWORDS = set([w for (w, wc) in cnt.most_common(n)])
    
    print("Frequent words that are removed from column:", set([(w, wc) for (w, wc) in cnt.most_common(n)]))
    
    return column.apply(lambda text: " ".join([word for word in str(text).split() if word not in FREQWORDS]))



In [None]:
n=10
df["title_freq"] = remove_freqwords(df["title_stop"],n)
df["desc_freq"] = remove_freqwords(df["desc_stop"],n)
df["comments_freq"] = remove_freqwords(df["comments_stop"],n)
df

In [None]:
df.iloc[2,0]

In [None]:
df.iloc[2,3]

### 6) Remove rare words

In [None]:
def remove_rarewords(column,n):
    """
    Remove n rare words
    params:
    column[series]: input column to remove rare words
    n [integer]: input number of rare words to be removed
    """
    from collections import Counter
    cnt = Counter()
    
    for text in column.values:
        for word in text.split():
            cnt[word] += 1
           
    #custom function to remove the rare words             
    RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n-1:-1]])
    
    print("Rare words that are removed from columns:", set([(w,wc) for (w, wc) in cnt.most_common()[:-n-1:-1]]))
        
    return column.apply(lambda text: " ".join([word for word in str(text).split() if word not in RAREWORDS]))


In [None]:
n=10
df["title_rare"] = remove_rarewords(df["title_stop"],n)
df["desc_rare"] = remove_rarewords(df["desc_stop"],n)
df["comments_rare"] = remove_rarewords(df["comments_stop"],n)
df

In [None]:
df.iloc[903,1] #converting is rare word

In [None]:
df.iloc[903,7]

### c) Custom tokenization

In [None]:
df= df[["title_stop","desc_stop","comments_stop"]]

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import WordPunctTokenizer
import re
#remove token method - seperate nltk and split functions 
def cust_tokenization(column,token_met,token_type,delim =None):
    """
    Custom tokenization, 2 options are available: split() or nltk 
    params:
    df [dataframe]: input dataframe 
    token_met["string"]: input tokenization method ("split" or "nltk")
    
    token_type["string"](use only if token_met= "nltk"): type of nltk tokenization
    a) token_type = "WordToken" tokenizes a string into a list of words
    b) token_type = "SentToken" tokenizes a string containing sentences into a list of sentences
    c) token_type = "WhiteSpaceToken" tokenizes a string on whitespace (space, tab, newline)
    d) token_type = "WordPunctTokenizer" tokenizes a string on punctuations
         
    delim["string"](use only if token_met = "split"): specify delimiter to separate strings,
    default delimiter (delim=None) is whitespace,  an alternate option for token_type = "WhiteSpaceToken"
    
    """
    if token_met == "split":
        if delim==None:
            print("Text is split by space") #default delimiter is space if not specified 

        else:
            print("Text is split by:", delim) #can accept one or more delimiter

        return column.apply(lambda text: text.split() if delim==None else text.split(delim))
    

    if token_met == "nltk":
    
        if token_type == "WordToken":
            tokenizer = word_tokenize
        if token_type == "SentToken":
            tokenizer = sent_tokenize
        if token_type == "WhiteSpaceToken":
            tokenizer = WhitespaceTokenizer().tokenize
        if token_type == "WordPunctTokenizer":
            tokenizer = WordPunctTokenizer().tokenize

        return column.apply(lambda text: tokenizer(text))
        
        
    

In [None]:
#use split
token_met="split"
token_type=None
delim = None

df["title_token"]= cust_tokenization(column=df["title_stop"],token_met=token_met,token_type=token_type,delim=delim)  
df["desc_token"]=  cust_tokenization(column=df["desc_stop"],token_met=token_met,token_type=token_type,delim=delim) 
df["comments_token"]= cust_tokenization(column=df["comments_stop"],token_met=token_met,token_type=token_type,delim=delim)
df.head()

In [None]:
#use nltk
token_met="nltk"
token_type="WordToken"
delim = None

df["title_token_nltk"]= cust_tokenization(column=df["title_stop"],token_met=token_met,token_type=token_type,delim=delim)  
df["desc_token_nltk"]=  cust_tokenization(column=df["desc_stop"],token_met=token_met,token_type=token_type,delim=delim) 
df["comments_token_nltk"]= cust_tokenization(column=df["comments_stop"],token_met=token_met,token_type=token_type,delim=delim)
df.head()

## d) Custom taxonomy

### i) Configurability for user to provide taxonomy mapping (to remove/remain)

In [None]:
df = df[["title_stop","desc_stop","comments_stop"]]

In [None]:
import re
#rename tax to taxo
def custom_tax(text,remove_tax,include_tax):
    """
    """
    for w in remove_tax:
        #row without any item from include_tax -> replace all remove_tax items with empty string
        if all(phrase not in text for phrase in include_tax): 
            pattern = r'\b'+w+r'\b'
            text = re.sub(pattern,' ', text) 
        #row with any item from include_tax -> only replace remove_tax item that is not in include_tax
        else: 
            if all(w not in phrase for phrase in include_tax):
                pattern = r'\b'+w+r'\b'
                text = re.sub(pattern,' ', text) 
    return text    

In [None]:
#list of words to remove
remove_tax = ["gio","fields","test"]
#list of words to maintain
include_tax = ["test suite execution","clone defects"]

df["title_tax"]=  [custom_tax(text,remove_tax,include_tax) for text in df["title_stop"]]
df["description_tax"]=  [custom_tax(text,remove_tax,include_tax) for text in df["desc_stop"]]
df["comments_tax"]=  [custom_tax(text,remove_tax,include_tax) for text in df["comments_stop"]]
df.head()


### ii)  Named Entity Recognition (Methodology to recommend potential taxonomy)

### Train custom NER model

In [None]:
df = df[["title_stop","desc_stop","comments_stop"]]

In [None]:
import pandas as pd
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
import numpy as np
#user to understand requirement - examples 
def convert_spacy(DATA):
    """
    Convert  data into .spacy format
    DATA[]: Train/validation data to be converted to .spacy format
    """
    nlp = spacy.blank("en") # load a new spacy model
    db = DocBin() # create a DocBin object

    for text, annot in tqdm(DATA): # data in previous format
        doc = nlp.make_doc(text) # create doc object from text
        ents = []
        for start, end, label in annot["entities"]: # add character indexes
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents # label the text with the ents
        db.add(doc)
        
    return db

    
def custom_ner(TRAIN_DATA,VAL_DATA,path):
    """
    Build and save custom NER model in given path. 
    
    """
    #convert train and validation data into .spacy format
    db_train = convert_spacy(TRAIN_DATA) 
    db_val = convert_spacy(VAL_DATA) 
    
    #save train and validation data in .spacy format in path
    db_train.to_disk(path +'train.spacy')
    db_val.to_disk(path +'val.spacy')
    
    print("Train and validation converted to .spacy format and saved")
    
    #autofill base_config file saved by user from spacy website
    !python -m spacy init fill-config base_config.cfg config.cfg
    
    #Model building and saving in path
    !python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./val.spacy
    
    print("Custom NER model built and saved!")
    
def check_ents(path,column):
    """
    Check entities after loading best model
    
    """
    #Load best model
    nlp = spacy.load(path + "/output/model-best/")     
    print("Best model loaded!")
    
    entities = []
    for text in column.tolist():
        doc = nlp(text)
        for ent in doc.ents:
            entities.append(ent.text+' - '+ent.label_)
    print(np.unique(np.array(entities)))        

def ner_wrapper(TRAIN_DATA,VAL_DATA,path,column,train_model):  
    """
    User can choose to train the spacy model or load spacy model
    params:
    TRAIN_DATA[NER format]: train data for model building
    VAL_DATA[NER format]: validation data for model building
    path[string]: input path to store model. Path has to be the same as base_config.cfg file downloaded from spacy
                  website and jupyter notebook.
    column[series]: column for entities to be checked
    train_model[True/False]: True if want to train model. False to load model (no training)
    """
    if train_model == True:
        custom_ner(TRAIN_DATA,VAL_DATA,path)
        check_ents(path,column)
        
    if train_model == False:
        check_ents(path,column)

In [None]:
#train data
TRAIN_DATA = [
["jchun wai kit is working on this to enable in new tcp", {"entities": [[0, 13, "NAME"]]}], 
["siewlita pending release", {"entities": [[0, 8, "NAME"]]}],
["hi lim chih quanx per our communication i still have one more question", {"entities": [[3, 17, "NAME"]]}],
["yeetheng the auto test trigger after build complete is working fine today", {"entities": [[0, 8, "NAME"]]}],
["hi jon here is the recipe link weichuan hi can you try to reproduce the issue once more", {"entities": [[3, 6, "NAME"],[31, 39, "NAME"]]}]
]

VAL_DATA = [
["wei chuan has updated me with the sample of test execution by automation manual chart", {"entities": [[0, 9, "NAME"]]}],
["subject gio logs and gio installation hi ajay jonathan i just noticed that star is directing all the logs to gio folder", {"entities": [[41, 45, "NAME"],[46, 55, "NAME"]]}],
["hi firesh final verdict in jenkins coming as fail even after all the triggered tests are passed", {"entities": [[3, 9, "NAME"],[27, 35, "NAME"]]}],
["wai kit below is the requirement needed from gio product defect detection", {"entities": [[0, 7, "NAME"]]}],
["just string field regards robert nowicki", {"entities": [[26, 40, "NAME"]]}]
]

#jupyter notebook and base_config.cfg path have to be the same
path = "C:/Users/nchong/"

#load and clean test data
df_test = pd.read_excel("C:/Users/nchong/test.xlsx",index_col=0)
df_test = df_manipulation(df_test,how="any",keep="first",cols_tokeep=["title","description","comments"],cols_todrop=None,impute_value="",subset=None)



In [None]:
ner_wrapper(TRAIN_DATA,VAL_DATA,path,column=df_test["comments"],train_model=True)

In [None]:
ner_wrapper(TRAIN_DATA,VAL_DATA,path,column=df_test["comments"],train_model=False)

### Feature extraction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def feature_extraction(column,ngram_range,ascending,fe_type):
    """
    Feature extraction methods - Bag of words or TF-IDF
    
    params:
    column [series]: column to select
    ngram_range [tuple(min_n, max_n)]: The lower and upper boundary of the range of n-values for different n-grams to be extracted
                                       - ngram_range of (1, 1) means only unigrams, 
                                       - ngram_range of (1, 2) means unigrams and bigrams, 
                                       - ngram_range of (2, 2) means only bigram
    ascending [True/False/None]: - None (words arranged in alphabetical order)
                                 - True(words arranged in ascending order of sum), 
                                 - False(words arranged in descending order of sum)                               
    fe_type[string]: Feature extraction type: Choose "bagofwords" or "tfidf" method
    """
    
    if fe_type == "bagofwords":
        vec_type = CountVectorizer(ngram_range=ngram_range, analyzer='word')
        vectorized = vec_type.fit_transform(column)
        df = pd.DataFrame(vectorized.toarray(), columns=vec_type.get_feature_names())
        df.loc['sum'] = df.sum(axis=0).astype(int)

    if fe_type == "tfidf":
        vec_type = TfidfVectorizer(ngram_range=ngram_range, analyzer='word')
        vectorized = vec_type.fit_transform(column)
        df = pd.DataFrame(vectorized.toarray(), columns=vec_type.get_feature_names())
        df.loc['sum'] = df.sum(axis=0)
    
    if ascending != None:
            
        df = df.sort_values(by ='sum', axis = 1,ascending=ascending)
    
    
    return df

In [None]:
column = df.iloc[:3,0]
ngram_range = (1,1)
ascending = None
fe_type = "bagofwords"
feature_extraction(column,ngram_range,ascending,fe_type)

In [None]:
column = df.iloc[:3,0]
ngram_range = (1,1)
ascending = True
fe_type = "tfidf"
feature_extraction(column,ngram_range,ascending,fe_type)

In [None]:
#convert and save train/validation data as .spacy
# out_path = "C:/Users/nchong/"
# db_train = convert_spacy(TRAIN_DATA)
# db_train.to_disk(out_path +'train.spacy') # save the docbin object
# db_val = convert_spacy(VAL_DATA)
# db_val.to_disk(out_path +'val.spacy') # save the docbin object

In [None]:
# !python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
# !python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./val.spacy

In [None]:
#load best model
# nlp1 = spacy.load("C:/Users/nchong/output/model-best/") #load the best model

In [None]:
# doc = nlp1("waikitcx hi arisha please provide us the") # input sample text

# spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [None]:
# def show_ents(text):
#     doc= nlp1(text)
#     if doc.ents:
#         for ent in doc.ents:
#             return(ent.text+' - '+ent.label_)
#     else:
#         return('No named entities found.')

In [None]:
# import pandas as pd
# df_test = pd.read_excel("C:/Users/nchong/test.xlsx",index_col=0)
# df_test = df_manipulation(df_test,how="any",keep="first",cols_tokeep=["title","description","comments"],cols_todrop=None,impute_value="",subset=None)
