### Data Loading

In [1]:
#user input file path
path = 'C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/team1/'

In [50]:
import os
os.listdir(path)

['(2021-08-25)1_firstSet_1.json',
 '(2021-08-25)3_secondSet_1.json',
 '(2021-10-11)3_secondSet_1.json',
 'data_3cols.csv']

In [3]:
def data_loading(path,start_date=None,stop_date=None):
    '''
    Load only files that follow agreed filename format, merge files as single dataframe.
    User can choose to 
    a) Load all json files following the agreed filename format
    b) Load only json files from specific dates by adding the start and stop dates (Note: Both start_date and
    stop_date must be used together)
    
    params:
    path [string]: path of the files, without filename
    
    start_date[None/string in YYYY-MM-DD format](optional,default is None): 
    User can choose to load files starting from start_date
    - None: no start_date is provided, all files are loaded
    - string in YYYY-MM-DD format: files starting from start_date will be loaded
    
    stop_date[None/string in YYYY-MM-DD format](optional,default is None): 
    User can choose to load files until stop_date
    - None: no stop_date is provided, all files are loaded
    - string in YYYY-MM-DD format: files until stop_date will be loaded
    '''
    from datetime import datetime,timedelta
    import pandas as pd
    import glob, os, json
    import re

    filenames = os.listdir(path)
    file_list=[]
    date_list = []
    df = pd.DataFrame()
    
    if start_date == None and stop_date == None :
        for file in filenames:
            # search agreed file format pattern in the filename

            pattern = r"^\(\d{4}-\d{2}-\d{1,2}\)\d+\_\D+\_\d+\.json$"

            match = re.search(pattern,file)
                
            #if match is found
            if match:
                pattern = os.path.join(path, file) #join path with file name
                file_list.append(pattern) #list of json files that follow the agreed filename
            
        print("Files read:",file_list)                   
        for file in file_list:
            with open(file) as f:
                #flatten json into pd dataframe
                json_data = pd.json_normalize(json.loads(f.read()))
                json_data = pd.DataFrame(json_data)
                #label which file each row is from 
                json_data['file'] = file.rsplit("/", 1)[-1]

            df = df.append(json_data)              
                
    else:
        #convert start and stop string to datetime
        start = datetime.strptime(start_date, "%Y-%m-%d").date()
        stop = datetime.strptime(stop_date, "%Y-%m-%d").date()
    
        #iterate from start to stop dates by day and store dates in list
        while start <= stop:
            date_list.append(start)
            start = start + timedelta(days=1)  # increase day one by one

        #convert datetime objects to string
        string_list =[d.strftime("%Y-%m-%d") for d in date_list]
#         print(string_list)
        
        for file in filenames: 
            
            # search agreed file format pattern in the filename
            for date in string_list: 
                pattern = r"\("+date+r"\)\d+\_\D+\_\d+\.json"
        
                match = re.search(pattern,file)
                
                #if match is found
                if match:
                    pattern = os.path.join(path, file) #join path with file name
                    file_list.append(pattern) #list of json files that follow the agreed filename

        print("Files read:",file_list)     
        for file in file_list:
            with open(file) as f:
                #flatten json into pd dataframe
                json_data = pd.json_normalize(json.loads(f.read()))
                json_data = pd.DataFrame(json_data)
                #label which file each row is from 
                json_data['file'] = file.rsplit("/", 1)[-1]

            df = df.append(json_data)

    return df

In [32]:
df = data_loading(path,start_date = "2021-08-25",stop_date = "2021-08-25")
df

Files read: ['C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/team1/(2021-08-25)1_firstSet_1.json', 'C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/team1/(2021-08-25)3_secondSet_1.json']


Unnamed: 0,id,title,description,comments,updated_date,hierarchy_id,rev,tenant,subject,is_current,hierarchy_path,parent_id,record_type,row_num,file
0,1308651592,provide method to update GIO fields from git r...,Please provide a way to update GIO fields from...,"++++1562123662 fbakhda\nHi @Panceac, Cornel Eu...",2021-07-21 12:30:31.387,,8,iot_platf,support,1,/1201559436/1208431055/1308651592/,1208431055,parent,1,(2021-08-25)1_firstSet_1.json
1,1308671310,Test suite execution terminates before executi...,<p>Test suite execution finished before execut...,++++1361513318 cmoala\nsys_tsdval@GL-IAF1-V-S0...,2021-05-04 09:30:00.320,,11,iot_platf,support,1,/1201559436/1208431055/1308671310/,1208431055,parent,2,(2021-08-25)1_firstSet_1.json
2,1308673361,Cloning defects from another test cycle is not...,<p>I am trying to clone defects from another t...,++++1361514315 cmoala\nObserved that only impl...,2021-05-20 11:47:18.927,,9,iot_platf,support,1,/1201559436/1208431055/1308673361/,1208431055,parent,3,(2021-08-25)1_firstSet_1.json
3,1507656633,[Testing Only] this is enhancement only,Retest some function again.,,2020-03-13 10:16:18.703,,31,iot_platf,support,1,/1201559436/1208431055/1507656633/,1208431055,parent,4,(2021-08-25)1_firstSet_1.json
4,1507656638,[Testing Only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput\nHSDES testing. Please ...,2020-06-01 09:49:55.913,,19,iot_platf,support,1,/1201559436/1208431055/1507656638/,1208431055,parent,5,(2021-08-25)1_firstSet_1.json
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899,22012641037,,"<div><span style=""font-size: 12.18px;"">Hello,&...",,2021-03-26 13:19:20.430,,11,iot_platf,support,1,/1201559436/1208431055/22012641037/,1208431055,parent,900,(2021-08-25)3_secondSet_1.json
900,22012645565,,"<p>Hi Gio Team,</p><p><br /></p><p>Thank you f...",,2021-05-20 13:03:09.327,,11,iot_platf,support,1,/1201559436/1208431055/22012645565/,1208431055,parent,901,(2021-08-25)3_secondSet_1.json
901,22012704243,,<div>The schedule test suite allow for the use...,,2021-04-26 10:04:12.410,,9,iot_platf,support,1,/1201559436/1208431055/22012704243/,1208431055,parent,902,(2021-08-25)3_secondSet_1.json
902,22012765885,,"<p>Hi Gio Team,</p><p><br /></p><p>Thank you f...",,2021-06-30 00:35:58.927,,14,iot_platf,support,1,/1201559436/1208431055/22012765885/,1208431055,parent,903,(2021-08-25)3_secondSet_1.json


In [8]:
df = data_loading(path,start_date = None,stop_date = None)
df

Files read: ['C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/team1/(2021-08-25)1_firstSet_1.json', 'C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/team1/(2021-08-25)3_secondSet_1.json', 'C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/team1/(2021-10-11)3_secondSet_1.json']


Unnamed: 0,id,title,description,comments,updated_date,hierarchy_id,rev,tenant,subject,is_current,hierarchy_path,parent_id,record_type,row_num,file
0,1308651592,provide method to update GIO fields from git r...,Please provide a way to update GIO fields from...,"++++1562123662 fbakhda\nHi @Panceac, Cornel Eu...",2021-07-21 12:30:31.387,,8,iot_platf,support,1,/1201559436/1208431055/1308651592/,1208431055,parent,1,(2021-08-25)1_firstSet_1.json
1,1308671310,Test suite execution terminates before executi...,<p>Test suite execution finished before execut...,++++1361513318 cmoala\nsys_tsdval@GL-IAF1-V-S0...,2021-05-04 09:30:00.320,,11,iot_platf,support,1,/1201559436/1208431055/1308671310/,1208431055,parent,2,(2021-08-25)1_firstSet_1.json
2,1308673361,Cloning defects from another test cycle is not...,<p>I am trying to clone defects from another t...,++++1361514315 cmoala\nObserved that only impl...,2021-05-20 11:47:18.927,,9,iot_platf,support,1,/1201559436/1208431055/1308673361/,1208431055,parent,3,(2021-08-25)1_firstSet_1.json
3,1507656633,[Testing Only] this is enhancement only,Retest some function again.,,2020-03-13 10:16:18.703,,31,iot_platf,support,1,/1201559436/1208431055/1507656633/,1208431055,parent,4,(2021-08-25)1_firstSet_1.json
4,1507656638,[Testing Only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput\nHSDES testing. Please ...,2020-06-01 09:49:55.913,,19,iot_platf,support,1,/1201559436/1208431055/1507656638/,1208431055,parent,5,(2021-08-25)1_firstSet_1.json
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899,22012641037,,"<div><span style=""font-size: 12.18px;"">Hello,&...",,2021-03-26 13:19:20.430,,11,iot_platf,support,1,/1201559436/1208431055/22012641037/,1208431055,parent,900,(2021-10-11)3_secondSet_1.json
900,22012645565,,"<p>Hi Gio Team,</p><p><br /></p><p>Thank you f...",,2021-05-20 13:03:09.327,,11,iot_platf,support,1,/1201559436/1208431055/22012645565/,1208431055,parent,901,(2021-10-11)3_secondSet_1.json
901,22012704243,,<div>The schedule test suite allow for the use...,,2021-04-26 10:04:12.410,,9,iot_platf,support,1,/1201559436/1208431055/22012704243/,1208431055,parent,902,(2021-10-11)3_secondSet_1.json
902,22012765885,,"<p>Hi Gio Team,</p><p><br /></p><p>Thank you f...",,2021-06-30 00:35:58.927,,14,iot_platf,support,1,/1201559436/1208431055/22012765885/,1208431055,parent,903,(2021-10-11)3_secondSet_1.json


### Data Pre-processing

### a) Dataframe manipulation

In [5]:
def df_manipulation(df,how,keep="first",cols_tokeep=None,cols_todrop=None,impute_value=None,subset=None):
    """
    1) Column selection: Keep or drop columns in dataframe
    2) Data impute: Impute or drop NA rows 
    3) Data duplication cleaning: Drop all duplicates or drop all duplicates except for the first/last occurrence
    
    params:
    df [dataframe]: input dataframe 
    how[string]: Drop rows when we have at least one NA or all NA. Choose
                      # - "all": Drop row with all NA
                      # - "any": Drop row with at least one NA
    keep[string/False]: Choose to drop all duplicates or drop all duplicates except for the first/last occurrence
                      # - None[DEFAULT] : Drop duplicates except for the first occurrence. 
                      # - "last" : Drop duplicates except for the last occurrence. 
                      # - False : Drop all duplicates.
    cols_tokeep [list/None][DEFAULT]: list of columns to keep, if there is no list use None 
    cols_todrop [list/None]: list of columns to drop, if there is no list use None 
    impute_value [string/None]: value to be imputed (i.e "" for empty string). If no value to be imputed but there are 
                        rows to be dropped use None
                  
    subset[list/None]: Subset of columns for dropping NA and identifying duplicates, use None if no column to select
   
    """
    
    print("Shape of df before manipulation:",df.shape)

    #Column selection - user can select columns or drop unwanted columns
    if cols_tokeep != None:
        df = df[cols_tokeep]
    if cols_todrop != None:
        df = df.drop(cols_todrop,axis=1)
    print("Shape of df after selecting columns:",df.shape)

    #---Data impute - user can impute or drop rows with NA,freq of null values before & after manipulation returned---#
    print("Number of null values in df:\n",df.isnull().sum())
  

    # impute NA values with user's choice of imputation value
    if impute_value != None:
        df = df.fillna(impute_value)
        print("Number of null values in df after NA imputation:\n",df.isnull().sum())
        
    else: # drop rows with NA values
        df= df.dropna(axis=0, how=how,subset=subset)
        print("Number of null values in df after dropping NA rows:\n",df.isnull().sum())
        print("Shape of df after dropping NA rows:",df.shape)

    #---------Data duplication cleaning--------#
    print("Number of duplicates in the df:", df.duplicated().sum())

    #drop duplicates
    if keep == None:
        keep = "first"
        
    df = df.drop_duplicates(subset=subset, keep=keep)

    print("Shape of df after manipulation:",df.shape)

    return df


In [9]:
# df = df_manipulation(df,how="any",keep="first",cols_tokeep=["title","description","comments"],cols_todrop=None,impute_value="",subset=None)
df = df_manipulation(df,how="any",keep="first",cols_tokeep=["id","description"],cols_todrop=None,impute_value=None,subset=None)
df.head()

Shape of df before manipulation: (2712, 15)
Shape of df after selecting columns: (2712, 2)
Number of null values in df:
 id             0
description    0
dtype: int64
Number of null values in df after dropping NA rows:
 id             0
description    0
dtype: int64
Shape of df after dropping NA rows: (2712, 2)
Number of duplicates in the df: 1808
Shape of df after manipulation: (904, 2)


Unnamed: 0,id,description
0,1308651592,Please provide a way to update GIO fields from...
1,1308671310,<p>Test suite execution finished before execut...
2,1308673361,<p>I am trying to clone defects from another t...
3,1507656633,Retest some function again.
4,1507656638,enter the support needed at here ...



### b) Text Normalization

### 2) Expand contractions 

In [10]:
import contractions

def word_contractions(text):
    """
    Expand word contractions (i.e. "isn't" to "is not")
    params:
    text[string]: input string 
    """
    return " ".join([contractions.fix(word) for word in text.split()])   


In [7]:
df["title_cont"] = [word_contractions(text) for text in df["title"]]
df["desc_cont"]=  [word_contractions(text) for text in df["description"]]
df["comments_cont"]=  [word_contractions(text) for text in df["comments"]]
df.head()

Unnamed: 0,title,description,comments,title_cont,desc_cont,comments_cont
0,provide method to update GIO fields from git r...,Please provide a way to update GIO fields from...,"++++1562123662 fbakhda\nHi @Panceac, Cornel Eu...",provide method to update GIO fields from git r...,Please provide a way to update GIO fields from...,"++++1562123662 fbakhda Hi @Panceac, Cornel Eug..."
1,Test suite execution terminates before executi...,<p>Test suite execution finished before execut...,++++1361513318 cmoala\nsys_tsdval@GL-IAF1-V-S0...,Test suite execution terminates before executi...,<p>Test suite execution finished before execut...,++++1361513318 cmoala sys_tsdval@GL-IAF1-V-S04...
2,Cloning defects from another test cycle is not...,<p>I am trying to clone defects from another t...,++++1361514315 cmoala\nObserved that only impl...,Cloning defects from another test cycle is not...,<p>I am trying to clone defects from another t...,++++1361514315 cmoala Observed that only imple...
3,[Testing Only] this is enhancement only,Retest some function again.,,[Testing Only] this is enhancement only,Retest some function again.,
4,[Testing Only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput\nHSDES testing. Please ...,[Testing Only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput HSDES testing. Please i...


In [None]:
df.iloc[149,1]

In [None]:
df.iloc[149,4]

### 3) Convert all characters into lowercase 

In [11]:
def lowercase(text):
    """
    Convert all characters to lower case
    param:
    text[string]: input string 
    """
    return text.lower() if type(text) == str else text
    

In [9]:
df["title_lower"] = [lowercase(text) for text in df["title_cont"]]
df["desc_lower"]= [lowercase(text) for text in df["desc_cont"]]
df["comments_lower"]= [lowercase(text) for text in df["comments_cont"]]
df.head()

Unnamed: 0,title,description,comments,title_cont,desc_cont,comments_cont,title_lower,desc_lower,comments_lower
0,provide method to update GIO fields from git r...,Please provide a way to update GIO fields from...,"++++1562123662 fbakhda\nHi @Panceac, Cornel Eu...",provide method to update GIO fields from git r...,Please provide a way to update GIO fields from...,"++++1562123662 fbakhda Hi @Panceac, Cornel Eug...",provide method to update gio fields from git r...,please provide a way to update gio fields from...,"++++1562123662 fbakhda hi @panceac, cornel eug..."
1,Test suite execution terminates before executi...,<p>Test suite execution finished before execut...,++++1361513318 cmoala\nsys_tsdval@GL-IAF1-V-S0...,Test suite execution terminates before executi...,<p>Test suite execution finished before execut...,++++1361513318 cmoala sys_tsdval@GL-IAF1-V-S04...,test suite execution terminates before executi...,<p>test suite execution finished before execut...,++++1361513318 cmoala sys_tsdval@gl-iaf1-v-s04...
2,Cloning defects from another test cycle is not...,<p>I am trying to clone defects from another t...,++++1361514315 cmoala\nObserved that only impl...,Cloning defects from another test cycle is not...,<p>I am trying to clone defects from another t...,++++1361514315 cmoala Observed that only imple...,cloning defects from another test cycle is not...,<p>i am trying to clone defects from another t...,++++1361514315 cmoala observed that only imple...
3,[Testing Only] this is enhancement only,Retest some function again.,,[Testing Only] this is enhancement only,Retest some function again.,,[testing only] this is enhancement only,retest some function again.,
4,[Testing Only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput\nHSDES testing. Please ...,[Testing Only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput HSDES testing. Please i...,[testing only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput hsdes testing. please i...


In [None]:
# df = df[["title_lower","desc_lower","comments_lower"]]


### 4) Stemming/Lemmatization

### Stemming

In [69]:
df= df[["title_rare","desc_rare","comments_rare"]]

In [12]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

def stem_words(text,stemmer_type=None):
    """
    Stemming words. Default option is Porter Stemmer, alternative option is Lancaster Stemmer 
    params:
    text[string]: input string 
    stemmer_type[None/string]: input stemming method 
                                - None for Porter Stemmer
                                - "Lancaster" for Lancaster Stemmer 
    """
    if stemmer_type == None:
        stemmer = PorterStemmer()
    if stemmer_type == "Lancaster":
        stemmer=LancasterStemmer()
    return " ".join([stemmer.stem(word) for word in text.split()])
    

In [72]:
df1 = df.copy()

In [73]:
df1["title_stem_por"] = [stem_words(text,stemmer_type=None) for text in df1["title_rare"]]
df1["desc_stem_por"] = [stem_words(text,stemmer_type=None) for text in df1["desc_rare"]]
df1["comments_stem_por"]= [stem_words(text,stemmer_type=None) for text in df1["comments_rare"]]
df1.head()

Unnamed: 0,title_rare,desc_rare,comments_rare,title_stem_por,desc_stem_por,comments_stem_por
0,provide method update fields git repo automati...,please provide way update gio fields git repo ...,fbakhda hi panceac cornel eugen little backgro...,provid method updat field git repo automat,pleas provid way updat gio field git repo file...,fbakhda hi panceac cornel eugen littl backgrou...
1,suite terminates executing tests,test suite execution finished executing tests ...,cmoala sys tsdval gl iaf v usr local gio clien...,suit termin execut test,test suit execut finish execut test case error...,cmoala sy tsdval gl iaf v usr local gio client...
2,cloning defects another working,trying clone defects another test cycle get me...,cmoala observed implemented defects cloned wai...,clone defect anoth work,tri clone defect anoth test cycl get messag cl...,cmoala observ implement defect clone waikitc h...
3,testing,retest function,,test,retest function,
4,testing consultation,enter support needed,prajput hsdes testing please ignore changes si...,test consult,enter support need,prajput hsde test pleas ignor chang siewlita h...


In [74]:
df1["title_stem_lan"] = [stem_words(text,stemmer_type = "Lancaster") for text in df1["title_rare"]]
df1["desc_stem_lan"] = [stem_words(text,stemmer_type = "Lancaster") for text in df1["desc_rare"]]
df1["comments_stem_lan"]= [stem_words(text,stemmer_type = "Lancaster") for text in df1["comments_rare"]]
df1.head()

Unnamed: 0,title_rare,desc_rare,comments_rare,title_stem_por,desc_stem_por,comments_stem_por,title_stem_lan,desc_stem_lan,comments_stem_lan
0,provide method update fields git repo automati...,please provide way update gio fields git repo ...,fbakhda hi panceac cornel eugen little backgro...,provid method updat field git repo automat,pleas provid way updat gio field git repo file...,fbakhda hi panceac cornel eugen littl backgrou...,provid method upd field git repo autom,pleas provid way upd gio field git repo fil me...,fbakhd hi panceac cornel eug littl background ...
1,suite terminates executing tests,test suite execution finished executing tests ...,cmoala sys tsdval gl iaf v usr local gio clien...,suit termin execut test,test suit execut finish execut test case error...,cmoala sy tsdval gl iaf v usr local gio client...,suit termin execut test,test suit execut fin execut test cas er observ...,cmoal sys tsdval gl iaf v usr loc gio cli log ...
2,cloning defects another working,trying clone defects another test cycle get me...,cmoala observed implemented defects cloned wai...,clone defect anoth work,tri clone defect anoth test cycl get messag cl...,cmoala observ implement defect clone waikitc h...,clon defect anoth work,try clon defect anoth test cyc get mess clon s...,cmoal observ impl defect clon waikitc hi crist...
3,testing,retest function,,test,retest function,,test,retest funct,
4,testing consultation,enter support needed,prajput hsdes testing please ignore changes si...,test consult,enter support need,prajput hsde test pleas ignor chang siewlita h...,test consult,ent support nee,prajput hsdes test pleas ign chang siewlit hsd...


### Lemmatization

In [75]:
df2 = df.copy()
df2.head()

Unnamed: 0,title_rare,desc_rare,comments_rare
0,provide method update fields git repo automati...,please provide way update gio fields git repo ...,fbakhda hi panceac cornel eugen little backgro...
1,suite terminates executing tests,test suite execution finished executing tests ...,cmoala sys tsdval gl iaf v usr local gio clien...
2,cloning defects another working,trying clone defects another test cycle get me...,cmoala observed implemented defects cloned wai...
3,testing,retest function,
4,testing consultation,enter support needed,prajput hsdes testing please ignore changes si...


In [13]:
import spacy
import nltk
from nltk.stem import WordNetLemmatizer

def lemmatize_words(column,lemma_type=None):
    """
    Lemmatize words: Default option is WordNetLemmatizer, alternative option is Spacy 
    params:
    column[series]: input series/column to be lemmatized
    lemma_type[None/string]: input lemmatization method
                            - None for WordNetLemmatizer
                            - "Spacy" for Spacy    
    """
    if lemma_type == None:
     
        lemmatizer = WordNetLemmatizer()
        return column.apply(lambda text: " ".join([lemmatizer.lemmatize(word) for word in text.split()]))
   
    
    if lemma_type == "Spacy":
        nlp = spacy.load("en_core_web_sm")
        column = column.apply(lambda text: " ".join([w.lemma_ for w in nlp(text)]))
        #convert to lower case as spacy will convert pronouns to upper case
        column = column.apply(lambda text: text.lower() if type(text) == str else text )
        
        return column
        


In [78]:
#Spacy
df2["title_lemma_spacy"] = lemmatize_words(column= df2["title_rare"],lemma_type="Spacy")
df2["desc_lemma_spacy"] = lemmatize_words(column= df2["desc_rare"],lemma_type="Spacy")
df2["comments_lemma_spacy"] = lemmatize_words(column= df2["comments_rare"],lemma_type="Spacy")
df2

Unnamed: 0,title_rare,desc_rare,comments_rare,title_lemma_spacy,desc_lemma_spacy,comments_lemma_spacy
0,provide method update fields git repo automati...,please provide way update gio fields git repo ...,fbakhda hi panceac cornel eugen little backgro...,provide method update fields git repo automati...,please provide way update gio fields git repo ...,fbakhda hi panceac cornel eugen little backgro...
1,suite terminates executing tests,test suite execution finished executing tests ...,cmoala sys tsdval gl iaf v usr local gio clien...,suite terminate execute test,test suite execution finish execute test case ...,cmoala sys tsdval gl iaf v usr local gio clien...
2,cloning defects another working,trying clone defects another test cycle get me...,cmoala observed implemented defects cloned wai...,cloning defect another working,try clone defect another test cycle get messag...,cmoala observe implement defect clone waikitc ...
3,testing,retest function,,testing,ret function,
4,testing consultation,enter support needed,prajput hsdes testing please ignore changes si...,testing consultation,enter support need,prajput hsde test please ignore change siewlit...
...,...,...,...,...,...,...
899,,hello please import time global domain time kp...,,,hello please import time global domain time kp...,
900,,hi gio team thank providing kpi metric feature...,,,hi gio team thank provide kpi metric feature g...,
901,,schedule test suite allow user clone test suit...,,,schedule test suite allow user clone test suit...,
902,,hi gio team thank providing kpi feature plot k...,,,hi gio team thank provide kpi feature plot kpi...,


In [79]:
#WordNetLemmatizer
df2["title_lemma_word"] = lemmatize_words(column= df2["title_rare"],lemma_type=None)
df2["desc_lemma_word"] = lemmatize_words(column= df2["desc_rare"],lemma_type=None)
df2["comments_lemma_word"] = lemmatize_words(column= df2["comments_rare"],lemma_type=None)
df2

Unnamed: 0,title_rare,desc_rare,comments_rare,title_lemma_spacy,desc_lemma_spacy,comments_lemma_spacy,title_lemma_word,desc_lemma_word,comments_lemma_word
0,provide method update fields git repo automati...,please provide way update gio fields git repo ...,fbakhda hi panceac cornel eugen little backgro...,provide method update fields git repo automati...,please provide way update gio fields git repo ...,fbakhda hi panceac cornel eugen little backgro...,provide method update field git repo automatic...,please provide way update gio field git repo f...,fbakhda hi panceac cornel eugen little backgro...
1,suite terminates executing tests,test suite execution finished executing tests ...,cmoala sys tsdval gl iaf v usr local gio clien...,suite terminate execute test,test suite execution finish execute test case ...,cmoala sys tsdval gl iaf v usr local gio clien...,suite terminates executing test,test suite execution finished executing test c...,cmoala sys tsdval gl iaf v usr local gio clien...
2,cloning defects another working,trying clone defects another test cycle get me...,cmoala observed implemented defects cloned wai...,cloning defect another working,try clone defect another test cycle get messag...,cmoala observe implement defect clone waikitc ...,cloning defect another working,trying clone defect another test cycle get mes...,cmoala observed implemented defect cloned waik...
3,testing,retest function,,testing,ret function,,testing,retest function,
4,testing consultation,enter support needed,prajput hsdes testing please ignore changes si...,testing consultation,enter support need,prajput hsde test please ignore change siewlit...,testing consultation,enter support needed,prajput hsdes testing please ignore change sie...
...,...,...,...,...,...,...,...,...,...
899,,hello please import time global domain time kp...,,,hello please import time global domain time kp...,,,hello please import time global domain time kp...,
900,,hi gio team thank providing kpi metric feature...,,,hi gio team thank provide kpi metric feature g...,,,hi gio team thank providing kpi metric feature...,
901,,schedule test suite allow user clone test suit...,,,schedule test suite allow user clone test suit...,,,schedule test suite allow user clone test suit...,
902,,hi gio team thank providing kpi feature plot k...,,,hi gio team thank provide kpi feature plot kpi...,,,hi gio team thank providing kpi feature plot k...,


### b) Noise filtering



### 1) Remove html tag and url

In [14]:
from bs4 import BeautifulSoup
import re
def remove_htmltag_url(text):
    """
    Remove html tag and url
    params:
    text [string]: input string
    
    """
    import pandas as pd
    pd.options.mode.chained_assignment = None 
    #remove html tag
    text = BeautifulSoup(text, 'html.parser').get_text(separator= " ",strip=True) 
    #remove url
    text_clean = re.sub('https?[://%]*\S+', ' ',text) 
    return text_clean 

In [11]:
df["title_tag"] = [remove_htmltag_url(text) for text in df["title_lower"]]
df["desc_tag"]= [remove_htmltag_url(text) for text in df["desc_lower"]]
df["comments_tag"]= [remove_htmltag_url(text) for text in df["comments_lower"]]
df.head()

Unnamed: 0,title,description,comments,title_cont,desc_cont,comments_cont,title_lower,desc_lower,comments_lower,title_tag,desc_tag,comments_tag
0,provide method to update GIO fields from git r...,Please provide a way to update GIO fields from...,"++++1562123662 fbakhda\nHi @Panceac, Cornel Eu...",provide method to update GIO fields from git r...,Please provide a way to update GIO fields from...,"++++1562123662 fbakhda Hi @Panceac, Cornel Eug...",provide method to update gio fields from git r...,please provide a way to update gio fields from...,"++++1562123662 fbakhda hi @panceac, cornel eug...",provide method to update gio fields from git r...,please provide a way to update gio fields from...,"++++1562123662 fbakhda hi @panceac, cornel eug..."
1,Test suite execution terminates before executi...,<p>Test suite execution finished before execut...,++++1361513318 cmoala\nsys_tsdval@GL-IAF1-V-S0...,Test suite execution terminates before executi...,<p>Test suite execution finished before execut...,++++1361513318 cmoala sys_tsdval@GL-IAF1-V-S04...,test suite execution terminates before executi...,<p>test suite execution finished before execut...,++++1361513318 cmoala sys_tsdval@gl-iaf1-v-s04...,test suite execution terminates before executi...,test suite execution finished before executing...,++++1361513318 cmoala sys_tsdval@gl-iaf1-v-s04...
2,Cloning defects from another test cycle is not...,<p>I am trying to clone defects from another t...,++++1361514315 cmoala\nObserved that only impl...,Cloning defects from another test cycle is not...,<p>I am trying to clone defects from another t...,++++1361514315 cmoala Observed that only imple...,cloning defects from another test cycle is not...,<p>i am trying to clone defects from another t...,++++1361514315 cmoala observed that only imple...,cloning defects from another test cycle is not...,i am trying to clone defects from another test...,++++1361514315 cmoala observed that only imple...
3,[Testing Only] this is enhancement only,Retest some function again.,,[Testing Only] this is enhancement only,Retest some function again.,,[testing only] this is enhancement only,retest some function again.,,[testing only] this is enhancement only,retest some function again.,
4,[Testing Only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput\nHSDES testing. Please ...,[Testing Only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput HSDES testing. Please i...,[testing only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput hsdes testing. please i...,[testing only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput hsdes testing. please i...


In [None]:
df.iloc[10,1]

In [None]:
df.iloc[10,4]

### 3) Remove irrelevant characters, punctuation, special characters

In [12]:
df = df[["title_tag","desc_tag","comments_tag"]]

In [15]:
import re
def remove_irrchar_punc(text,char=None):
    """
    Remove irrelevant characters and punctuation. Optional: User can specify special characters to be removed in regex
    format.    
    params:    
    text[string]: input string 
    characters[string]: input regex of characters to be removed
    """
    if char != None:
        #Remove special characters given by user
        text = re.sub(char, ' ',text) 
    
    # Remove utf-8 literals (i.e. \\xe2\\x80\\x8)
    text = re.sub(r'\\+x[\d\D][\d\D]', ' ',text) 
    
    #Remove special characters and punctuation
    text = re.sub('[^\w\s]', ' ',text) 
    text = re.sub(r'_', ' ',text) 
   
    return text


In [14]:
df["title_rem"] = [remove_irrchar_punc(text,char=None) for text in df["title_tag"]]
df["desc_rem"]= [remove_irrchar_punc(text,char=None) for text in df["desc_tag"]]
df["comments_rem"]= [remove_irrchar_punc(text,char=None) for text in df["comments_tag"]]
df.head()

Unnamed: 0,title_tag,desc_tag,comments_tag,title_rem,desc_rem,comments_rem
0,provide method to update gio fields from git r...,please provide a way to update gio fields from...,"++++1562123662 fbakhda hi @panceac, cornel eug...",provide method to update gio fields from git r...,please provide a way to update gio fields from...,1562123662 fbakhda hi panceac cornel eug...
1,test suite execution terminates before executi...,test suite execution finished before executing...,++++1361513318 cmoala sys_tsdval@gl-iaf1-v-s04...,test suite execution terminates before executi...,test suite execution finished before executing...,1361513318 cmoala sys tsdval gl iaf1 v s04...
2,cloning defects from another test cycle is not...,i am trying to clone defects from another test...,++++1361514315 cmoala observed that only imple...,cloning defects from another test cycle is not...,i am trying to clone defects from another test...,1361514315 cmoala observed that only imple...
3,[testing only] this is enhancement only,retest some function again.,,testing only this is enhancement only,retest some function again,
4,[testing only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput hsdes testing. please i...,testing only this is consultation only,enter the support needed at here,1661488832 prajput hsdes testing please i...


In [None]:
df.iloc[10,1] #desc before rem

In [None]:
df.iloc[10,4] #desc rem

In [None]:
#special character removal added by user
char = '\++\d+'
df["title_rem"] = [remove_irrchar_punc(text,char=char) for text in df["title_tag"]]
df["desc_rem"]= [remove_irrchar_punc(text,char=char) for text in df["desc_tag"]]
df["comments_rem"]= [remove_irrchar_punc(text,char=char) for text in df["comments_tag"]]
df.head()

### 3) Remove numeric data

In [15]:
df = df[["title_rem","desc_rem","comments_rem"]]

In [16]:
def remove_num(text):
    """
    Remove numeric data
    params:
    text[string]: input string 
    
    """
    text = re.sub('\d+', ' ',text) 

    return text

In [17]:
df["title_num"] = [remove_num(text) for text in df["title_rem"]]
df["desc_num"]= [remove_num(text) for text in df["desc_rem"]]
df["comments_num"]= [remove_num(text) for text in df["comments_rem"]]
df.head()

Unnamed: 0,title_rem,desc_rem,comments_rem,title_num,desc_num,comments_num
0,provide method to update gio fields from git r...,please provide a way to update gio fields from...,1562123662 fbakhda hi panceac cornel eug...,provide method to update gio fields from git r...,please provide a way to update gio fields from...,fbakhda hi panceac cornel eugen can ...
1,test suite execution terminates before executi...,test suite execution finished before executing...,1361513318 cmoala sys tsdval gl iaf1 v s04...,test suite execution terminates before executi...,test suite execution finished before executing...,cmoala sys tsdval gl iaf v s usr lo...
2,cloning defects from another test cycle is not...,i am trying to clone defects from another test...,1361514315 cmoala observed that only imple...,cloning defects from another test cycle is not...,i am trying to clone defects from another test...,cmoala observed that only implemented de...
3,testing only this is enhancement only,retest some function again,,testing only this is enhancement only,retest some function again,
4,testing only this is consultation only,enter the support needed at here,1661488832 prajput hsdes testing please i...,testing only this is consultation only,enter the support needed at here,prajput hsdes testing please ignore any...


### 4) Remove multiple whitespaces

In [18]:
df = df[["title_num","desc_num","comments_num"]]

In [27]:
def remove_multwhitespace(text):
    """
    Remove multiple white spaces
    params:
    text[string]: input string 
    
    """
    text = re.sub(' +', ' ',text) 
    
    return text

In [20]:
df["title_white"] = [remove_multwhitespace(text) for text in df["title_num"]]
df["desc_white"]= [remove_multwhitespace(text) for text in df["desc_num"]]
df["comments_white"]= [remove_multwhitespace(text) for text in df["comments_num"]]
df.head()

Unnamed: 0,title_num,desc_num,comments_num,title_white,desc_white,comments_white
0,provide method to update gio fields from git r...,please provide a way to update gio fields from...,fbakhda hi panceac cornel eugen can ...,provide method to update gio fields from git r...,please provide a way to update gio fields from...,fbakhda hi panceac cornel eugen can i a littl...
1,test suite execution terminates before executi...,test suite execution finished before executing...,cmoala sys tsdval gl iaf v s usr lo...,test suite execution terminates before executi...,test suite execution finished before executing...,cmoala sys tsdval gl iaf v s usr local gio cl...
2,cloning defects from another test cycle is not...,i am trying to clone defects from another test...,cmoala observed that only implemented de...,cloning defects from another test cycle is not...,i am trying to clone defects from another test...,cmoala observed that only implemented defects...
3,testing only this is enhancement only,retest some function again,,testing only this is enhancement only,retest some function again,
4,testing only this is consultation only,enter the support needed at here,prajput hsdes testing please ignore any...,testing only this is consultation only,enter the support needed at here,prajput hsdes testing please ignore any chang...


In [None]:
df.iloc[10,1]

In [None]:
df.iloc[10,4]

### 4) Remove stopwords

In [21]:
df = df[["title_white","desc_white","comments_white"]]

In [18]:
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

def remove_stopwords(text,extra_sw=None,remove_sw=None):
    """
    Removes English stopwords. Optional: user can add own stopwords or remove words from English stopwords  
    params:
    text[string]: input string
    extra_sw [list] (optional): list of words/phrase to be added to the stop words 
    remove_sw [list] (optional): list of words to be removed from the stop words 
    """
    all_stopwords = stopwords.words('english')
    
    #default list of stopwords
    if extra_sw == None and remove_sw==None:
        all_stopwords = all_stopwords
        
    # add more stopwords
    elif remove_sw == None:
        all_stopwords.extend(extra_sw) #add to existing stop words list
        
    # remove stopwords from existing sw list
    elif extra_sw == None:
        all_stopwords = [e for e in all_stopwords if e not in remove_sw] #remove from existing stop words list
        
    # remove and add stopwords to existing sw list
    else:
        all_stopwords.extend(extra_sw) #add to existing stop words list
        all_stopwords = [e for e in all_stopwords if e not in remove_sw] #remove from existing stop words list
         
  
    for w in all_stopwords:
        pattern = r'\b'+w+r'\b'
        text = re.sub(pattern,' ', text)
                   
    return text 

In [23]:
#list of words/phrase to be added to the stop words 
# extra_sw = ['hsdes',"testing"]
#list of words/phrase to be removed from stop words
# remove_sw = ["i","am"]

df["title_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df["title_white"]]
df["desc_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df["desc_white"]]
df["comments_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df["comments_white"]]
df.head()

Unnamed: 0,title_white,desc_white,comments_white,title_stop,desc_stop,comments_stop
0,provide method to update gio fields from git r...,please provide a way to update gio fields from...,fbakhda hi panceac cornel eugen can i a littl...,provide method update gio fields git repo ...,please provide way update gio fields git...,fbakhda hi panceac cornel eugen little ...
1,test suite execution terminates before executi...,test suite execution finished before executing...,cmoala sys tsdval gl iaf v s usr local gio cl...,test suite execution terminates executing ...,test suite execution finished executing te...,cmoala sys tsdval gl iaf v usr local gio cl...
2,cloning defects from another test cycle is not...,i am trying to clone defects from another test...,cmoala observed that only implemented defects...,cloning defects another test cycle working,trying clone defects another test cycl...,cmoala observed implemented defects c...
3,testing only this is enhancement only,retest some function again,,testing enhancement,retest function,
4,testing only this is consultation only,enter the support needed at here,prajput hsdes testing please ignore any chang...,testing consultation,enter support needed,prajput hsdes testing please ignore changes...


### 5) Remove frequent words

In [24]:
df = df[["title_stop","desc_stop","comments_stop"]]

In [19]:
def remove_freqwords(column,n):
    """
    Remove n frequent words
    params:
    column[series]: input column to remove frequent words
    n [integer]: input number of frequent words to be removed
    """
    from collections import Counter
    cnt = Counter()
    
    for text in column.values:
        for word in text.split():
            cnt[word] += 1
           
    #custom function to remove the frequent words             
    FREQWORDS = set([w for (w, wc) in cnt.most_common(n)])
    
    print("Frequent words that are removed from column:", set([(w, wc) for (w, wc) in cnt.most_common(n)]))
    
    return column.apply(lambda text: " ".join([word for word in str(text).split() if word not in FREQWORDS]))



In [26]:
n=10
df["title_freq"] = remove_freqwords(df["title_stop"],n)
df["desc_freq"] = remove_freqwords(df["desc_stop"],n)
df["comments_freq"] = remove_freqwords(df["comments_stop"],n)
df.head()

Frequent words that are removed from column: {('cycle', 99), ('case', 77), ('test', 454), ('project', 93), ('enhancement', 98), ('request', 90), ('execution', 94), ('add', 88), ('cases', 80), ('gio', 502)}
Frequent words that are removed from column: {('execution', 510), ('cases', 580), ('pass', 594), ('project', 492), ('result', 492), ('case', 668), ('cycle', 840), ('gio', 2418), ('link', 460), ('test', 3264)}
Frequent words that are removed from column: {('please', 714), ('client', 345), ('gio', 958), ('issue', 697), ('close', 306), ('test', 851), ('regards', 295), ('hi', 1480), ('thanks', 433), ('jchun', 779)}


Unnamed: 0,title_stop,desc_stop,comments_stop,title_freq,desc_freq,comments_freq
0,provide method update gio fields git repo ...,please provide way update gio fields git...,fbakhda hi panceac cornel eugen little ...,provide method update fields git repo automati...,please provide way update fields git repo file...,fbakhda panceac cornel eugen little background...
1,test suite execution terminates executing ...,test suite execution finished executing te...,cmoala sys tsdval gl iaf v usr local gio cl...,suite terminates executing tests,suite finished executing tests error observed ...,cmoala sys tsdval gl iaf v usr local logs gvlo...
2,cloning defects another test cycle working,trying clone defects another test cycl...,cmoala observed implemented defects c...,cloning defects another working,trying clone defects another get message cloni...,cmoala observed implemented defects cloned wai...
3,testing enhancement,retest function,,testing,retest function,
4,testing consultation,enter support needed,prajput hsdes testing please ignore changes...,testing consultation,enter support needed,prajput hsdes testing ignore changes siewlita ...


In [None]:
df.iloc[2,0]

In [None]:
df.iloc[2,3]

### 6) Remove rare words

In [20]:
def remove_rarewords(column,n):
    """
    Remove n rare words
    params:
    column[series]: input column to remove rare words
    n [integer]: input number of rare words to be removed
    """
    from collections import Counter
    cnt = Counter()
    
    for text in column.values:
        for word in text.split():
            cnt[word] += 1
           
    #custom function to remove the rare words             
    RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n-1:-1]])
    
    print("Rare words that are removed from column:", set([(w,wc) for (w, wc) in cnt.most_common()[:-n-1:-1]]))
        
    return column.apply(lambda text: " ".join([word for word in str(text).split() if word not in RAREWORDS]))


In [28]:
n=10
df["title_rare"] = remove_rarewords(df["title_freq"],n)
df["desc_rare"] = remove_rarewords(df["desc_stop"],n)
df["comments_rare"] = remove_rarewords(df["comments_stop"],n)
df.head()

Rare words that are removed from columns: {('outside', 2), ('mst', 2), ('relese', 2), ('suc', 2), ('plenty', 2), ('converting', 2), ('opy', 2), ('fit', 2), ('pulling', 2), ('traceability', 2)}
Rare words that are removed from columns: {('super', 1), ('formats', 1), ('piie', 1), ('enhancment', 1), ('sqba', 1), ('katheine', 1), ('sq', 1), ('metrics', 1), ('hopefully', 1), ('misunderstand', 1)}


Unnamed: 0,title_stop,desc_stop,comments_stop,title_freq,desc_freq,comments_freq,title_rare,desc_rare,comments_rare
0,provide method update gio fields git repo ...,please provide way update gio fields git...,fbakhda hi panceac cornel eugen little ...,provide method update fields git repo automati...,please provide way update fields git repo file...,fbakhda panceac cornel eugen little background...,provide method update fields git repo automati...,please provide way update gio fields git repo ...,fbakhda hi panceac cornel eugen little backgro...
1,test suite execution terminates executing ...,test suite execution finished executing te...,cmoala sys tsdval gl iaf v usr local gio cl...,suite terminates executing tests,suite finished executing tests error observed ...,cmoala sys tsdval gl iaf v usr local logs gvlo...,suite terminates executing tests,test suite execution finished executing tests ...,cmoala sys tsdval gl iaf v usr local gio clien...
2,cloning defects another test cycle working,trying clone defects another test cycl...,cmoala observed implemented defects c...,cloning defects another working,trying clone defects another get message cloni...,cmoala observed implemented defects cloned wai...,cloning defects another working,trying clone defects another test cycle get me...,cmoala observed implemented defects cloned wai...
3,testing enhancement,retest function,,testing,retest function,,testing,retest function,
4,testing consultation,enter support needed,prajput hsdes testing please ignore changes...,testing consultation,enter support needed,prajput hsdes testing ignore changes siewlita ...,testing consultation,enter support needed,prajput hsdes testing please ignore changes si...


In [None]:
df.iloc[903,1] #converting is rare word

In [None]:
df.iloc[903,7]

### c) Custom tokenization

In [16]:
def cust_tokenization_split(column,delim =None):
    """
    Custom tokenization using split() 
    params:
    column[series]: input column           
    delim[None/string],default delimiter (delim=None) is whitespace: specify delimiter to separate strings
                        - None: delimiter is white space
                        - string: delimiter is the string specified       
    """
    
    if delim==None:
        print("Text is split by whitespace") #default delimiter is space if not specified 

    else:
        print("Text is split by:", delim) #can accept one or more delimiter

    return column.apply(lambda text: text.split() if delim==None else text.split(delim))


In [88]:
#use split
df["title_token"]= cust_tokenization_split(column = df["title_rare"],delim= None) 
df["desc_token"]= cust_tokenization_split(column = df["desc_rare"],delim= None)
df["comments_token"]= cust_tokenization_split(column = df["comments_rare"],delim= None)
df

Text is split by whitespace
Text is split by whitespace
Text is split by whitespace


Unnamed: 0,title_rare,desc_rare,comments_rare,title_token,desc_token,comments_token
0,provide method update fields git repo automati...,please provide way update gio fields git repo ...,fbakhda hi panceac cornel eugen little backgro...,"[provide, method, update, fields, git, repo, a...","[please, provide, way, update, gio, fields, gi...","[fbakhda, hi, panceac, cornel, eugen, little, ..."
1,suite terminates executing tests,test suite execution finished executing tests ...,cmoala sys tsdval gl iaf v usr local gio clien...,"[suite, terminates, executing, tests]","[test, suite, execution, finished, executing, ...","[cmoala, sys, tsdval, gl, iaf, v, usr, local, ..."
2,cloning defects another working,trying clone defects another test cycle get me...,cmoala observed implemented defects cloned wai...,"[cloning, defects, another, working]","[trying, clone, defects, another, test, cycle,...","[cmoala, observed, implemented, defects, clone..."
3,testing,retest function,,[testing],"[retest, function]",[]
4,testing consultation,enter support needed,prajput hsdes testing please ignore changes si...,"[testing, consultation]","[enter, support, needed]","[prajput, hsdes, testing, please, ignore, chan..."
...,...,...,...,...,...,...
899,,hello please import time global domain time kp...,,[],"[hello, please, import, time, global, domain, ...",[]
900,,hi gio team thank providing kpi metric feature...,,[],"[hi, gio, team, thank, providing, kpi, metric,...",[]
901,,schedule test suite allow user clone test suit...,,[],"[schedule, test, suite, allow, user, clone, te...",[]
902,,hi gio team thank providing kpi feature plot k...,,[],"[hi, gio, team, thank, providing, kpi, feature...",[]


In [82]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import WordPunctTokenizer

def cust_tokenization_nltk(column,token_type):
    """
    Custom tokenization using NLTK 
    params:
    column[series]: input column 
    token_type["string"]: type of nltk tokenization
    a) token_type = "WordToken" tokenizes a string into a list of words
    b) token_type = "SentToken" tokenizes a string containing sentences into a list of sentences
    c) token_type = "WhiteSpaceToken" tokenizes a string on whitespace (space, tab, newline)
    d) token_type = "WordPunctTokenizer" tokenizes a string on punctuations
    """
    if token_type == "WordToken":
        tokenizer = word_tokenize
    if token_type == "SentToken":
        tokenizer = sent_tokenize
    if token_type == "WhiteSpaceToken":
        tokenizer = WhitespaceTokenizer().tokenize
    if token_type == "WordPunctTokenizer":
        tokenizer = WordPunctTokenizer().tokenize

    return column.apply(lambda text: tokenizer(text))

In [90]:
#use nltk
df["title_token"]= cust_tokenization_nltk(column = df["title_rare"],token_type= "WordToken") 
df["desc_token"]= cust_tokenization_nltk(column = df["desc_rare"],token_type="WordToken")
df["comments_token"]= cust_tokenization_nltk(column = df["comments_rare"],token_type= "WordToken")
df

Unnamed: 0,title_rare,desc_rare,comments_rare,title_token,desc_token,comments_token
0,provide method update fields git repo automati...,please provide way update gio fields git repo ...,fbakhda hi panceac cornel eugen little backgro...,"[provide, method, update, fields, git, repo, a...","[please, provide, way, update, gio, fields, gi...","[fbakhda, hi, panceac, cornel, eugen, little, ..."
1,suite terminates executing tests,test suite execution finished executing tests ...,cmoala sys tsdval gl iaf v usr local gio clien...,"[suite, terminates, executing, tests]","[test, suite, execution, finished, executing, ...","[cmoala, sys, tsdval, gl, iaf, v, usr, local, ..."
2,cloning defects another working,trying clone defects another test cycle get me...,cmoala observed implemented defects cloned wai...,"[cloning, defects, another, working]","[trying, clone, defects, another, test, cycle,...","[cmoala, observed, implemented, defects, clone..."
3,testing,retest function,,[testing],"[retest, function]",[]
4,testing consultation,enter support needed,prajput hsdes testing please ignore changes si...,"[testing, consultation]","[enter, support, needed]","[prajput, hsdes, testing, please, ignore, chan..."
...,...,...,...,...,...,...
899,,hello please import time global domain time kp...,,[],"[hello, please, import, time, global, domain, ...",[]
900,,hi gio team thank providing kpi metric feature...,,[],"[hi, gio, team, thank, providing, kpi, metric,...",[]
901,,schedule test suite allow user clone test suit...,,[],"[schedule, test, suite, allow, user, clone, te...",[]
902,,hi gio team thank providing kpi feature plot k...,,[],"[hi, gio, team, thank, providing, kpi, feature...",[]


## d) Custom taxonomy

### i) Configurability for user to provide taxonomy mapping (to remove/remain)

In [30]:
df = df[["title_rare","desc_rare","comments_rare"]]
df.head()

Unnamed: 0,title_rare,desc_rare,comments_rare
0,provide method update fields git repo automati...,please provide way update gio fields git repo ...,fbakhda hi panceac cornel eugen little backgro...
1,suite terminates executing tests,test suite execution finished executing tests ...,cmoala sys tsdval gl iaf v usr local gio clien...
2,cloning defects another working,trying clone defects another test cycle get me...,cmoala observed implemented defects cloned wai...
3,testing,retest function,
4,testing consultation,enter support needed,prajput hsdes testing please ignore changes si...


In [21]:
import re

def custom_taxo(text,remove_taxo,include_taxo):
    """
    User provides taxonomy to be removed or remained in the text
    params:
    text[string]: text to remove/maintain the taxonomy
    remove_taxo[list]: list of taxonomy to be removed from text
    include_taxo[list]: list of taxonomy to be maintained in text
    """
    for w in remove_taxo:
        #row without any item from include_taxo -> replace all remove_taxo items with empty string
        if all(phrase not in text for phrase in include_taxo): 
            pattern = r'\b'+w+r'\b'
            text = re.sub(pattern,' ', text) 
        #row with any item from include_taxo -> only replace remove_taxo item that is not in include_taxo
        else: 
            if all(w not in phrase for phrase in include_taxo):
                pattern = r'\b'+w+r'\b'
                text = re.sub(pattern,' ', text) 
    return text    

In [40]:
#list of words to remove
remove_taxo = ["gio","fields","test"]
#list of words to maintain
include_taxo = ["test suite execution","kpi metric"]

df["title_taxo"]=  [custom_taxo(text,remove_taxo,include_taxo) for text in df["title_rare"]]
df["description_taxo"]=  [custom_taxo(text,remove_taxo,include_taxo) for text in df["desc_rare"]]
df["comments_taxo"]=  [custom_taxo(text,remove_taxo,include_taxo) for text in df["comments_rare"]]
df.head()


Unnamed: 0,title_rare,desc_rare,comments_rare,title_taxo,description_taxo,comments_taxo
0,provide method update fields git repo automati...,please provide way update gio fields git repo ...,fbakhda hi panceac cornel eugen little backgro...,provide method update git repo automatically,please provide way update git repo files m...,fbakhda hi panceac cornel eugen little backgro...
1,suite terminates executing tests,test suite execution finished executing tests ...,cmoala sys tsdval gl iaf v usr local gio clien...,suite terminates executing tests,test suite execution finished executing tests ...,cmoala sys tsdval gl iaf v usr local client ...
2,cloning defects another working,trying clone defects another test cycle get me...,cmoala observed implemented defects cloned wai...,cloning defects another working,trying clone defects another cycle get messa...,cmoala observed implemented defects cloned wai...
3,testing,retest function,,testing,retest function,
4,testing consultation,enter support needed,prajput hsdes testing please ignore changes si...,testing consultation,enter support needed,prajput hsdes testing please ignore changes si...


### ii) Custom Named Entity Recognition (Methodology to recommend potential taxonomy)
1) User to split text data into train, validation, test

2) User to create custom entity data for the train and validation

3) User to get base_config.cfg file from Spacy website and save in same path as jupyter notebook

4) Function will 

    i) convert data into .spacy format 
    
    ii) build/save NER model in given path or load previously built NER model
    
    iii) Label entities in test data to recommend potential taxonomy to user


In [None]:
# df = df[["title_stop","desc_stop","comments_stop"]]

In [34]:
import pandas as pd
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
import numpy as np

def convert_spacy(DATA):
    """
    Convert  data into .spacy format
    DATA[]: Train/validation data to be converted to .spacy format
    """
    nlp = spacy.blank("en") # load a new spacy model
    db = DocBin() # create a DocBin object

    for text, annot in tqdm(DATA): # data in previous format
        doc = nlp.make_doc(text) # create doc object from text
        ents = []
        for start, end, label in annot["entities"]: # add character indexes
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents # label the text with the ents
        db.add(doc)
        
    return db

    
def custom_ner(TRAIN_DATA,VAL_DATA,path):
    """
    Build and save custom NER model in given path. 
    
    """
    #convert train and validation data into .spacy format
    db_train = convert_spacy(TRAIN_DATA) 
    db_val = convert_spacy(VAL_DATA) 
    
    #save train and validation data in .spacy format in path
    db_train.to_disk(path +'train.spacy')
    db_val.to_disk(path +'val.spacy')
    
    print("Train and validation converted to .spacy format and saved")
    
    #autofill base_config file saved by user from spacy website
    !python -m spacy init fill-config base_config.cfg config.cfg
    
    #Model building and saving in path
    !python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./val.spacy
    
    print("Custom NER model built and saved!")
    
def check_ents(path,column):
    """
    Check entities after loading best model
    
    """
    #Load best model
    nlp = spacy.load(path + "/output/model-best/")     
    print("Best model loaded!")
    
    entities = []
    for text in column.tolist():
        doc = nlp(text)
        for ent in doc.ents:
            entities.append(ent.text+' - '+ent.label_)
    print(np.unique(np.array(entities)))        

def ner_wrapper(TRAIN_DATA,VAL_DATA,path,column,train_model):  
    """
    User can choose to train the spacy model or load spacy model
    params:
    TRAIN_DATA[NER format]: train data for model building
    VAL_DATA[NER format]: validation data for model building
    path[string]: input path to store model. Path has to be the same as base_config.cfg file downloaded from spacy
                  website and jupyter notebook.
    column[series]: column for entities to be checked
    train_model[True/False]: True if want to train model. False to load model (no training)
    """
    if train_model == True:
        custom_ner(TRAIN_DATA,VAL_DATA,path)
        check_ents(path,column)
        
    if train_model == False:
        check_ents(path,column)

In [35]:
#custom entity data for the train and validation
TRAIN_DATA = [
["jchun wai kit is working on this to enable in new tcp", {"entities": [[0, 13, "NAME"]]}], 
["siewlita pending release", {"entities": [[0, 8, "NAME"]]}],
["hi lim chih quanx per our communication i still have one more question", {"entities": [[3, 17, "NAME"]]}],
["yeetheng the auto test trigger after build complete is working fine today", {"entities": [[0, 8, "NAME"]]}],
["hi jon here is the recipe link weichuan hi can you try to reproduce the issue once more", {"entities": [[3, 6, "NAME"],[31, 39, "NAME"]]}]
]

VAL_DATA = [
["wei chuan has updated me with the sample of test execution by automation manual chart", {"entities": [[0, 9, "NAME"]]}],
["subject gio logs and gio installation hi ajay jonathan i just noticed that star is directing all the logs to gio folder", {"entities": [[41, 45, "NAME"],[46, 55, "NAME"]]}],
["hi firesh final verdict in jenkins coming as fail even after all the triggered tests are passed", {"entities": [[3, 9, "NAME"],[27, 35, "NAME"]]}],
["wai kit below is the requirement needed from gio product defect detection", {"entities": [[0, 7, "NAME"]]}],
["just string field regards robert nowicki", {"entities": [[26, 40, "NAME"]]}]
]

#jupyter notebook and base_config.cfg path have to be the same
path = "C:/Users/nchong/"

#load and clean test data
df_test = pd.read_excel("C:/Users/nchong/test.xlsx",index_col=0)
df_test = df_manipulation(df_test,how="any",keep="first",cols_tokeep=["title","description","comments"],cols_todrop=None,impute_value="",subset=None)



Shape of df before manipulation: (600, 3)
Shape of df after selecting columns: (600, 3)
Number of null values in df:
 title          297
description      2
comments       335
dtype: int64
Number of null values in df after NA imputation:
 title          0
description    0
comments       0
dtype: int64
Number of duplicates in the df: 0
Shape of df after manipulation: (600, 3)


In [36]:
ner_wrapper(TRAIN_DATA,VAL_DATA,path,column=df_test["comments"],train_model=True)

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 709.02it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 1260.31it/s]

Train and validation converted to .spacy format and saved





[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


2021-11-02 09:23:13.403538: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-11-02 09:23:13.403596: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


[i] Saving to output directory: output
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     25.83   10.81    5.97   57.14    0.11
200     200         48.06    711.39   44.44  100.00   28.57    0.44
400     400          0.00      0.00   44.44  100.00   28.57    0.44
600     600          0.00      0.00   44.44  100.00   28.57    0.44
800     800          0.00      0.00   44.44  100.00   28.57    0.44
1000    1000          0.00      0.00   44.44  100.00   28.57    0.44
1200    1200          0.00      0.00   44.44  100.00   28.57    0.44
1400    1400          0.00      0.00   44.44  100.00   28.57    0.44
1600    1600          0.00      0.00   44.44  100.00   28.57    0.44
1800    1800          0.00      0.00   44.44  100.00   28.57    0.44
Custom NER model built and s

2021-11-02 09:23:16.376325: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-11-02 09:23:16.376380: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[2021-11-02 09:23:18,486] [INFO] Set up nlp object from config
[2021-11-02 09:23:18,486] [INFO] Pipeline: ['tok2vec', 'ner']
[2021-11-02 09:23:18,502] [INFO] Created vocabulary
[2021-11-02 09:23:18,502] [INFO] Finished initializing nlp object
[2021-11-02 09:23:18,596] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


Best model loaded!
['adrian - NAME' 'ajay - NAME' 'ajaykmal - NAME' 'all - NAME'
 'already - NAME' 'aparna - NAME' 'arasu - NAME' 'arisha - NAME'
 'armit - NAME' 'aslam - NAME' 'athira - NAME' 'aurobinda - NAME'
 'azfar - NAME' 'beik - NAME' 'chihquax - NAME' 'chows - NAME'
 'chung - NAME' 'clement - NAME' 'cristian - NAME' 'cristina - NAME'
 'dalwynkx - NAME' 'daryl - NAME' 'darylhe - NAME' 'deeksha - NAME'
 'deepika - NAME' 'deigo - NAME' 'diego - NAME' 'durgesh - NAME'
 'durgeshm - NAME' 'fbakhda - NAME' 'firesh - NAME' 'foonghux - NAME'
 'garth - NAME' 'gavin - NAME' 'gio - NAME' 'giri - NAME'
 'gmathaly - NAME' 'guru - NAME' 'guys - NAME' 'harshita - NAME'
 'hashim - NAME' 'i - NAME' 'if - NAME' 'instead - NAME' 'james - NAME'
 'jchun - NAME' 'jchun change this - NAME' 'jchun close this - NAME'
 'jchun done configure - NAME' 'jchun duplicated hsdes - NAME'
 'jchun found out - NAME' 'jchun hi ajit - NAME' 'jchun hi all - NAME'
 'jchun hi amit - NAME' 'jchun hi athira - NAME' 'jchun

In [37]:
ner_wrapper(TRAIN_DATA,VAL_DATA,path,column=df_test["comments"],train_model=False)

Best model loaded!
['adrian - NAME' 'ajay - NAME' 'ajaykmal - NAME' 'all - NAME'
 'already - NAME' 'aparna - NAME' 'arasu - NAME' 'arisha - NAME'
 'armit - NAME' 'aslam - NAME' 'athira - NAME' 'aurobinda - NAME'
 'azfar - NAME' 'beik - NAME' 'chihquax - NAME' 'chows - NAME'
 'chung - NAME' 'clement - NAME' 'cristian - NAME' 'cristina - NAME'
 'dalwynkx - NAME' 'daryl - NAME' 'darylhe - NAME' 'deeksha - NAME'
 'deepika - NAME' 'deigo - NAME' 'diego - NAME' 'durgesh - NAME'
 'durgeshm - NAME' 'fbakhda - NAME' 'firesh - NAME' 'foonghux - NAME'
 'garth - NAME' 'gavin - NAME' 'gio - NAME' 'giri - NAME'
 'gmathaly - NAME' 'guru - NAME' 'guys - NAME' 'harshita - NAME'
 'hashim - NAME' 'i - NAME' 'if - NAME' 'instead - NAME' 'james - NAME'
 'jchun - NAME' 'jchun change this - NAME' 'jchun close this - NAME'
 'jchun done configure - NAME' 'jchun duplicated hsdes - NAME'
 'jchun found out - NAME' 'jchun hi ajit - NAME' 'jchun hi all - NAME'
 'jchun hi amit - NAME' 'jchun hi athira - NAME' 'jchun

### Feature extraction

In [22]:
# data preprocessing
#title
df["title_cont"] = [word_contractions(text) for text in df["title"]]
df["title_lower"] = [lowercase(text) for text in df["title_cont"]]
df["title_tag"] = [remove_htmltag_url(text) for text in df["title_lower"]]
df["title_rem"] = [remove_irrchar_punc(text,char=None) for text in df["title_tag"]]
df["title_num"] = [remove_num(text) for text in df["title_rem"]]
df["title_white"] = [remove_multwhitespace(text) for text in df["title_num"]]
df["title_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df["title_white"]]
n=10
df["title_freq"] = remove_freqwords(df["title_stop"],n)
df["title_rare"] = remove_rarewords(df["title_freq"],n)
df["title_lemma"] = lemmatize_words(column= df["title_rare"],lemma_type=None)
df["title_clean"] = df["title_lemma"]

#description
df["desc_cont"] = [word_contractions(text) for text in df["description"]]
df["desc_lower"] = [lowercase(text) for text in df["desc_cont"]]
df["desc_tag"] = [remove_htmltag_url(text) for text in df["desc_lower"]]
df["desc_rem"] = [remove_irrchar_punc(text,char=None) for text in df["desc_tag"]]
df["desc_num"] = [remove_num(text) for text in df["desc_rem"]]
df["desc_white"] = [remove_multwhitespace(text) for text in df["desc_num"]]
df["desc_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df["desc_white"]]
n=10
df["desc_freq"] = remove_freqwords(df["desc_stop"],n)
df["desc_rare"] = remove_rarewords(df["desc_freq"],n)
df["desc_lemma"] = lemmatize_words(column= df["desc_rare"],lemma_type=None)
df["desc_clean"] = df["desc_lemma"]

df.head()

Frequent words that are removed from column: {('enhancement', 98), ('request', 90), ('test', 454), ('cases', 80), ('case', 77), ('execution', 94), ('gio', 502), ('cycle', 99), ('project', 93), ('add', 88)}
Frequent words that are removed from column: {('pass', 297), ('execution', 255), ('link', 230), ('project', 246), ('case', 334), ('gio', 1209), ('result', 246), ('cases', 290), ('test', 1632), ('cycle', 420)}
Rare words that are removed from column: {('plenty', 1), ('outside', 1), ('pulling', 1), ('relese', 1), ('opy', 1), ('fit', 1), ('traceability', 1), ('mst', 1), ('converting', 1), ('suc', 1)}


Unnamed: 0,title,description,title_cont,title_lower,title_tag,title_rem,title_num,title_white,title_stop,title_freq,...,desc_lower,desc_tag,desc_rem,desc_num,desc_white,desc_stop,desc_freq,desc_rare,desc_lemma,desc_clean
0,provide method to update GIO fields from git r...,Please provide a way to update GIO fields from...,provide method to update GIO fields from git r...,provide method to update gio fields from git r...,provide method to update gio fields from git r...,provide method to update gio fields from git r...,provide method to update gio fields from git r...,provide method to update gio fields from git r...,provide method update gio fields git repo ...,provide method update fields git repo automati...,...,please provide a way to update gio fields from...,please provide a way to update gio fields from...,please provide a way to update gio fields from...,please provide a way to update gio fields from...,please provide a way to update gio fields from...,please provide way update gio fields git...,please provide way update fields git repo file...,please provide way update fields git repo file...,please provide way update field git repo file ...,please provide way update field git repo file ...
1,Test suite execution terminates before executi...,<p>Test suite execution finished before execut...,Test suite execution terminates before executi...,test suite execution terminates before executi...,test suite execution terminates before executi...,test suite execution terminates before executi...,test suite execution terminates before executi...,test suite execution terminates before executi...,test suite execution terminates executing ...,suite terminates executing tests,...,<p>test suite execution finished before execut...,test suite execution finished before executing...,test suite execution finished before executing...,test suite execution finished before executing...,test suite execution finished before executing...,test suite execution finished executing te...,suite finished executing tests error observed ...,suite finished executing tests error observed ...,suite finished executing test error observed s...,suite finished executing test error observed s...
2,Cloning defects from another test cycle is not...,<p>I am trying to clone defects from another t...,Cloning defects from another test cycle is not...,cloning defects from another test cycle is not...,cloning defects from another test cycle is not...,cloning defects from another test cycle is not...,cloning defects from another test cycle is not...,cloning defects from another test cycle is not...,cloning defects another test cycle working,cloning defects another working,...,<p>i am trying to clone defects from another t...,i am trying to clone defects from another test...,i am trying to clone defects from another test...,i am trying to clone defects from another test...,i am trying to clone defects from another test...,trying clone defects another test cycl...,trying clone defects another get message cloni...,trying clone defects another get message cloni...,trying clone defect another get message clonin...,trying clone defect another get message clonin...
3,[Testing Only] this is enhancement only,Retest some function again.,[Testing Only] this is enhancement only,[testing only] this is enhancement only,[testing only] this is enhancement only,testing only this is enhancement only,testing only this is enhancement only,testing only this is enhancement only,testing enhancement,testing,...,retest some function again.,retest some function again.,retest some function again,retest some function again,retest some function again,retest function,retest function,retest function,retest function,retest function
4,[Testing Only] this is consultation only,enter the support needed at here ...,[Testing Only] this is consultation only,[testing only] this is consultation only,[testing only] this is consultation only,testing only this is consultation only,testing only this is consultation only,testing only this is consultation only,testing consultation,testing consultation,...,enter the support needed at here ...,enter the support needed at here ...,enter the support needed at here,enter the support needed at here,enter the support needed at here,enter support needed,enter support needed,enter support needed,enter support needed,enter support needed


In [23]:
df = df[["title_clean","desc_clean"]]
df.head()

Unnamed: 0,title_clean,desc_clean
0,provide method update field git repo automatic...,please provide way update field git repo file ...
1,suite terminates executing test,suite finished executing test error observed s...
2,cloning defect another working,trying clone defect another get message clonin...
3,testing,retest function
4,testing consultation,enter support needed


In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def feature_extraction(column,ngram_range=None,ascending=None,fe_type=None):
    """
    Feature extraction methods - TF-IDF(default choice) or Bag of words
     
    params:
    column [series/DataFrame]: column selected for feature extraction 
                        - series: only one column is selected for feature extraction (e.g. df["title_clean"])
                        - DataFrame: more than one column is selected for feature extraction (e.g. df[["title_clean","desc_clean"]])
    ngram_range [tuple(min_n, max_n)]: The lower and upper boundary of the range of n-values for different n-grams to be extracted
                                       - [default] ngram_range of (1, 1) means only unigrams, 
                                       - ngram_range of (1, 2) means unigrams and bigrams, 
                                       - ngram_range of (2, 2) means only bigram
    ascending [True/False/None]: - [default] None (words arranged in alphabetical order)
                                 - True(words arranged in ascending order of sum), 
                                 - False(words arranged in descending order of sum)                               
    fe_type[string/None]: Feature extraction type: Choose "bagofwords" for bow or None for default tfidf method
    
    """
    if type(column) == pd.DataFrame: #concat the columns into one string if there is more than one column 
        column = column.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
                    
    if ngram_range == None: #set ngram range as unigram by default
        ngram_range=(1,1)
        
    if fe_type == "bagofwords":
        vec_type = CountVectorizer(ngram_range=ngram_range, analyzer='word')
        vectorized = vec_type.fit_transform(column)
        df = pd.DataFrame(vectorized.toarray(), columns=vec_type.get_feature_names())
        df.loc['sum'] = df.sum(axis=0).astype(int)

    if fe_type == None: #tfidf
        vec_type = TfidfVectorizer(ngram_range=ngram_range, analyzer='word')
        vectorized = vec_type.fit_transform(column)
        df = pd.DataFrame(vectorized.toarray(), columns=vec_type.get_feature_names())
        df.loc['sum'] = df.sum(axis=0)
    
    if ascending != None:
            
        df = df.sort_values(by ='sum', axis = 1,ascending=ascending)
    
    
    return df,vec_type,vectorized

In [27]:
feature_extraction(column=df[["title_clean","desc_clean"]],ngram_range=None,ascending=False,fe_type=None)[0]

Unnamed: 0,requirement,suite,new,recipe,report,result,user,run,create,planned,...,combination,gstvideodecoder,vaapijpegenc,jonathon,eample,batch,evaluate,parser,copyreg,xmlrpc
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.185210,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900,0.000000,0.000000,0.000000,0.000000,0.000000,0.173893,0.055408,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
901,0.000000,0.453482,0.000000,0.196870,0.000000,0.000000,0.086923,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
902,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
903,0.172206,0.000000,0.000000,0.000000,0.089181,0.000000,0.000000,0.084913,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [28]:
feature_extraction(column=df["title_clean"],ngram_range=None,ascending=False,fe_type=None)[0]

Unnamed: 0,result,new,requirement,create,suite,report,unable,email,reporting,planning,...,false,uboot,dunfell,eb,rebased,symphony,migrate,daemon,purpose,creator
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.340100,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
901,0.000000,0.000000,0.000000,0.000000,0.362528,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
902,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
903,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [24]:
df

Unnamed: 0,id,description
0,1308651592,Please provide a way to update GIO fields from...
1,1308671310,<p>Test suite execution finished before execut...
2,1308673361,<p>I am trying to clone defects from another t...
3,1507656633,Retest some function again.
4,1507656638,enter the support needed at here ...
...,...,...
899,22012641037,"<div><span style=""font-size: 12.18px;"">Hello,&..."
900,22012645565,"<p>Hi Gio Team,</p><p><br /></p><p>Thank you f..."
901,22012704243,<div>The schedule test suite allow for the use...
902,22012765885,"<p>Hi Gio Team,</p><p><br /></p><p>Thank you f..."


In [25]:
df1 = df.drop(["id"],axis=1)
df1

Unnamed: 0,description
0,Please provide a way to update GIO fields from...
1,<p>Test suite execution finished before execut...
2,<p>I am trying to clone defects from another t...
3,Retest some function again.
4,enter the support needed at here ...
...,...
899,"<div><span style=""font-size: 12.18px;"">Hello,&..."
900,"<p>Hi Gio Team,</p><p><br /></p><p>Thank you f..."
901,<div>The schedule test suite allow for the use...
902,"<p>Hi Gio Team,</p><p><br /></p><p>Thank you f..."


In [30]:
#description
df1["desc_cont"] = [word_contractions(text) for text in df1["description"]]
df1["desc_lower"] = [lowercase(text) for text in df1["desc_cont"]]
df1["desc_tag"] = [remove_htmltag_url(text) for text in df1["desc_lower"]]
df1["desc_rem"] = [remove_irrchar_punc(text,char=None) for text in df1["desc_tag"]]
df1["desc_num"] = [remove_num(text) for text in df1["desc_rem"]]
df1["desc_white"] = [remove_multwhitespace(text) for text in df1["desc_num"]]
#list of words to remove
remove_taxo = ["gio","fields","test"]
#list of words to maintain
include_taxo = ["test suite execution","kpi metric"]
df1["desc_taxo"]=  [custom_taxo(text,remove_taxo,include_taxo) for text in df1["desc_white"]]

# df1["desc_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df1["desc_white"]]
# n=10
# df1["desc_freq"] = remove_freqwords(df1["desc_stop"],n)
# df1["desc_rare"] = remove_rarewords(df1["desc_freq"],n)
# df1["desc_lemma"] = lemmatize_words(column= df1["desc_rare"],lemma_type=None)
# df1["desc_clean"] = df1["desc_lemma"]

df1.head()

Unnamed: 0,description,desc_cont,desc_lower,desc_tag,desc_rem,desc_num,desc_white,desc_taxo
0,Please provide a way to update GIO fields from...,Please provide a way to update GIO fields from...,please provide a way to update gio fields from...,please provide a way to update gio fields from...,please provide a way to update gio fields from...,please provide a way to update gio fields from...,please provide a way to update gio fields from...,please provide a way to update from git re...
1,<p>Test suite execution finished before execut...,<p>Test suite execution finished before execut...,<p>test suite execution finished before execut...,test suite execution finished before executing...,test suite execution finished before executing...,test suite execution finished before executing...,test suite execution finished before executing...,test suite execution finished before executing...
2,<p>I am trying to clone defects from another t...,<p>I am trying to clone defects from another t...,<p>i am trying to clone defects from another t...,i am trying to clone defects from another test...,i am trying to clone defects from another test...,i am trying to clone defects from another test...,i am trying to clone defects from another test...,i am trying to clone defects from another cy...
3,Retest some function again.,Retest some function again.,retest some function again.,retest some function again.,retest some function again,retest some function again,retest some function again,retest some function again
4,enter the support needed at here ...,enter the support needed at here ...,enter the support needed at here ...,enter the support needed at here ...,enter the support needed at here,enter the support needed at here,enter the support needed at here,enter the support needed at here


### Unsupervised Learning
### i ) K-means clustering

In [29]:
df1 = df.copy()
df1.head()

Unnamed: 0,title_clean,desc_clean
0,provide method update field git repo automatic...,please provide way update field git repo file ...
1,suite terminates executing test,suite finished executing test error observed s...
2,cloning defect another working,trying clone defect another get message clonin...
3,testing,retest function
4,testing consultation,enter support needed


In [None]:
# df1 = df_manipulation(df1,how="any",keep="first",cols_tokeep=["title"],cols_todrop=None,impute_value=None,subset=["title"])
# df1["title_cont"] = [word_contractions(text) for text in df1["title"]]
# df1["title_lower"] = [lowercase(text) for text in df1["title_cont"]]
# df1["title_tag"] = [remove_htmltag_url(text) for text in df1["title_lower"]]
# df1["title_rem"] = [remove_irrchar_punc(text,char=None) for text in df1["title_tag"]]
# df1["title_num"] = [remove_num(text) for text in df1["title_rem"]]
# df1["title_white"] = [remove_multwhitespace(text) for text in df1["title_num"]]
# df1["title_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df1["title_white"]]
# n=10

# df1["title_freq"] = remove_freqwords(df1["title_stop"],n)
# df1["title_rare"] = remove_rarewords(df1["title_freq"],n)
# df1["title_lemma_word"] = lemmatize_words(column= df1["title_rare"],lemma_type=None)
# df1["title_clean"] = df1["title_lemma_word"]
# df1.head()

In [23]:
from sklearn.cluster import KMeans
from sklearn import metrics

silhouette_avg_list = []
n_clusters_list = []
dicts = {}

def kmeans_clustering(column,top_n_terms,ngram_range=None,fe_type=None,n_clusters=None,max_n_clusters=None):
    """
    K- means clustering for unsupervised learning. User can choose either options:
    (1) provide the number of clusters or
    (2) provide the max number of clusters for kmeans to iterate through, the optimal number of clusters with highest 
    silhouette score will be chosen. Min number of clusters is fixed as 2
    
    params:
    column [series/DataFrame]: column(s) selected for clustering 
                        - series: only one column is selected for clustering (e.g. df["title_clean"])
                        - DataFrame: more than one column is selected for clustering (e.g. df[["title_clean","desc_clean"]])
    top_n_terms[int]: the top n terms in each cluster to be printed out
    ngram_range [tuple(min_n, max_n)]: The lower and upper boundary of the range of n-values for different n-grams to be extracted
                                   - [default] ngram_range of (1, 1) means only unigrams, 
                                   - ngram_range of (1, 2) means unigrams and bigrams, 
                                   - ngram_range of (2, 2) means only bigram
    fe_type[string/None]: Feature extraction type: Choose "bagofwords" for bow or None for default tfidf method
    n_clusters[None/int]: number of clusters. Choose None for option (2)  
    max_n_clusters[None/int]: max number of clusters. Choose None for option (1)  
    """   
    #call feature extraction function    
    ascending = None 
    X = feature_extraction(column,ngram_range,ascending,fe_type)[0]
    X = X.drop(index='sum')
    vec_type = feature_extraction(column,ngram_range,ascending,fe_type)[1]

    #user provides the number of clusters        
    if n_clusters != None:
        model = KMeans(n_clusters = n_clusters, random_state=42)
        model.fit_predict(X)
        labels = model.labels_

        silhouette_score = metrics.silhouette_score(X, labels,random_state=42)
        print("Silhouette score for",n_clusters,"clusters is",round(silhouette_score,3))
        
            
    #user provides the maximum number of clusters 
    if max_n_clusters != None:
        for n_clusters in range(2,max_n_clusters+1): 

            model = KMeans(n_clusters = n_clusters, random_state=42)
            model.fit_predict(X)
            labels = model.labels_

            silhouette_avg = metrics.silhouette_score(X, labels,random_state=42)
            print("For n_clusters =", n_clusters,"The silhouette_score is :", round(silhouette_avg,3))

            silhouette_avg_list.append(silhouette_avg)
            n_clusters_list.append(n_clusters)


        for i in range(len(n_clusters_list)):
            dicts[n_clusters_list[i]] = silhouette_avg_list[i]

        n_clusters_max = max(dicts,key=dicts.get)
        silhouette_avg_max = max(dicts.values())

        model = KMeans(n_clusters = n_clusters_max, random_state=42)
        model.fit_predict(X)
        labels = model.labels_
        n_clusters = n_clusters_max
        print("\nThe optimal number of clusters selected is",n_clusters_max,"with silhouette_score of",round(silhouette_avg_max,3),"\n") 
        
    print("Top",top_n_terms,"terms per cluster:")
    order_centroids = model.cluster_centers_.argsort()[:, ::-1] #sort by descending order
    terms = vec_type.get_feature_names()
    for i in range(n_clusters):
        print("Cluster %d:" % i)
        print(['%s' % terms[ind] for ind in order_centroids[i, :top_n_terms]]) #top n terms in each cluster
        print("\n")
   
               
    return labels

#### Case 1: user provides the number of clusters ####

In [41]:
#feature extraction
column = df1[["title_clean","desc_clean"]]

#k means clustering
df1["cluster"] = kmeans_clustering(column,top_n_terms=10,ngram_range=None,fe_type = "bagofwords",n_clusters=5,max_n_clusters=None)
df1

Silhouette score for 5 clusters is 0.77
Top 10 terms per cluster:
Cluster 0:
['user', 'suite', 'requirement', 'new', 'recipe', 'report', 'client', 'run', 'result', 'group']


Cluster 1:
['none', 'version', 'ssl', 'wrong', 'sslerror', 'number', 'rpyc', 'category', 'simple', 'check']


Cluster 2:
['gv', 'arkos', 'document', 'html', 'latex', 'log', 'root', 'driver', 'gvcheck', 'rw']


Cluster 3:
['py', 'mainthread', 'campaign', 'info', 'debug', 'download', 'anvilplugin', 'git', 'client', 'manager']


Cluster 4:
['yolo', 'ireq', 'rfc', 'sparse', 'dis', 'en', 'dense', 'jpeg', 'format', 'generate']




Unnamed: 0,title_clean,desc_clean,cluster
0,provide method update field git repo automatic...,please provide way update field git repo file ...,0
1,suite terminates executing test,suite finished executing test error observed s...,0
2,cloning defect another working,trying clone defect another get message clonin...,0
3,testing,retest function,0
4,testing consultation,enter support needed,0
...,...,...,...
899,import gc time global domain artifact ehl,hello please import time global domain time kp...,0
900,kpi metric extract kpi metric trend across cycle,hi team thank providing kpi metric feature sto...,0
901,ability clone schedule suite program,schedule suite allow user clone suite recipe a...,0
902,kpi metric enhance kpi feature graph test kpi ...,hi team thank providing kpi feature plot kpi m...,0


In [42]:
column = df1["title_clean"]

#k means clustering
df1["cluster"] = kmeans_clustering(column,top_n_terms=10,ngram_range=None,fe_type="bagofwords",n_clusters=5,max_n_clusters=None)
df1

Silhouette score for 5 clusters is 0.041
Top 10 terms per cluster:
Cluster 0:
['result', 'planned', 'api', 'kpi', 'page', 'plugin', 'entry', 'filter', 'save', 'unable']


Cluster 1:
['passing', 'rate', 'requirement', 'chart', 'coverage', 'planned', 'detail', 'reflect', 'correct', 'block']


Cluster 2:
['new', 'suite', 'create', 'recipe', 'client', 'planning', 'unable', 'schedule', 'failed', 'bronze']


Cluster 3:
['requirement', 'dng', 'link', 'yocto', 'help', 'management', 'adl', 'configure', 'import', 'gc']


Cluster 4:
['email', 'reporting', 'report', 'component', 'chart', 'custom', 'template', 'defect', 'ui', 'failed']




Unnamed: 0,title_clean,desc_clean,cluster
0,provide method update field git repo automatic...,please provide way update field git repo file ...,2
1,suite terminates executing test,suite finished executing test error observed s...,2
2,cloning defect another working,trying clone defect another get message clonin...,2
3,testing,retest function,2
4,testing consultation,enter support needed,2
...,...,...,...
899,import gc time global domain artifact ehl,hello please import time global domain time kp...,2
900,kpi metric extract kpi metric trend across cycle,hi team thank providing kpi metric feature sto...,2
901,ability clone schedule suite program,schedule suite allow user clone suite recipe a...,2
902,kpi metric enhance kpi feature graph test kpi ...,hi team thank providing kpi feature plot kpi m...,2


#### Case 2: user provides max number of clusters ### 

In [46]:
column = df1[["title_clean","desc_clean"]]

#k means clustering
df1["cluster"] = kmeans_clustering(column,top_n_terms=10,ngram_range=None,fe_type ="bagofwords",n_clusters=None,max_n_clusters=20)
df1


For n_clusters = 2 The silhouette_score is : 0.909
For n_clusters = 3 The silhouette_score is : 0.878
For n_clusters = 4 The silhouette_score is : 0.853
For n_clusters = 5 The silhouette_score is : 0.77
For n_clusters = 6 The silhouette_score is : 0.09
For n_clusters = 7 The silhouette_score is : 0.772
For n_clusters = 8 The silhouette_score is : 0.591
For n_clusters = 9 The silhouette_score is : 0.189
For n_clusters = 10 The silhouette_score is : 0.647
For n_clusters = 11 The silhouette_score is : 0.689
For n_clusters = 12 The silhouette_score is : 0.605
For n_clusters = 13 The silhouette_score is : 0.618
For n_clusters = 14 The silhouette_score is : 0.472
For n_clusters = 15 The silhouette_score is : 0.044
For n_clusters = 16 The silhouette_score is : 0.378
For n_clusters = 17 The silhouette_score is : 0.168
For n_clusters = 18 The silhouette_score is : 0.276
For n_clusters = 19 The silhouette_score is : 0.205
For n_clusters = 20 The silhouette_score is : 0.096

The optimal number of

Unnamed: 0,title_clean,desc_clean,cluster
0,provide method update field git repo automatic...,please provide way update field git repo file ...,1
1,suite terminates executing test,suite finished executing test error observed s...,1
2,cloning defect another working,trying clone defect another get message clonin...,1
3,testing,retest function,1
4,testing consultation,enter support needed,1
...,...,...,...
899,import gc time global domain artifact ehl,hello please import time global domain time kp...,1
900,kpi metric extract kpi metric trend across cycle,hi team thank providing kpi metric feature sto...,1
901,ability clone schedule suite program,schedule suite allow user clone suite recipe a...,1
902,kpi metric enhance kpi feature graph test kpi ...,hi team thank providing kpi feature plot kpi m...,1


In [44]:
#### Case 2: user provides max number of clusters ### 

column = df1["title_clean"]

#k means clustering
df1["cluster"] = kmeans_clustering(column,top_n_terms=10,ngram_range=None,fe_type ="bagofwords",n_clusters=None,max_n_clusters=20)
df1

For n_clusters = 2 The silhouette_score is : 0.034
For n_clusters = 3 The silhouette_score is : 0.033
For n_clusters = 4 The silhouette_score is : 0.043
For n_clusters = 5 The silhouette_score is : 0.041
For n_clusters = 6 The silhouette_score is : 0.028
For n_clusters = 7 The silhouette_score is : 0.047
For n_clusters = 8 The silhouette_score is : 0.042
For n_clusters = 9 The silhouette_score is : 0.05
For n_clusters = 10 The silhouette_score is : 0.03
For n_clusters = 11 The silhouette_score is : 0.04
For n_clusters = 12 The silhouette_score is : 0.034
For n_clusters = 13 The silhouette_score is : 0.038
For n_clusters = 14 The silhouette_score is : 0.032
For n_clusters = 15 The silhouette_score is : 0.021
For n_clusters = 16 The silhouette_score is : 0.035
For n_clusters = 17 The silhouette_score is : 0.008
For n_clusters = 18 The silhouette_score is : 0.034
For n_clusters = 19 The silhouette_score is : -0.011
For n_clusters = 20 The silhouette_score is : 0.038

The optimal number of

Unnamed: 0,title_clean,desc_clean,cluster
0,provide method update field git repo automatic...,please provide way update field git repo file ...,0
1,suite terminates executing test,suite finished executing test error observed s...,3
2,cloning defect another working,trying clone defect another get message clonin...,3
3,testing,retest function,3
4,testing consultation,enter support needed,3
...,...,...,...
899,import gc time global domain artifact ehl,hello please import time global domain time kp...,3
900,kpi metric extract kpi metric trend across cycle,hi team thank providing kpi metric feature sto...,3
901,ability clone schedule suite program,schedule suite allow user clone suite recipe a...,3
902,kpi metric enhance kpi feature graph test kpi ...,hi team thank providing kpi feature plot kpi m...,3


### ii) LDA

In [47]:
df1 = df1.drop("cluster",axis=1)
df1.head()

Unnamed: 0,title_clean,desc_clean
0,provide method update field git repo automatic...,please provide way update field git repo file ...
1,suite terminates executing test,suite finished executing test error observed s...
2,cloning defect another working,trying clone defect another get message clonin...
3,testing,retest function
4,testing consultation,enter support needed


In [56]:
# Implementation of LDA:
from sklearn.decomposition import LatentDirichletAllocation

def lda(column,n_components,top_n_terms,ngram_range=None):
    """
    LDA for unsupervised learning. Bag of words is selected for feature extraction
    params:
    column [series/DataFrame]: column(s) selected for lda
                        - series: only one column is selected for lda (e.g. df["title_clean"])
                        - DataFrame: more than one column is selected for lda (e.g. df[["title_clean","desc_clean"]])
    n_components[int]: the number of topics/clusters used in the lda_model
    top_n_terms[int]: the top n terms in each topic/cluster to be printed out
    ngram_range [tuple(min_n, max_n)]: The lower and upper boundary of the range of n-values for different n-grams to be extracted
                                   - [default] ngram_range of (1, 1) means only unigrams, 
                                   - ngram_range of (1, 2) means unigrams and bigrams, 
                                   - ngram_range of (2, 2) means only bigram
    
    """
    
    #feature extraction
    ascending = None
    fe_type = "bagofwords"
    vec_type = feature_extraction(column,ngram_range,ascending,fe_type)[1]
    vectorized = feature_extraction(column,ngram_range,ascending,fe_type)[2]

    # Create object for the LDA class 
    lda_model = LatentDirichletAllocation(n_components, random_state = 42)  
    lda_model.fit(vectorized)
    
    # Components_ gives us our topic distribution 
    topic_words = lda_model.components_

    # Top n words for a topic

    for i,topic in enumerate(topic_words):
        print(f"The top {top_n_terms} words for topic #{i}")
        print([vec_type.get_feature_names()[index] for index in topic.argsort()[-top_n_terms:]])
        print("\n")
        
    topic_results = lda_model.transform(vectorized) #probabilities of doc belonging to particular topic
    
    
    return topic_results.argmax(axis=1)

In [57]:
#user provides number of component and top n terms in each cluster/topic
column = df1["title_clean"]

#LDA
df1["topic"] = lda(column,n_components=5,top_n_terms=10,ngram_range=None)
df1

The top 10 words for topic #0
['schedule', 'group', 'dng', 'link', 'list', 'yocto', 'email', 'component', 'new', 'report']


The top 10 words for topic #1
['hostname', 'issue', 'fail', 'automation', 'unable', 'plugin', 'window', 'host', 'client', 'log']


The top 10 words for topic #2
['management', 'page', 'run', 'planning', 'bronze', 'recipe', 'suite', 'requirement', 'ehl', 'result']


The top 10 words for topic #3
['cloning', 'api', 'python', 'error', 'tgl', 'new', 'defect', 'kpi', 'result', 'create']


The top 10 words for topic #4
['failed', 'link', 'help', 'adl', 'new', 'dng', 'chart', 'email', 'requirement', 'reporting']




Unnamed: 0,title_clean,desc_clean,topic
0,provide method update field git repo automatic...,please provide way update field git repo file ...,4
1,suite terminates executing test,suite finished executing test error observed s...,1
2,cloning defect another working,trying clone defect another get message clonin...,3
3,testing,retest function,4
4,testing consultation,enter support needed,4
...,...,...,...
899,import gc time global domain artifact ehl,hello please import time global domain time kp...,2
900,kpi metric extract kpi metric trend across cycle,hi team thank providing kpi metric feature sto...,3
901,ability clone schedule suite program,schedule suite allow user clone suite recipe a...,0
902,kpi metric enhance kpi feature graph test kpi ...,hi team thank providing kpi feature plot kpi m...,3


In [58]:
column = df1[["title_clean","desc_clean"]]

#LDA
df1["topic"] = lda(column,n_components=5,top_n_terms=10,ngram_range=None)
df1

The top 10 words for topic #0
['daily', 'automation', 'new', 'run', 'data', 'need', 'bronze', 'user', 'suite', 'recipe']


The top 10 words for topic #1
['arkos', 'local', 'id', 'sh', 'service', 'usr', 'root', 'otc', 'start', 'gv']


The top 10 words for topic #2
['adl', 'sve', 'tgl', 'group', 'create', 'dng', 'new', 'yocto', 'program', 'requirement']


The top 10 words for topic #3
['file', 'host', 'version', 'python', 'error', 'py', 'run', 'suite', 'log', 'client']


The top 10 words for topic #4
['defect', 'chart', 'component', 'user', 'reporting', 'page', 'planned', 'result', 'email', 'report']




Unnamed: 0,title_clean,desc_clean,topic
0,provide method update field git repo automatic...,please provide way update field git repo file ...,3
1,suite terminates executing test,suite finished executing test error observed s...,3
2,cloning defect another working,trying clone defect another get message clonin...,0
3,testing,retest function,1
4,testing consultation,enter support needed,2
...,...,...,...
899,import gc time global domain artifact ehl,hello please import time global domain time kp...,2
900,kpi metric extract kpi metric trend across cycle,hi team thank providing kpi metric feature sto...,0
901,ability clone schedule suite program,schedule suite allow user clone suite recipe a...,0
902,kpi metric enhance kpi feature graph test kpi ...,hi team thank providing kpi feature plot kpi m...,0


### iii)  NMF factorization

In [59]:
df1 = df1.drop(["topic"],axis=1)
df1.head()

Unnamed: 0,title_clean,desc_clean
0,provide method update field git repo automatic...,please provide way update field git repo file ...
1,suite terminates executing test,suite finished executing test error observed s...
2,cloning defect another working,trying clone defect another get message clonin...
3,testing,retest function
4,testing consultation,enter support needed


In [60]:
from sklearn.decomposition import NMF

def nmf(column,n_components,top_n_terms,fe_type,ngram_range=None):
    """
    Non-negative matrix factorization for unsupervised learning.
    params:
    column [series/DataFrame]: column(s) selected for NMF 
                        - series: only one column is selected for NMF (e.g. df["title_clean"])
                        - DataFrame: more than one column is selected for NMF (e.g. df[["title_clean","desc_clean"]])
    n_components[int]: the number of topics/clusters used in NMF
    top_n_terms[int]: the top n terms in each topic/cluster to be printed out
    fe_type[string/None]: Feature extraction type: Choose "bagofwords" for bow or None for default tfidf method
    ngram_range [tuple(min_n, max_n)]: The lower and upper boundary of the range of n-values for different n-grams to be extracted
                                   - [default] ngram_range of (1, 1) means only unigrams, 
                                   - ngram_range of (1, 2) means unigrams and bigrams, 
                                   - ngram_range of (2, 2) means only bigram
    """
    #feature extraction
    ngram_range = None
    ascending = None
    vec_type = feature_extraction(column,ngram_range,ascending,fe_type)[1]
    vectorized = feature_extraction(column,ngram_range,ascending,fe_type)[2]

    # Create object for the NMF class 
    nmf_model = NMF(n_components,random_state=42)
    nmf_model.fit(vectorized)
    
    # Components_ gives us our topic distribution 
    topic_words = nmf_model.components_

    # Top n words for a topic

    for i,topic in enumerate(topic_words):
        print(f"The top {top_n_terms} words for topic #{i}")
        print([vec_type.get_feature_names()[index] for index in topic.argsort()[-top_n_terms:]])
        print("\n")
        
    topic_results = nmf_model.transform(vectorized) 
    
    return topic_results.argmax(axis=1)

In [61]:
#user provides number of component and top n terms in each cluster/topic
column = df1["title_clean"]

#NMF
df1["topic"] = nmf(column,n_components=5,top_n_terms=10,fe_type="bagofwords",ngram_range=None)
df1

The top 10 words for topic #0
['upload', 'save', 'plugin', 'filter', 'python', 'page', 'kpi', 'api', 'planned', 'result']


The top 10 words for topic #1
['adl', 'page', 'sync', 'import', 'ee', 'spr', 'oyb', 'dng', 'management', 'requirement']


The top 10 words for topic #2
['defect', 'group', 'enable', 'template', 'custom', 'chart', 'component', 'report', 'reporting', 'email']


The top 10 words for topic #3
['adl', 'suite', 'ppiv', 'name', 'window', 'pae', 'user', 'program', 'create', 'new']


The top 10 words for topic #4
['help', 'tsn', 'planning', 'adl', 'tgl', 'time', 'dng', 'link', 'ehl', 'yocto']






Unnamed: 0,title_clean,desc_clean,topic
0,provide method update field git repo automatic...,please provide way update field git repo file ...,3
1,suite terminates executing test,suite finished executing test error observed s...,3
2,cloning defect another working,trying clone defect another get message clonin...,2
3,testing,retest function,0
4,testing consultation,enter support needed,3
...,...,...,...
899,import gc time global domain artifact ehl,hello please import time global domain time kp...,4
900,kpi metric extract kpi metric trend across cycle,hi team thank providing kpi metric feature sto...,0
901,ability clone schedule suite program,schedule suite allow user clone suite recipe a...,3
902,kpi metric enhance kpi feature graph test kpi ...,hi team thank providing kpi feature plot kpi m...,0


In [62]:
#user provides number of component and top n terms in each cluster/topic
# #feature extraction
column = df1[["title_clean","desc_clean"]]

#NMF
df1["topic"] = nmf(column,n_components=5,top_n_terms=10,fe_type="bagofwords",ngram_range=None)
df1

The top 10 words for topic #0
['service', 'client', 'rpyc', 'category', 'sslerror', 'wrong', 'number', 'ssl', 'version', 'none']


The top 10 words for topic #1
['gvcheck', 'rw', 'var', 'log', 'root', 'latex', 'html', 'document', 'arkos', 'gv']


The top 10 words for topic #2
['generate', 'format', 'jpeg', 'dense', 'en', 'dis', 'sparse', 'rfc', 'ireq', 'yolo']


The top 10 words for topic #3
['encoder', 'fc', 'video', 'slice', 'decoder', 'auditt', 'name', 'null', 'id', 'otc']


The top 10 words for topic #4
['check', 'wait', 'ssh', 'app', 'utils', 'running', 'core', 'info', 'remaining', 'second']






Unnamed: 0,title_clean,desc_clean,topic
0,provide method update field git repo automatic...,please provide way update field git repo file ...,3
1,suite terminates executing test,suite finished executing test error observed s...,0
2,cloning defect another working,trying clone defect another get message clonin...,3
3,testing,retest function,2
4,testing consultation,enter support needed,3
...,...,...,...
899,import gc time global domain artifact ehl,hello please import time global domain time kp...,3
900,kpi metric extract kpi metric trend across cycle,hi team thank providing kpi metric feature sto...,3
901,ability clone schedule suite program,schedule suite allow user clone suite recipe a...,3
902,kpi metric enhance kpi feature graph test kpi ...,hi team thank providing kpi feature plot kpi m...,3


### Supervised Learning

In [64]:
#VICE dataset
import pandas as pd
path = "C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/VICE/python_ir/"
df= pd.read_csv(path+"sip_sighting_usb_duplicate_ai.csv")
df.head()

Unnamed: 0,id,title,status,reason,merge_id,problem_area,submitted_date,root_caused_date,description,fix_description,comments,rev,tenant,subject,hierarchy_path,parent_id,record_type,cloned_id,record_index
0,22062762,[USB] Running SSP Traffic on mSLE causes Bandw...,root_caused,awaiting_fix,,,2017-04-19 00:48:43,2017-05-25 10:09:10,When executing the Get Port Bandwidth command ...,"<p>SIP bugeco:&nbsp;<a href=""https://hsdes.int...","++++146210633 mghender\nHi Sara, Is there any ...",9,sip,sighting,/101411699/1016006011/22062762/,1016006011,parent,22062762.0,1
1,220152061,SPT-H E0: USB2 port missing,rejected,wont_do,,,2017-05-12 00:47:16,,"<pre style=""word-wrap: break-word; white-space...",,,4,sip,sighting,/101411699/1016006011/220152061/,1016006011,parent,220152061.0,2
2,220421258,[Apple - Basin Falls/KBP-H] - xHCI Blocking S3...,rejected,wont_do,,noise.cannot_reproduce,2017-07-06 23:33:44,,"<pre style=""word-wrap: break-word; white-space...",Failure cannot be reproduced by customer for a...,,5,sip,sighting,/101411699/1016006011/220421258/,1016006011,parent,220421258.0,3
3,220634430,[CNP_B0]:USB3 Loopback - xhci_debug_device tes...,rejected,cannot_reproduce,,noise.cannot_reproduce,2017-08-14 11:33:10,,"<p style=""font-size: 12.18px;""><span style=""fo...",<p>Rejecting this sighting as non-reproducible...,++++146564279 ppmeher\nFull cutrand log attach...,5,sip,sighting,/101411699/1016006011/220634430/,1016006011,parent,220634430.0,4
4,220634437,[ICL_A0_PO] Python SV: pci_config_registers ac...,root_caused,awaiting_review,,noise.non-issue,2017-08-14 11:36:26,2017-08-19 00:22:26,<p>Trying Recipe sent by Tamir</p><p>pci_confi...,<p>Failure is due to XHCI not coming up so the...,++++136216899 btamir\nHi There is no informati...,11,sip,sighting,/101411699/1016006011/220634437/,1016006011,parent,220634437.0,5


In [14]:
df["problem_area"].value_counts()

noise.validation_tools                    249
noise.duplicated_sighting                 178
bug.logic.xhci                            164
noise.test_content                        126
noise.non-issue                           112
                                         ... 
bug.circuit.isclk                           1
environment.sle.integration                 1
collateral.documentation.industry_spec      1
noise.debug_equipment                       1
bug.logic.usb3                              1
Name: problem_area, Length: 69, dtype: int64

In [65]:
df = df_manipulation(df,how="any",keep="first",cols_tokeep=["title","description","problem_area"],cols_todrop=None,impute_value=None,subset=None)
df

Shape of df before manipulation: (2078, 19)
Shape of df after selecting columns: (2078, 3)
Number of null values in df:
 title            0
description      0
problem_area    96
dtype: int64
Number of null values in df after dropping NA rows:
 title           0
description     0
problem_area    0
dtype: int64
Shape of df after dropping NA rows: (1982, 3)
Number of duplicates in the df: 3
Shape of df after manipulation: (1979, 3)


Unnamed: 0,title,description,problem_area
2,[Apple - Basin Falls/KBP-H] - xHCI Blocking S3...,"<pre style=""word-wrap: break-word; white-space...",noise.cannot_reproduce
3,[CNP_B0]:USB3 Loopback - xhci_debug_device tes...,"<p style=""font-size: 12.18px;""><span style=""fo...",noise.cannot_reproduce
4,[ICL_A0_PO] Python SV: pci_config_registers ac...,<p>Trying Recipe sent by Tamir</p><p>pci_confi...,noise.non-issue
5,[ICL_A0_PO] Not able to get USB3 to train on TC3,<p>TC3 retimer programmed but still not able t...,environment.3rd_party.device
6,[ICL_A0_PO] Python SV: Not able to read write ...,"<p>Updated pythonsv icelake folder on sept, 20...",noise.non-issue
...,...,...,...
2069,[USB3.2][TGP-H][HAPS-80][E2E][ERTL] Gen1x2 u3 ...,"<p><span style=""font-size: 1em;"">RTL drop:&nbs...",noise.cannot_reproduce
2071,[USB3.2][TGP-H][HAPS-80][E2E][ERTL] Gen1 and G...,"<p><span style=""font-size: 1em;"">RTL drop:&nbs...",noise.cannot_reproduce
2072,[ADP-LP][FPGA][RTL0p8] TI Phy USB3 port 1 fail...,<p>Seeing consistent enumeration failure on Po...,environment.rtl.fpga
2076,[SVOS] [BUSTER] [TGP-H] device enumeration failed,<p>SVOS Kernel (BUSTER) :&nbsp;4.19.60<br /></...,noise.validation_tools


In [66]:
# data preprocessing
df["title_cont"] = [word_contractions(text) for text in df["title"]]
df["title_lower"] = [lowercase(text) for text in df["title_cont"]]
df["title_tag"] = [remove_htmltag_url(text) for text in df["title_lower"]]
df["title_rem"] = [remove_irrchar_punc(text,char=None) for text in df["title_tag"]]
df["title_num"] = [remove_num(text) for text in df["title_rem"]]
df["title_white"] = [remove_multwhitespace(text) for text in df["title_num"]]
df["title_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df["title_white"]]
n=10
df["title_freq"] = remove_freqwords(df["title_stop"],n)
df["title_rare"] = remove_rarewords(df["title_freq"],n)
df["title_lemma"] = lemmatize_words(column= df["title_rare"],lemma_type=None)
df["title_clean"] = df["title_lemma"]

df["desc_cont"] = [word_contractions(text) for text in df["description"]]
df["desc_lower"] = [lowercase(text) for text in df["desc_cont"]]
df["desc_tag"] = [remove_htmltag_url(text) for text in df["desc_lower"]]
df["desc_rem"] = [remove_irrchar_punc(text,char=None) for text in df["desc_tag"]]
df["desc_num"] = [remove_num(text) for text in df["desc_rem"]]
df["desc_white"] = [remove_multwhitespace(text) for text in df["desc_num"]]
df["desc_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df["desc_white"]]
n=10
df["desc_freq"] = remove_freqwords(df["desc_stop"],n)
df["desc_rare"] = remove_rarewords(df["desc_freq"],n)
df["desc_lemma"] = lemmatize_words(column= df["desc_rare"],lemma_type=None)
df["desc_clean"] = df["desc_lemma"]

df.head()

Frequent words that are removed from column: {('device', 317), ('icl', 288), ('test', 419), ('fpga', 400), ('xhci', 309), ('u', 397), ('gen', 445), ('x', 330), ('lkf', 386), ('usb', 1329)}
Rare words that are removed from column: {('strm', 1), ('pri', 1), ('ic', 1), ('modify', 1), ('tool', 1), ('simultaneous', 1), ('rxpolarity', 1), ('locations', 1), ('buster', 1), ('writes', 1)}
Frequent words that are removed from column: {('f', 7302), ('x', 12241), ('h', 6273), ('v', 6939), ('b', 7405), ('c', 6713), ('n', 6287), ('z', 6264), ('u', 6780), ('p', 7726)}
Rare words that are removed from column: {('emitted', 1), ('involved', 1), ('synchronize', 1), ('resolving', 1), ('reimaged', 1), ('responsive', 1), ('assumed', 1), ('asynchronously', 1), ('rxpolarity', 1), ('narrowing', 1)}


Unnamed: 0,title,description,problem_area,title_cont,title_lower,title_tag,title_rem,title_num,title_white,title_stop,...,desc_lower,desc_tag,desc_rem,desc_num,desc_white,desc_stop,desc_freq,desc_rare,desc_lemma,desc_clean
2,[Apple - Basin Falls/KBP-H] - xHCI Blocking S3...,"<pre style=""word-wrap: break-word; white-space...",noise.cannot_reproduce,[Apple - Basin Falls/KBP-H] - xHCI Blocking S3...,[apple - basin falls/kbp-h] - xhci blocking s3...,[apple - basin falls/kbp-h] - xhci blocking s3...,apple basin falls kbp h xhci blocking s3...,apple basin falls kbp h xhci blocking s ...,apple basin falls kbp h xhci blocking s resum...,apple basin falls kbp h xhci blocking resum...,...,"<pre style=""word-wrap: break-word; white-space...",basin falls uses a server derived cpu with a k...,basin falls uses a server derived cpu with a k...,basin falls uses a server derived cpu with a k...,basin falls uses a server derived cpu with a k...,basin falls uses server derived cpu kbp ...,basin falls uses server derived cpu kbp pch de...,basin falls uses server derived cpu kbp pch de...,basin fall us server derived cpu kbp pch desig...,basin fall us server derived cpu kbp pch desig...
3,[CNP_B0]:USB3 Loopback - xhci_debug_device tes...,"<p style=""font-size: 12.18px;""><span style=""fo...",noise.cannot_reproduce,[CNP_B0]:USB3 Loopback - xhci_debug_device tes...,[cnp_b0]:usb3 loopback - xhci_debug_device tes...,[cnp_b0]:usb3 loopback - xhci_debug_device tes...,cnp b0 usb3 loopback xhci debug device tes...,cnp b usb loopback xhci debug device tes...,cnp b usb loopback xhci debug device test fai...,cnp b usb loopback xhci debug device test fai...,...,"<p style=""font-size: 12.18px;""><span style=""fo...",xhci_debug_device test failed on cnl b0 platfo...,xhci debug device test failed on cnl b0 platfo...,xhci debug device test failed on cnl b platfo...,xhci debug device test failed on cnl b platfor...,xhci debug device test failed cnl b platform...,xhci debug device test failed cnl platform fai...,xhci debug device test failed cnl platform fai...,xhci debug device test failed cnl platform fai...,xhci debug device test failed cnl platform fai...
4,[ICL_A0_PO] Python SV: pci_config_registers ac...,<p>Trying Recipe sent by Tamir</p><p>pci_confi...,noise.non-issue,[ICL_A0_PO] Python SV: pci_config_registers ac...,[icl_a0_po] python sv: pci_config_registers ac...,[icl_a0_po] python sv: pci_config_registers ac...,icl a0 po python sv pci config registers ac...,icl a po python sv pci config registers ac...,icl a po python sv pci config registers acces...,icl po python sv pci config registers acces...,...,<p>trying recipe sent by tamir</p><p>pci_confi...,trying recipe sent by tamir pci_config_registe...,trying recipe sent by tamir pci config registe...,trying recipe sent by tamir pci config registe...,trying recipe sent by tamir pci config registe...,trying recipe sent tamir pci config register...,trying recipe sent tamir pci config registers ...,trying recipe sent tamir pci config registers ...,trying recipe sent tamir pci config register a...,trying recipe sent tamir pci config register a...
5,[ICL_A0_PO] Not able to get USB3 to train on TC3,<p>TC3 retimer programmed but still not able t...,environment.3rd_party.device,[ICL_A0_PO] Not able to get USB3 to train on TC3,[icl_a0_po] not able to get usb3 to train on tc3,[icl_a0_po] not able to get usb3 to train on tc3,icl a0 po not able to get usb3 to train on tc3,icl a po not able to get usb to train on tc,icl a po not able to get usb to train on tc,icl po able get usb train tc,...,<p>tc3 retimer programmed but still not able t...,tc3 retimer programmed but still not able to g...,tc3 retimer programmed but still not able to g...,tc retimer programmed but still not able to g...,tc retimer programmed but still not able to ge...,tc retimer programmed still able get usb...,tc retimer programmed still able get usb train...,tc retimer programmed still able get usb train...,tc retimer programmed still able get usb train...,tc retimer programmed still able get usb train...
6,[ICL_A0_PO] Python SV: Not able to read write ...,"<p>Updated pythonsv icelake folder on sept, 20...",noise.non-issue,[ICL_A0_PO] Python SV: Not able to read write ...,[icl_a0_po] python sv: not able to read write ...,[icl_a0_po] python sv: not able to read write ...,icl a0 po python sv not able to read write ...,icl a po python sv not able to read write ...,icl a po python sv not able to read write mgp...,icl po python sv able read write mgphy ...,...,"<p>updated pythonsv icelake folder on sept, 20...","updated pythonsv icelake folder on sept, 20th....",updated pythonsv icelake folder on sept 20th ...,updated pythonsv icelake folder on sept th ...,updated pythonsv icelake folder on sept th sin...,updated pythonsv icelake folder sept th sinc...,updated pythonsv icelake folder sept th since ...,updated pythonsv icelake folder sept th since ...,updated pythonsv icelake folder sept th since ...,updated pythonsv icelake folder sept th since ...


In [67]:
df = df[["title_clean","desc_clean","problem_area"]]
df

Unnamed: 0,title_clean,desc_clean,problem_area
2,apple basin fall kbp h blocking resume warm re...,basin fall us server derived cpu kbp pch desig...,noise.cannot_reproduce
3,cnp b loopback debug fails missed event success,xhci debug device test failed cnl platform fai...,noise.cannot_reproduce
4,po python sv pci config register access failed,trying recipe sent tamir pci config register a...,noise.non-issue
5,po able get train tc,tc retimer programmed still able get usb train...,environment.3rd_party.device
6,po python sv able read write mgphy register py...,updated pythonsv icelake folder sept th since ...,noise.non-issue
...,...,...,...
2069,tgp h hap e e ertl failing seabright link drop,rtl drop amr corp intel com fmdeg vice la wave...,noise.cannot_reproduce
2071,tgp h hap e e ertl rtd test failing seabright ...,rtl drop amr corp intel com fmdeg vice la wave...,noise.cannot_reproduce
2072,adp lp rtl p ti phy port fails enumeration win...,seeing consistent enumeration failure port win...,environment.rtl.fpga
2076,svos tgp h enumeration failed,svos kernel buster svfs version svfs module sv...,noise.validation_tools


In [87]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import metrics
import joblib
import numpy as np 


def supervised_lng(X,y,test_size,ngram_range=None,fe_type=None,model_type=None,ascend=None,save_path=None):
    """
    Consists of 3 supervised machine learning methods: RandomForest (Default), Naive Bayes(optional, SVM (optional)
    
    X[series/DataFrame]: column(s) of text for supervised learning
                        - series: only one column is selected (e.g. df["title_clean"])
                        - DataFrame: more than one column is selected(e.g. df[["title_clean","desc_clean"]])
    y[series]: target 
    test_size[float/int]: If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. 
                          If int, represents the absolute number of test samples.
    ngram_range [tuple(min_n, max_n)]: The lower and upper boundary of the range of n-values for different n-grams to be extracted
                                       -[DEFAULT] ngram_range of (1, 1) means only unigrams, 
                                       - ngram_range of (1, 2) means unigrams and bigrams, 
                                       - ngram_range of (2, 2) means only bigram
    fe_type[None/string]: Feature extraction type: Choose "bagofwords" or None for default tfidf method
    model_type[None/string]: Choose ML algorithm 
                            - None (Default algorithm is Random Forest)
                            - 'NB'(To choose Naive Bayes as ML algorithm), 
                            - 'SVM'(To choose Support Vector Machine as ML algorithm)
    ascend[True/False/None]:  - None (Default: Confusion matrix is arranged in alphabetical order)
                              - True(Confusion matrix arranged in ascending order of accuracy % per label), 
                              - False(Confusion matrix arranged in descending order of accuracy % per label)  
    save_path[None/string]: Path to save model
                            - None (Default - Model is not saved)
                            - String (Model is saved as model.joblib in the save_path specified as a string)
        
    """
    #TRAIN-TEST SPLIT
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = 42)
    print("Train-test split completed with",(1-test_size)*100,"-",test_size*100,"split in train-test")
    print("Shape of X_train is:", X_train.shape)
    print("Shape of X_test is:",X_test.shape)
    print("Shape of y_train is:",y_train.shape)
    print("Shape of y_test is:",y_test.shape)
    
    if type(X_train) == pd.DataFrame: #concat the columns into one string if there is more than one column 
        X_train = X_train.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)         
#         display(X_train)
    if type(X_test) == pd.DataFrame: #concat the columns into one string if there is more than one column 
        X_test = X_test.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
#         display(X_test)
    
    #FEATURE EXTRACTION
    column = X_train       
    ascending = None
    #fit_transform X_train
    X_train = feature_extraction(column,ngram_range,ascending,fe_type)[2]
    #only transform X_test
    vec_type = feature_extraction(column,ngram_range,ascending,fe_type)[1]
    X_test = vec_type.transform(X_test)
    
    
    print("Shape of X_train after feature extraction:",X_train.shape)
    print("Shape of X_test after feature extraction:",X_test.shape)
    
    #MODEL BUILDING
    if model_type == None:
        #random forest is chosen by default
        model = RandomForestClassifier(random_state = 42)
    
    if model_type == "NB":
        model = MultinomialNB()
                   
    if model_type == "SVM":
        model = svm.SVC(random_state = 42)
    
    model.fit(X_train, y_train) 
    
    #MODEL SAVING
    if save_path != None:
        joblib.dump(model, path + "model.joblib")
        print("Model saved!")

    # predicting test set results
    y_pred = model.predict(X_test)

    # MODEL EVALUATION
    print('Overall accuracy achieved is ' + str(round(metrics.accuracy_score(y_test, y_pred)*100,2)) + "%")
    print("Classification report:\n",metrics.classification_report(y_test, y_pred,zero_division=0))

    #confusion matrix with accuracies for each label
    class_accuracies = []

    for class_ in y_test.sort_values(ascending= True).unique():
        class_acc = round(np.mean(y_pred[y_test == class_] == class_)*100,2)
        class_accuracies.append(class_acc)
    class_acc = pd.DataFrame(class_accuracies,index=y_test.sort_values(ascending= True).unique(),columns= ["Accuracy %"])

    cf_matrix = pd.DataFrame(
        metrics.confusion_matrix(y_test, y_pred, labels= y_test.sort_values(ascending= True).unique()), 
        index=y_test.sort_values(ascending= True).unique(), 
        columns=y_test.sort_values(ascending= True).unique()
    )
    
    if ascend == None:
        cf_matrix = pd.concat([cf_matrix,class_acc],axis=1)
    else:
        cf_matrix = pd.concat([cf_matrix,class_acc],axis=1).sort_values(by=['Accuracy %'], ascending=ascend)
          
    display(cf_matrix)     
    

In [88]:
# X = df["title_clean"]
X= df[["title_clean","desc_clean"]]
y= df["problem_area"]
test_size = 0.3
ngram_range = None
fe_type = "bagofwords"
model_type = None
save_path = "C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/VICE/python_ir/"
ascend= None
supervised_lng(X,y,test_size,ngram_range,fe_type,model_type,ascend,save_path)


Train-test split completed with 70.0 - 30.0 split in train-test
Shape of X_train is: (1385, 2)
Shape of X_test is: (594, 2)
Shape of y_train is: (1385,)
Shape of y_test is: (594,)
Shape of X_train after feature extraction: (1385, 245360)
Shape of X_test after feature extraction: (594, 245360)
Model saved!
Overall accuracy achieved is 24.75%
Classification report:
                                                 precision    recall  f1-score   support

                             bug.circuit.isclk       0.00      0.00      0.00         1
                       bug.circuit.modphy-usb3       0.00      0.00      0.00         5
                              bug.circuit.usb2       0.00      0.00      0.00         1
                                bug.duplicated       0.00      0.00      0.00        15
                         bug.duplicated_unique       0.00      0.00      0.00         1
                                 bug.logic.exi       0.00      0.00      0.00         1
                

Unnamed: 0,bug.circuit.isclk,bug.circuit.modphy-usb3,bug.circuit.usb2,bug.duplicated,bug.duplicated_unique,bug.logic.exi,bug.logic.modphy,bug.logic.modphy-usb3,bug.logic.pmc,bug.logic.usb3,...,noise.duplicated_sighting,noise.new_request,noise.non-issue,noise.test_content,noise.test_content.integration,noise.validation_tools,noise.validation_tools.maestro,noise.validation_tools.sv_fw,other,Accuracy %
bug.circuit.isclk,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
bug.circuit.modphy-usb3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,2,0,0,0,0.0
bug.circuit.usb2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0.0
bug.duplicated,0,0,0,0,0,0,0,0,0,0,...,2,0,1,0,0,4,0,0,0,0.0
bug.duplicated_unique,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0.0
bug.logic.exi,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0.0
bug.logic.modphy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
bug.logic.modphy-usb3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0.0
bug.logic.pmc,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
bug.logic.usb3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0.0


In [71]:
X = df["title_clean"]
# X= df[["title_clean","desc_clean"]]
y= df["problem_area"]
test_size = 0.3
ngram_range = None
fe_type = "bagofwords"
model_type = None
save_path = "C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/VICE/python_ir/"
ascend= None
supervised_lng(X,y,test_size,ngram_range,fe_type,model_type,ascend,save_path)


Train-test split completed with 70.0 - 30.0 split in train-test
Shape of X_train is: (1385,)
Shape of X_test is: (594,)
Shape of y_train is: (1385,)
Shape of y_test is: (594,)
Shape of X_train after feature extraction: (1385, 2011)
Shape of X_test after feature extraction: (594, 2011)
Model saved!
Overall accuracy achieved is 26.26%
Classification report:
                                                 precision    recall  f1-score   support

                             bug.circuit.isclk       0.00      0.00      0.00         1
                       bug.circuit.modphy-usb3       0.00      0.00      0.00         5
                              bug.circuit.usb2       0.00      0.00      0.00         1
                                bug.duplicated       0.07      0.07      0.07        15
                         bug.duplicated_unique       0.00      0.00      0.00         1
                                 bug.logic.exi       0.00      0.00      0.00         1
                        

Unnamed: 0,bug.circuit.isclk,bug.circuit.modphy-usb3,bug.circuit.usb2,bug.duplicated,bug.duplicated_unique,bug.logic.exi,bug.logic.modphy,bug.logic.modphy-usb3,bug.logic.pmc,bug.logic.usb3,...,noise.duplicated_sighting,noise.new_request,noise.non-issue,noise.test_content,noise.test_content.integration,noise.validation_tools,noise.validation_tools.maestro,noise.validation_tools.sv_fw,other,Accuracy %
bug.circuit.isclk,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
bug.circuit.modphy-usb3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0.0
bug.circuit.usb2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
bug.duplicated,0,0,0,1,0,0,0,0,0,0,...,3,0,0,3,0,2,0,0,0,6.67
bug.duplicated_unique,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
bug.logic.exi,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0.0
bug.logic.modphy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
bug.logic.modphy-usb3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0.0
bug.logic.pmc,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
bug.logic.usb3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0.0


In [89]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
import joblib
import numpy as np 


def deep_lng(X,y,test_size,ngram_range,fe_type,hidden_layer_sizes=None,activation=None,solver=None,learning_rate=None,max_iter=None,ascend=None,save_path=None):
    """
     Deep learning method: MultiLayer Perceptron

    X[series/DataFrame]: column(s) of text for deep learning
                        - series: only one column is selected (e.g. df["title_clean"])
                        - DataFrame: more than one column is selected(e.g. df[["title_clean","desc_clean"]])   
    y[series]: target
    test_size[float/int]: If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. 
                          If int, represents the absolute number of test samples.
    ngram_range [tuple(min_n, max_n)]: The lower and upper boundary of the range of n-values for different n-grams to be extracted
                                       - ngram_range of (1, 1) means only unigrams, 
                                       - ngram_range of (1, 2) means unigrams and bigrams, 
                                       - ngram_range of (2, 2) means only bigram
    fe_type[string]: Feature extraction type: Choose "bagofwords" or "tfidf" method
    hidden_layer_sizes[tuple],default = (100): To set the number of layers and the number of nodes.
                                               Each element in the tuple represents the number of nodes,
                                               length of tuple denotes the total number of hidden layers in the network
    activation["identity", "logistic", "tanh","relu"], default="relu": Activation function for the hidden layer.
    solver["lbfgs", "sgd", "adam"], default="adam": The solver for weight optimization.
    learning_rate["constant", "invscaling", "adaptive"], default="constant": Learning rate schedule for weight updates
    max_iter[int], default=200: Maximum number of iterations. The solver iterates until convergence or this number of iterations.
    ascend [True/False/None]: - None (Default: Confusion matrix is arranged in alphabetical order)
                                 - True(Confusion matrix arranged in ascending order of accuracy % per label), 
                                 - False(Confusion matrix arranged in descending order of accuracy % per label)                            
    save_path[None/string]: Path to save model
                            - None (Default - Model is not saved)
                            - String (Model is saved as model.joblib in the save_path specified as a string)    
    """
    
    #train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = 42)
    print("Train-test split completed with",(1-test_size)*100,"-",test_size*100,"split in train-test")
    print("Shape of X_train is:", X_train.shape)
    print("Shape of X_test is:",X_test.shape)
    print("Shape of y_train is:",y_train.shape)
    print("Shape of y_test is:",y_test.shape)
    
    if type(X_train) == pd.DataFrame: #concat the columns into one string if there is more than one column 
        X_train = X_train.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)         
        
    if type(X_test) == pd.DataFrame: #concat the columns into one string if there is more than one column 
        X_test = X_test.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
        
    #FEATURE EXTRACTION
    column = X_train
    ascending = None
    #fit_transform X_train
    X_train = feature_extraction(column,ngram_range,ascending,fe_type)[2]
    #only transform X_test
    vec_type = feature_extraction(column,ngram_range,ascending,fe_type)[1]
    X_test = vec_type.transform(X_test)
    print("Shape of X_train after feature extraction:",X_train.shape)
    print("Shape of X_test after feature extraction:",X_test.shape)
    
    #MODEL BUILDING
    #default hypermarameters
    if hidden_layer_sizes == None:
        hidden_layer_sizes = (100)
    if activation == None:
        activation = "relu"
    if solver == None:
        solver = "adam"
    if learning_rate == None:
        learning_rate = "constant"
    if max_iter == None:
        max_iter = 200
    
    print("Hidden layer sizes: ", hidden_layer_sizes,", Activation: ",activation,", Solver: ",solver,", Learning rate: ",learning_rate,", Max iteration: ",max_iter)
    
    model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, max_iter=max_iter,verbose = False,random_state=42)
    model.fit(X_train,y_train)
    
    
    #MODEL SAVING
    if save_path != None:
        joblib.dump(model, path + "mlpmodel.joblib")
        print("Model saved!")

    # predicting test set results
    y_pred = model.predict(X_test)

    # MODEL EVALUATION
    print('Overall accuracy achieved is ' + str(round(metrics.accuracy_score(y_test, y_pred)*100,2)) + "%")
    print("Classification report:\n",metrics.classification_report(y_test, y_pred,zero_division=0))

    #confusion matrix with accuracies for each label
    class_accuracies = []

    for class_ in y_test.sort_values(ascending= True).unique():
        class_acc = round(np.mean(y_pred[y_test == class_] == class_)*100,2)
        class_accuracies.append(class_acc)
    class_acc = pd.DataFrame(class_accuracies,index=y_test.sort_values(ascending= True).unique(),columns= ["Accuracy %"])

    cf_matrix = pd.DataFrame(
        metrics.confusion_matrix(y_test, y_pred, labels= y_test.sort_values(ascending= True).unique()), 
        index=y_test.sort_values(ascending= True).unique(), 
        columns=y_test.sort_values(ascending= True).unique()
    )
    
    if ascend == None:
        cf_matrix = pd.concat([cf_matrix,class_acc],axis=1)
    else:
        cf_matrix = pd.concat([cf_matrix,class_acc],axis=1).sort_values(by=['Accuracy %'], ascending=ascend)
          
    display(cf_matrix)     
    

In [97]:
X = df["title_clean"]
y= df["problem_area"]
test_size = 0.3
ngram_range = None
fe_type = None
hidden_layer_sizes = (5,5)
activation= None
solver=None
learning_rate=None
max_iter= None
ascend= False
save_path = "C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/VICE/python_ir/"

deep_lng(X,y,test_size,ngram_range,fe_type,hidden_layer_sizes,activation,solver,learning_rate,max_iter,ascend,save_path)

Train-test split completed with 70.0 - 30.0 split in train-test
Shape of X_train is: (1385,)
Shape of X_test is: (594,)
Shape of y_train is: (1385,)
Shape of y_test is: (594,)
Shape of X_train after feature extraction: (1385, 2011)
Shape of X_test after feature extraction: (594, 2011)
Hidden layer sizes:  (5, 5) , Activation:  relu , Solver:  adam , Learning rate:  constant , Max iteration:  200
Model saved!
Overall accuracy achieved is 16.33%
Classification report:
                                                 precision    recall  f1-score   support

                             bug.circuit.isclk       0.00      0.00      0.00         1
                       bug.circuit.modphy-usb3       0.00      0.00      0.00         5
                              bug.circuit.usb2       0.00      0.00      0.00         1
                                bug.duplicated       0.00      0.00      0.00        15
                         bug.duplicated_unique       0.00      0.00      0.00         1



Unnamed: 0,bug.circuit.isclk,bug.circuit.modphy-usb3,bug.circuit.usb2,bug.duplicated,bug.duplicated_unique,bug.logic.exi,bug.logic.modphy,bug.logic.modphy-usb3,bug.logic.pmc,bug.logic.usb3,...,noise.duplicated_sighting,noise.new_request,noise.non-issue,noise.test_content,noise.test_content.integration,noise.validation_tools,noise.validation_tools.maestro,noise.validation_tools.sv_fw,other,Accuracy %
noise.validation_tools,0,0,0,1,0,0,0,0,0,0,...,8,0,0,0,0,50,0,0,0,64.94
environment.sle.xtor,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,7,0,0,0,41.67
noise.validation_tools.maestro,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,3,7,0,0,31.82
environment.fpga,0,0,0,0,0,0,0,0,0,0,...,5,0,0,0,0,3,0,0,0,25.0
collateral.intel.fw.dekel,0,0,0,0,0,0,0,0,0,0,...,2,0,1,0,0,2,0,0,0,18.18
bug.logic.xhci,0,0,0,0,0,0,0,0,0,0,...,11,0,0,0,0,5,5,0,0,16.28
collateral.intel.fw.bios,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,3,3,0,0,15.79
environment.3rd_party.device,0,0,0,0,0,0,0,0,0,0,...,6,0,1,0,0,5,0,0,0,13.04
noise.duplicated_sighting,0,0,0,0,0,0,0,0,0,0,...,7,0,1,0,0,18,2,0,0,12.28
noise.cannot_reproduce,0,0,0,0,0,0,0,0,0,0,...,5,0,0,0,0,13,0,0,0,9.38


In [98]:
X = df[["title_clean","desc_clean"]]
y= df["problem_area"]
test_size = 0.3
ngram_range = None
fe_type = None
hidden_layer_sizes = (5,5)
activation= None
solver=None
learning_rate=None
max_iter= None
ascend= False
save_path = "C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/VICE/python_ir/"

deep_lng(X,y,test_size,ngram_range,fe_type,hidden_layer_sizes,activation,solver,learning_rate,max_iter,ascend,save_path)

Train-test split completed with 70.0 - 30.0 split in train-test
Shape of X_train is: (1385, 2)
Shape of X_test is: (594, 2)
Shape of y_train is: (1385,)
Shape of y_test is: (594,)
Shape of X_train after feature extraction: (1385, 245360)
Shape of X_test after feature extraction: (594, 245360)
Hidden layer sizes:  (5, 5) , Activation:  relu , Solver:  adam , Learning rate:  constant , Max iteration:  200
Model saved!
Overall accuracy achieved is 15.32%
Classification report:
                                                 precision    recall  f1-score   support

                             bug.circuit.isclk       0.00      0.00      0.00         1
                       bug.circuit.modphy-usb3       0.00      0.00      0.00         5
                              bug.circuit.usb2       0.00      0.00      0.00         1
                                bug.duplicated       0.00      0.00      0.00        15
                         bug.duplicated_unique       0.00      0.00      0.00  



Unnamed: 0,bug.circuit.isclk,bug.circuit.modphy-usb3,bug.circuit.usb2,bug.duplicated,bug.duplicated_unique,bug.logic.exi,bug.logic.modphy,bug.logic.modphy-usb3,bug.logic.pmc,bug.logic.usb3,...,noise.duplicated_sighting,noise.new_request,noise.non-issue,noise.test_content,noise.test_content.integration,noise.validation_tools,noise.validation_tools.maestro,noise.validation_tools.sv_fw,other,Accuracy %
noise.validation_tools.maestro,0,0,0,0,0,0,0,0,0,0,...,1,0,0,5,0,5,11,0,0,50.0
collateral.intel.fw.dekel,0,0,0,0,0,0,0,0,0,0,...,5,0,0,0,0,0,0,0,0,36.36
noise.validation_tools,0,0,0,2,0,0,0,0,0,0,...,16,0,1,17,0,27,2,0,0,35.06
noise.duplicated_sighting,0,0,0,2,0,0,0,1,0,0,...,13,0,8,8,0,8,0,0,0,22.81
noise.test_content,0,0,0,0,0,0,0,0,0,0,...,3,0,3,7,0,10,1,0,0,21.21
environment.sle.xtor,0,0,0,1,0,0,0,0,0,0,...,2,0,2,4,0,1,0,0,0,20.83
bug.logic.xhci,0,0,0,0,0,0,0,0,0,0,...,8,0,5,5,0,5,0,0,0,16.28
noise.non-issue,0,0,0,2,0,0,0,0,0,0,...,5,0,5,6,0,3,1,0,0,15.62
collateral.documenation.mphy_setting,0,0,0,1,0,0,0,1,0,0,...,2,0,0,0,0,0,0,0,0,14.29
environment.fpga,0,0,0,1,0,0,0,0,0,0,...,5,0,1,3,0,0,0,0,1,12.5


### Similarity metrics

In [99]:
#VICE dataset
import pandas as pd
path = "C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/VICE/python_ir/"
df= pd.read_csv(path+"sip_sighting_usb_duplicate_ai.csv")
df.head()

Unnamed: 0,id,title,status,reason,merge_id,problem_area,submitted_date,root_caused_date,description,fix_description,comments,rev,tenant,subject,hierarchy_path,parent_id,record_type,cloned_id,record_index
0,22062762,[USB] Running SSP Traffic on mSLE causes Bandw...,root_caused,awaiting_fix,,,2017-04-19 00:48:43,2017-05-25 10:09:10,When executing the Get Port Bandwidth command ...,"<p>SIP bugeco:&nbsp;<a href=""https://hsdes.int...","++++146210633 mghender\nHi Sara, Is there any ...",9,sip,sighting,/101411699/1016006011/22062762/,1016006011,parent,22062762.0,1
1,220152061,SPT-H E0: USB2 port missing,rejected,wont_do,,,2017-05-12 00:47:16,,"<pre style=""word-wrap: break-word; white-space...",,,4,sip,sighting,/101411699/1016006011/220152061/,1016006011,parent,220152061.0,2
2,220421258,[Apple - Basin Falls/KBP-H] - xHCI Blocking S3...,rejected,wont_do,,noise.cannot_reproduce,2017-07-06 23:33:44,,"<pre style=""word-wrap: break-word; white-space...",Failure cannot be reproduced by customer for a...,,5,sip,sighting,/101411699/1016006011/220421258/,1016006011,parent,220421258.0,3
3,220634430,[CNP_B0]:USB3 Loopback - xhci_debug_device tes...,rejected,cannot_reproduce,,noise.cannot_reproduce,2017-08-14 11:33:10,,"<p style=""font-size: 12.18px;""><span style=""fo...",<p>Rejecting this sighting as non-reproducible...,++++146564279 ppmeher\nFull cutrand log attach...,5,sip,sighting,/101411699/1016006011/220634430/,1016006011,parent,220634430.0,4
4,220634437,[ICL_A0_PO] Python SV: pci_config_registers ac...,root_caused,awaiting_review,,noise.non-issue,2017-08-14 11:36:26,2017-08-19 00:22:26,<p>Trying Recipe sent by Tamir</p><p>pci_confi...,<p>Failure is due to XHCI not coming up so the...,++++136216899 btamir\nHi There is no informati...,11,sip,sighting,/101411699/1016006011/220634437/,1016006011,parent,220634437.0,5


In [100]:
df = df_manipulation(df,how="any",keep="first",cols_tokeep=["title","description"],cols_todrop=None,impute_value=None,subset=None)
df

Shape of df before manipulation: (2078, 19)
Shape of df after selecting columns: (2078, 2)
Number of null values in df:
 title          0
description    0
dtype: int64
Number of null values in df after dropping NA rows:
 title          0
description    0
dtype: int64
Shape of df after dropping NA rows: (2078, 2)
Number of duplicates in the df: 21
Shape of df after manipulation: (2057, 2)


Unnamed: 0,title,description
0,[USB] Running SSP Traffic on mSLE causes Bandw...,When executing the Get Port Bandwidth command ...
1,SPT-H E0: USB2 port missing,"<pre style=""word-wrap: break-word; white-space..."
2,[Apple - Basin Falls/KBP-H] - xHCI Blocking S3...,"<pre style=""word-wrap: break-word; white-space..."
3,[CNP_B0]:USB3 Loopback - xhci_debug_device tes...,"<p style=""font-size: 12.18px;""><span style=""fo..."
4,[ICL_A0_PO] Python SV: pci_config_registers ac...,<p>Trying Recipe sent by Tamir</p><p>pci_confi...
...,...,...
2073,[ADP-S][FPGA][RTL1p0][USB2] HS link drop after...,"<div style=""direction:ltr"">\n\n<table border=""..."
2074,[USB3.2][TGP-H][HAPS-80][E2E][ERTL] Gen2x2 u3 ...,<p>FPGA Image:&nbsp; 4_fpga_usbxcoe_top_tgph1p...
2075,[TGPH] [FPGA] [HAPS80-2] [USB Compliance] [Gen...,<p>FPGA Image:</p><p><br /></p><p>This test is...
2076,[SVOS] [BUSTER] [TGP-H] device enumeration failed,<p>SVOS Kernel (BUSTER) :&nbsp;4.19.60<br /></...


In [103]:
# data preprocessing
df["title_cont"] = [word_contractions(text) for text in df["title"]]
df["title_lower"] = [lowercase(text) for text in df["title_cont"]]
df["title_tag"] = [remove_htmltag_url(text) for text in df["title_lower"]]
df["title_rem"] = [remove_irrchar_punc(text,char=None) for text in df["title_tag"]]
df["title_num"] = [remove_num(text) for text in df["title_rem"]]
df["title_white"] = [remove_multwhitespace(text) for text in df["title_num"]]
df["title_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df["title_white"]]
n=10

df["title_freq"] = remove_freqwords(df["title_stop"],n)
df["title_rare"] = remove_rarewords(df["title_freq"],n)

df["title_lemma"] = lemmatize_words(column= df["title_rare"],lemma_type=None)

df["title_clean"] = df["title_lemma"]

df["desc_cont"] = [word_contractions(text) for text in df["description"]]
df["desc_lower"] = [lowercase(text) for text in df["desc_cont"]]
df["desc_tag"] = [remove_htmltag_url(text) for text in df["desc_lower"]]
df["desc_rem"] = [remove_irrchar_punc(text,char=None) for text in df["desc_tag"]]
df["desc_num"] = [remove_num(text) for text in df["desc_rem"]]
df["desc_white"] = [remove_multwhitespace(text) for text in df["desc_num"]]
df["desc_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df["desc_white"]]
n=10
df["desc_freq"] = remove_freqwords(df["desc_stop"],n)
df["desc_rare"] = remove_rarewords(df["desc_freq"],n)
df["desc_lemma"] = lemmatize_words(column= df["desc_rare"],lemma_type=None)
df["desc_clean"] = df["desc_lemma"]

df.head()


Frequent words that are removed from column: {('test', 458), ('usb', 1392), ('device', 323), ('u', 429), ('fpga', 418), ('xhci', 312), ('icl', 289), ('gen', 490), ('lkf', 386), ('x', 378)}
Rare words that are removed from column: {('strm', 1), ('lof', 1), ('pri', 1), ('modify', 1), ('simultaneous', 1), ('rxpolarity', 1), ('buster', 1), ('writes', 1), ('locations', 1), ('initiation', 1)}
Frequent words that are removed from column: {('p', 7819), ('n', 6301), ('z', 6278), ('f', 7330), ('b', 7458), ('h', 6292), ('c', 6797), ('x', 12455), ('u', 6840), ('v', 6953)}
Rare words that are removed from column: {('involved', 1), ('resolving', 1), ('reimaged', 1), ('tnolfpsresponsetimeout', 1), ('elapses', 1), ('responsive', 1), ('environmental', 1), ('sbrt', 1), ('conform', 1), ('narrowing', 1)}


Unnamed: 0,title,description,title_cont,title_lower,title_tag,title_rem,title_num,title_white,title_stop,title_freq,...,desc_lower,desc_tag,desc_rem,desc_num,desc_white,desc_stop,desc_freq,desc_rare,desc_lemma,desc_clean
0,[USB] Running SSP Traffic on mSLE causes Bandw...,When executing the Get Port Bandwidth command ...,[USB] Running SSP Traffic on mSLE causes Bandw...,[usb] running ssp traffic on msle causes bandw...,[usb] running ssp traffic on msle causes bandw...,usb running ssp traffic on msle causes bandw...,usb running ssp traffic on msle causes bandw...,usb running ssp traffic on msle causes bandwi...,usb running ssp traffic msle causes bandwid...,running ssp traffic msle causes bandwidth calc...,...,when executing the get port bandwidth command ...,when executing the get port bandwidth command ...,when executing the get port bandwidth command ...,when executing the get port bandwidth command ...,when executing the get port bandwidth command ...,executing get port bandwidth command msl...,executing get port bandwidth command msle ssp ...,executing get port bandwidth command msle ssp ...,executing get port bandwidth command msle ssp ...,executing get port bandwidth command msle ssp ...
1,SPT-H E0: USB2 port missing,"<pre style=""word-wrap: break-word; white-space...",SPT-H E0: USB2 port missing,spt-h e0: usb2 port missing,spt-h e0: usb2 port missing,spt h e0 usb2 port missing,spt h e usb port missing,spt h e usb port missing,spt h e usb port missing,spt h e port missing,...,"<pre style=""word-wrap: break-word; white-space...","currently we are seeing 50dpm on this issue, w...",currently we are seeing 50dpm on this issue w...,currently we are seeing dpm on this issue we...,currently we are seeing dpm on this issue we h...,currently seeing dpm issue unit ...,currently seeing dpm issue unit usb port usb p...,currently seeing dpm issue unit usb port usb p...,currently seeing dpm issue unit usb port usb p...,currently seeing dpm issue unit usb port usb p...
2,[Apple - Basin Falls/KBP-H] - xHCI Blocking S3...,"<pre style=""word-wrap: break-word; white-space...",[Apple - Basin Falls/KBP-H] - xHCI Blocking S3...,[apple - basin falls/kbp-h] - xhci blocking s3...,[apple - basin falls/kbp-h] - xhci blocking s3...,apple basin falls kbp h xhci blocking s3...,apple basin falls kbp h xhci blocking s ...,apple basin falls kbp h xhci blocking s resum...,apple basin falls kbp h xhci blocking resum...,apple basin falls kbp h blocking resume warm r...,...,"<pre style=""word-wrap: break-word; white-space...",basin falls uses a server derived cpu with a k...,basin falls uses a server derived cpu with a k...,basin falls uses a server derived cpu with a k...,basin falls uses a server derived cpu with a k...,basin falls uses server derived cpu kbp ...,basin falls uses server derived cpu kbp pch de...,basin falls uses server derived cpu kbp pch de...,basin fall us server derived cpu kbp pch desig...,basin fall us server derived cpu kbp pch desig...
3,[CNP_B0]:USB3 Loopback - xhci_debug_device tes...,"<p style=""font-size: 12.18px;""><span style=""fo...",[CNP_B0]:USB3 Loopback - xhci_debug_device tes...,[cnp_b0]:usb3 loopback - xhci_debug_device tes...,[cnp_b0]:usb3 loopback - xhci_debug_device tes...,cnp b0 usb3 loopback xhci debug device tes...,cnp b usb loopback xhci debug device tes...,cnp b usb loopback xhci debug device test fai...,cnp b usb loopback xhci debug device test fai...,cnp b loopback debug fails missed event success,...,"<p style=""font-size: 12.18px;""><span style=""fo...",xhci_debug_device test failed on cnl b0 platfo...,xhci debug device test failed on cnl b0 platfo...,xhci debug device test failed on cnl b platfo...,xhci debug device test failed on cnl b platfor...,xhci debug device test failed cnl b platform...,xhci debug device test failed cnl platform fai...,xhci debug device test failed cnl platform fai...,xhci debug device test failed cnl platform fai...,xhci debug device test failed cnl platform fai...
4,[ICL_A0_PO] Python SV: pci_config_registers ac...,<p>Trying Recipe sent by Tamir</p><p>pci_confi...,[ICL_A0_PO] Python SV: pci_config_registers ac...,[icl_a0_po] python sv: pci_config_registers ac...,[icl_a0_po] python sv: pci_config_registers ac...,icl a0 po python sv pci config registers ac...,icl a po python sv pci config registers ac...,icl a po python sv pci config registers acces...,icl po python sv pci config registers acces...,po python sv pci config registers access failed,...,<p>trying recipe sent by tamir</p><p>pci_confi...,trying recipe sent by tamir pci_config_registe...,trying recipe sent by tamir pci config registe...,trying recipe sent by tamir pci config registe...,trying recipe sent by tamir pci config registe...,trying recipe sent tamir pci config register...,trying recipe sent tamir pci config registers ...,trying recipe sent tamir pci config registers ...,trying recipe sent tamir pci config register a...,trying recipe sent tamir pci config register a...


In [104]:
df = df[["title_clean","desc_clean"]]
df

Unnamed: 0,title_clean,desc_clean
0,running ssp traffic msle cause bandwidth calcu...,executing get port bandwidth command msle ssp ...
1,spt h e port missing,currently seeing dpm issue unit usb port usb p...
2,apple basin fall kbp h blocking resume warm re...,basin fall us server derived cpu kbp pch desig...
3,cnp b loopback debug fails missed event success,xhci debug device test failed cnl platform fai...
4,po python sv pci config register access failed,trying recipe sent tamir pci config register a...
...,...,...
2073,adp rtl p h link drop l state port reset l sus...,hardware asus prime hap snp daughter card sier...
2074,tgp h hap e e ertl failing seabright link drop,fpga image fpga usbxcoe top tgph visa ww e e u...
2075,tgph hap compliance td low power downstream port,fpga image test intermittently passing failing...
2076,svos tgp h enumeration failed,svos kernel buster svfs version svfs module sv...


### Cosine similarity

In [118]:
from sklearn.metrics.pairwise import cosine_similarity

def cosinesimilarity(column,threshold=None,total_rows = None,base_row=None,ngram_range=None,fe_type=None,ascending=None):
    """
    Compute the cosine similarity between rows of texts. User can 
    a) fix number of rows for comparison, each row will be taken as base and compared with the rest
    b) fix one row as base, comparison will be done with all the other rows
    
    params:
    
    column[series/DataFrame]: column(s) of text for row wise similarity comparison
                        - series: only one column is selected (e.g. df["title_clean"])
                        - DataFrame: more than one column is selected(e.g. df[["title_clean","desc_clean"]])  
    threshold[None/float]: cut off value for the cosine similarity, only texts with values above or equal to threshold
                           will be printed
                        - None: Default threhold is 0.5
                        - float: any value between 0 and 1 
    total_rows[None/int]: Number of rows for comparison, choose None for option b 
    base_row[None/int]: Row fixed as base, choose None for option a 
    ngram_range [tuple(min_n, max_n)]: The lower and upper boundary of the range of n-values for different n-grams to be extracted
                                       - ngram_range of (1, 1) means only unigrams, 
                                       - ngram_range of (1, 2) means unigrams and bigrams, 
                                       - ngram_range of (2, 2) means only bigram
    fe_type[None/string]: Feature extraction type: Choose "bagofwords" or None for tfidf
    ascending [True/False/None]: - [default] None (words arranged in alphabetical order)
                                 - True(words arranged in ascending order of sum), 
                                 - False(words arranged in descending order of sum)  
    
    """     
    if type(column) == pd.DataFrame: #concat the columns into one string if there is more than one column 
        column = column.apply(lambda row: ' '.join(row.values.astype(str)), axis=1) 
                
    #feature extraction              
    X = feature_extraction(column=column,ngram_range=ngram_range,ascending=None,fe_type=fe_type)[0]
    X = X.drop(["sum"],axis = 0)
    
    #Get cosine similarity matrix
    similarity_matrix = pd.DataFrame(cosine_similarity(X))
    
    #threshold
    if threshold == None:
        threshold = 0.5
       
    if total_rows !=None: #fix number of rows for comparison, each row will be taken as base and compared with the rest
        for base in range(total_rows): 
            print ("")
            print ("Using index " + str(base) + " as base:") #fix one index as base
            
            #Create empty df
            column_names = ["Index", "Similarity Score", "Text"]
            results = pd.DataFrame(columns = column_names)
            
            for i in range(total_rows): #compare base with other index
                
                if similarity_matrix.iloc[base,i] >= threshold: #print if comparison shows that silarity metric is more than threshold
                    new_row = {'Index':i, 'Similarity Score':round(similarity_matrix.iloc[base,i],4), 'Text':column.iloc[i]}
                    #append row to the dataframe
                    results = results.append(new_row, ignore_index=True)
                    if ascending != None:            
                        results = results.sort_values(by ='Similarity Score', axis = 0,ascending=ascending)
                        
            display(results)
#             print(results['Similarity Score'].mean())
           

    if base_row !=None: #fix base_row index for comparison with all indexes
        print ("Using index " + str(base_row) + " as base:") #fix one index as base
        
        #Create empty df
        column_names = ["Index", "Similarity Score", "Text"]
        results = pd.DataFrame(columns = column_names)
        
        for i in range(len(column)): #compare base_row with other index
            if similarity_matrix.iloc[base_row,i] >= threshold: #print if comparison shows that silarity metric is more than threshold
                new_row = {'Index':i, 'Similarity Score':round(similarity_matrix.iloc[base_row,i],4), 'Text':column.iloc[i]}
                #append row to the dataframe
                results = results.append(new_row, ignore_index=True)
                if ascending != None:            
                    results = results.sort_values(by ='Similarity Score', axis = 0,ascending=ascending)  
                    
        display(results) 
#         print(results['Similarity Score'].mean())

In [114]:
cosinesimilarity(column = df["title_clean"],threshold=None,total_rows=10,base_row=None,ngram_range=None,fe_type=None,ascending=None)


Using index 0 as base:


Unnamed: 0,Index,Similarity Score,Text
0,0,1.0,running ssp traffic msle cause bandwidth calcu...



Using index 1 as base:


Unnamed: 0,Index,Similarity Score,Text
0,1,1.0,spt h e port missing



Using index 2 as base:


Unnamed: 0,Index,Similarity Score,Text
0,2,1.0,apple basin fall kbp h blocking resume warm re...



Using index 3 as base:


Unnamed: 0,Index,Similarity Score,Text
0,3,1.0,cnp b loopback debug fails missed event success



Using index 4 as base:


Unnamed: 0,Index,Similarity Score,Text
0,4,1.0,po python sv pci config register access failed
1,6,0.5397,po python sv able read write mgphy register py...



Using index 5 as base:


Unnamed: 0,Index,Similarity Score,Text
0,5,1.0,po able get train tc



Using index 6 as base:


Unnamed: 0,Index,Similarity Score,Text
0,4,0.5397,po python sv pci config register access failed
1,6,1.0,po python sv able read write mgphy register py...



Using index 7 as base:


Unnamed: 0,Index,Similarity Score,Text
0,7,1.0,msle lgood error flag xtor exit



Using index 8 as base:


Unnamed: 0,Index,Similarity Score,Text
0,8,1.0,setting ped put port disabled port disabled pr...



Using index 9 as base:


Unnamed: 0,Index,Similarity Score,Text
0,9,1.0,hardware lpm capability register value different


In [115]:
cosinesimilarity(column = df["title_clean"],threshold= 0.5,total_rows=None,base_row=4,ngram_range=None,fe_type=None,ascending=None)

Using index 4 as base:


Unnamed: 0,Index,Similarity Score,Text
0,4,1.0,po python sv pci config register access failed
1,6,0.5397,po python sv able read write mgphy register py...
2,1345,0.5817,po python sv new hierarchy register access cpu


In [119]:
cosinesimilarity(column = df[["title_clean","desc_clean"]],threshold=None,total_rows=10,base_row=None,ngram_range=None,fe_type=None,ascending=None)


Using index 0 as base:


Unnamed: 0,Index,Similarity Score,Text
0,0,1.0,running ssp traffic msle cause bandwidth calcu...



Using index 1 as base:


Unnamed: 0,Index,Similarity Score,Text
0,1,1.0,spt h e port missing currently seeing dpm issu...



Using index 2 as base:


Unnamed: 0,Index,Similarity Score,Text
0,2,1.0,apple basin fall kbp h blocking resume warm re...



Using index 3 as base:


Unnamed: 0,Index,Similarity Score,Text
0,3,1.0,cnp b loopback debug fails missed event succes...



Using index 4 as base:


Unnamed: 0,Index,Similarity Score,Text
0,4,1.0,po python sv pci config register access failed...



Using index 5 as base:


Unnamed: 0,Index,Similarity Score,Text
0,5,1.0,po able get train tc tc retimer programmed sti...



Using index 6 as base:


Unnamed: 0,Index,Similarity Score,Text
0,6,1.0,po python sv able read write mgphy register py...



Using index 7 as base:


Unnamed: 0,Index,Similarity Score,Text
0,7,1.0,msle lgood error flag xtor exit running usb ge...



Using index 8 as base:


Unnamed: 0,Index,Similarity Score,Text
0,8,1.0,setting ped put port disabled port disabled pr...



Using index 9 as base:


Unnamed: 0,Index,Similarity Score,Text
0,9,1.0,hardware lpm capability register value differe...


In [121]:
cosinesimilarity(column = df[["title_clean","desc_clean"]],threshold=0,total_rows=None,base_row=4,ngram_range=None,fe_type=None,ascending=None)

Using index 4 as base:


Unnamed: 0,Index,Similarity Score,Text
0,0,0.0281,running ssp traffic msle cause bandwidth calcu...
1,1,0.0197,spt h e port missing currently seeing dpm issu...
2,2,0.0065,apple basin fall kbp h blocking resume warm re...
3,3,0.0342,cnp b loopback debug fails missed event succes...
4,4,1.0000,po python sv pci config register access failed...
...,...,...,...
2052,2052,0.0023,adp rtl p h link drop l state port reset l sus...
2053,2053,0.0000,tgp h hap e e ertl failing seabright link drop...
2054,2054,0.0050,tgph hap compliance td low power downstream po...
2055,2055,0.0097,svos tgp h enumeration failed svos kernel bust...


### Jaccard similarity

In [133]:
def jaccard_similarity(column,threshold=None,total_rows = None,base_row=None,ascending=None):
    """
    Compute the jaccard similarity between texts. User can 
    a) fix number of rows for comparison, each row will be taken as base and compared with the rest
    b) fix one row as base, comparison will be done with all the other rows
    
    params:
    column[series/DataFrame]: column(s) of text for row wise similarity comparison
                        - series: only one column is selected (e.g. df["title_clean"])
                        - DataFrame: more than one column is selected(e.g. df[["title_clean","desc_clean"]]) 
    threshold[None/float]: cut off value for the jaccard similarity, only texts with values above or equal to threshold
                           will be printed
                        - None: Default threhold is 0.5
                        - float: any value between 0 and 1 
    total_rows[None/int]: Number of rows for comparison, choose None for option b 
    base_row[None/int]: Row fixed as base, choose None for option a 
    ascending [True/False/None]: - [default] None (words arranged in alphabetical order)
                                 - True(words arranged in ascending order of sum), 
                                 - False(words arranged in descending order of sum)  
    
    """     
            
    #jaccard score computation
    def get_jaccard_sim(str1, str2):        
        a = set(str1.split()) 
        b = set(str2.split())
        c = a.intersection(b)
        return float(len(c)) / (len(a) + len(b) - len(c))
    
    if type(column) == pd.DataFrame: #concat the columns into one string if there is more than one column 
        column = column.apply(lambda row: ' '.join(row.values.astype(str)), axis=1) 
       
    #threshold
    if threshold == None:
        threshold = 0.5
        
    if total_rows !=None: #fix number of rows for comparison, each row will be taken as base and compared with the rest
        for base in range(total_rows): 
            print ("")
            print ("Using index " + str(base) + " as base:") #fix one index as base
            
            #Create empty df
            column_names = ["Index", "Similarity Score", "Text"]
            results = pd.DataFrame(columns = column_names)                   
            
            for i in range(total_rows): #compare base with other index
                jac_score =  round(get_jaccard_sim(column.iloc[base],column.iloc[i]),4)
                if jac_score > threshold: #print if comparison shows that silarity metric is more than threshold
                    new_row = {'Index':i, 'Similarity Score':jac_score, 'Text':column.iloc[i]}
                    #append row to the dataframe
                    results = results.append(new_row, ignore_index=True)
                if ascending != None:            
                    results = results.sort_values(by ='Similarity Score', axis = 0,ascending=ascending)  
                    
            display(results) 
        
    if base_row != None: #fix base_row index for comparison with all indexes
       
        print ("Using index " + str(base_row) + " as base row:") #fix one index as base_row
        #Create empty df
        column_names = ["Index", "Similarity Score", "Text"]
        results = pd.DataFrame(columns = column_names)                   
            
        for i in range(len(column)): #compare base_row with other index
            jac_score = round(get_jaccard_sim(column.iloc[base_row],column.iloc[i]),4)
            if jac_score >= threshold: #print if comparison shows that silarity metric is more than threshold
                new_row = {'Index':i, 'Similarity Score':jac_score, 'Text':column.iloc[i]}
                #append row to the dataframe
                results = results.append(new_row, ignore_index=True)
            if ascending != None:            
                results = results.sort_values(by ='Similarity Score', axis = 0,ascending=ascending)  

        display(results) 

In [129]:
jaccard_similarity(column= df["title_clean"],threshold=0.5,total_rows = 10,base_row=None,ascending=False)


Using index 0 as base:


Unnamed: 0,Index,Similarity Score,Text
0,0,1.0,running ssp traffic msle cause bandwidth calcu...



Using index 1 as base:


Unnamed: 0,Index,Similarity Score,Text
0,1,1.0,spt h e port missing



Using index 2 as base:


Unnamed: 0,Index,Similarity Score,Text
0,2,1.0,apple basin fall kbp h blocking resume warm re...



Using index 3 as base:


Unnamed: 0,Index,Similarity Score,Text
0,3,1.0,cnp b loopback debug fails missed event success



Using index 4 as base:


Unnamed: 0,Index,Similarity Score,Text
0,4,1.0,po python sv pci config register access failed



Using index 5 as base:


Unnamed: 0,Index,Similarity Score,Text
0,5,1.0,po able get train tc



Using index 6 as base:


Unnamed: 0,Index,Similarity Score,Text
0,6,1.0,po python sv able read write mgphy register py...



Using index 7 as base:


Unnamed: 0,Index,Similarity Score,Text
0,7,1.0,msle lgood error flag xtor exit



Using index 8 as base:


Unnamed: 0,Index,Similarity Score,Text
0,8,1.0,setting ped put port disabled port disabled pr...



Using index 9 as base:


Unnamed: 0,Index,Similarity Score,Text
0,9,1.0,hardware lpm capability register value different


In [130]:
jaccard_similarity(column= df["title_clean"],threshold=0.5,total_rows = None,base_row=4,ascending=None)

Using index 4 as base row:


Unnamed: 0,Index,Similarity Score,Text
0,4,1.0,po python sv pci config register access failed


In [135]:
jaccard_similarity(column= df[["title_clean","desc_clean"]],threshold=None,total_rows = 10,base_row=None,ascending=False)


Using index 0 as base:


Unnamed: 0,Index,Similarity Score,Text
0,0,1.0,running ssp traffic msle cause bandwidth calcu...



Using index 1 as base:


Unnamed: 0,Index,Similarity Score,Text
0,1,1.0,spt h e port missing currently seeing dpm issu...



Using index 2 as base:


Unnamed: 0,Index,Similarity Score,Text
0,2,1.0,apple basin fall kbp h blocking resume warm re...



Using index 3 as base:


Unnamed: 0,Index,Similarity Score,Text
0,3,1.0,cnp b loopback debug fails missed event succes...



Using index 4 as base:


Unnamed: 0,Index,Similarity Score,Text
0,4,1.0,po python sv pci config register access failed...



Using index 5 as base:


Unnamed: 0,Index,Similarity Score,Text
0,5,1.0,po able get train tc tc retimer programmed sti...



Using index 6 as base:


Unnamed: 0,Index,Similarity Score,Text
0,6,1.0,po python sv able read write mgphy register py...



Using index 7 as base:


Unnamed: 0,Index,Similarity Score,Text
0,7,1.0,msle lgood error flag xtor exit running usb ge...



Using index 8 as base:


Unnamed: 0,Index,Similarity Score,Text
0,8,1.0,setting ped put port disabled port disabled pr...



Using index 9 as base:


Unnamed: 0,Index,Similarity Score,Text
0,9,1.0,hardware lpm capability register value differe...


In [136]:
jaccard_similarity(column= df[["title_clean","desc_clean"]],threshold=None,total_rows = None,base_row=4,ascending=False)

Using index 4 as base row:


Unnamed: 0,Index,Similarity Score,Text
0,4,1.0,po python sv pci config register access failed...


In [None]:
# #user provides number of component and top n terms in each cluster/topic
# #feature extraction
# column = df3["title_lemma_word"]
# ngram_range = (1,1)
# ascending = False
# fe_type = "bagofwords"
# vec_type = feature_extraction(column,ngram_range,ascending,fe_type)[1]
# vectorized = feature_extraction(column,ngram_range,ascending,fe_type)[2]

# #NMF
# df3["topic"] = nmf(vectorized,vec_type,n_components=17,top_n_terms=10)
# df3

In [None]:
#convert and save train/validation data as .spacy
# out_path = "C:/Users/nchong/"
# db_train = convert_spacy(TRAIN_DATA)
# db_train.to_disk(out_path +'train.spacy') # save the docbin object
# db_val = convert_spacy(VAL_DATA)
# db_val.to_disk(out_path +'val.spacy') # save the docbin object

In [None]:
# !python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
# !python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./val.spacy

In [None]:
#load best model
# nlp1 = spacy.load("C:/Users/nchong/output/model-best/") #load the best model

In [None]:
# doc = nlp1("waikitcx hi arisha please provide us the") # input sample text

# spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [None]:
# def show_ents(text):
#     doc= nlp1(text)
#     if doc.ents:
#         for ent in doc.ents:
#             return(ent.text+' - '+ent.label_)
#     else:
#         return('No named entities found.')

In [None]:
# def data_loading(path,df=None,date=None):
#     '''
#     Load only files that follow agreed filename format, merge files as single dataframe.
#     Can support incremental aggregation of dataset, by setting arg df as the existing dataframe
#     Returns a single dataframe.
    
#     params:
#     path [string]: path of the files, without filename
#     df [dataframe] (optional,default is None): input existing dataframe to merge with new files
#     date ["string"](optional,default is None): user can choose to load only files from specific date in YYYY-MM-DD format
#     '''
#     filenames = os.listdir(path)
#     file_list=[]
#     dfs = []

#     if df is None: #no existing dataframe
        
#         for file in filenames:
#             # search agreed file format pattern in the filename
#             if date == None:
#                 pattern = r"^\(\d{4}-\d{2}-\d{1,2}\)\d+\_\D+\_\d+\.json$"
                
#             else:
# #              
#                 pattern = r"\("+date+r"\)\d+\_\D+\_\d+\.json"
    
#             match = re.search(pattern,file)
#             #if match is found
#             if match:
#                 pattern = os.path.join(path, file) #join path with file name
#                 file_list.append(pattern) #list of json files that follow the agreed filename

#                 for file in file_list:
#                     with open(file) as f:
#                         #flatten json into pd dataframe
#                         json_data = pd.json_normalize(json.loads(f.read()))
#                         #label which file each row is from 
#                         json_data['file'] = file.rsplit("/", 1)[-1]

#                     dfs.append(json_data)
#                 df = pd.concat(dfs)
                
#     else: #existing dataframe exists and want to append new files to existing dataframe
             
#         for file in filenames:

#             if file not in df["file"].unique(): #check if file is new - to support merging of new dataset with previously read ones

#                 # search agreed file format pattern in the filename
                
#                 if date == None:
#                     pattern = r"^\(\d{4}-\d{2}-\d{1,2}\)\d+\_\D+\_\d+\.json$"

#                 else:
#                     pattern = r"\("+date+r"\)\d+\_\D+\_\d+\.json"
                     
#                 match = re.search(pattern,file)

#                 #if match is found
#                 if match:
#                     json_pattern = os.path.join(path, file) #join path with file name
#                     file_list.append(json_pattern) #list of json files 

#                     for file in file_list:
#                         with open(file) as f:
#                             #flatten json into pd dataframe
#                             json_data = pd.json_normalize(json.loads(f.read()))
#                             #label which file each row is from 
#                             json_data['file'] = file.rsplit("/", 1)[-1]

#                         dfs.append(json_data)
#                     new_df = pd.concat(dfs)           
#                     df=pd.concat([df,new_df])
    
#     return df

In [None]:
# import nltk
# from nltk.tokenize import word_tokenize
# from nltk.tokenize import sent_tokenize
# from nltk.tokenize import WhitespaceTokenizer
# from nltk.tokenize import WordPunctTokenizer
# import re
# #remove token method - seperate nltk and split functions 
# def cust_tokenization(column,token_met,token_type,delim =None):
#     """
#     Custom tokenization, 2 options are available: split() or nltk 
#     params:
#     df [dataframe]: input dataframe 
#     token_met["string"]: input tokenization method ("split" or "nltk")
    
#     token_type["string"](use only if token_met= "nltk"): type of nltk tokenization
#     a) token_type = "WordToken" tokenizes a string into a list of words
#     b) token_type = "SentToken" tokenizes a string containing sentences into a list of sentences
#     c) token_type = "WhiteSpaceToken" tokenizes a string on whitespace (space, tab, newline)
#     d) token_type = "WordPunctTokenizer" tokenizes a string on punctuations
         
#     delim["string"](use only if token_met = "split"): specify delimiter to separate strings,
#     default delimiter (delim=None) is whitespace,  an alternate option for token_type = "WhiteSpaceToken"
    
#     """
#     if token_met == "split":
#         if delim==None:
#             print("Text is split by space") #default delimiter is space if not specified 

#         else:
#             print("Text is split by:", delim) #can accept one or more delimiter

#         return column.apply(lambda text: text.split() if delim==None else text.split(delim))
    

#     if token_met == "nltk":
    
#         if token_type == "WordToken":
#             tokenizer = word_tokenize
#         if token_type == "SentToken":
#             tokenizer = sent_tokenize
#         if token_type == "WhiteSpaceToken":
#             tokenizer = WhitespaceTokenizer().tokenize
#         if token_type == "WordPunctTokenizer":
#             tokenizer = WordPunctTokenizer().tokenize

#         return column.apply(lambda text: tokenizer(text))
        
        
    

In [None]:
# from datetime import datetime,timedelta
# def data_loading(path,date_list=None):
#     '''
#     Load only files that follow agreed filename format, merge files as single dataframe.
#     User can choose to load only files from specific date
    
#     params:
#     path [string]: path of the files, without filename
#     date_list ["list"](optional,default is None): user can choose to load only files from specific date in YYYY-MM-DD format
#     '''
    
#     filenames = os.listdir(path)
#     file_list=[]
#     df = pd.DataFrame()
    
#     if date_list == None:
#         for file in filenames:
#             # search agreed file format pattern in the filename

#             pattern = r"^\(\d{4}-\d{2}-\d{1,2}\)\d+\_\D+\_\d+\.json$"

#             match = re.search(pattern,file)
                
#             #if match is found
#             if match:
#                 pattern = os.path.join(path, file) #join path with file name
#                 file_list.append(pattern) #list of json files that follow the agreed filename
            
#         print("Files read:",file_list)                   
#         for file in file_list:
#             with open(file) as f:
#                 #flatten json into pd dataframe
#                 json_data = pd.json_normalize(json.loads(f.read()))
#                 json_data = pd.DataFrame(json_data)
#                 #label which file each row is from 
#                 json_data['file'] = file.rsplit("/", 1)[-1]

#             df = df.append(json_data)              
                
#     else:
#         for file in filenames: 
            
#             # search agreed file format pattern in the filename
#             for date in date_list: 
#                 pattern = r"\("+date+r"\)\d+\_\D+\_\d+\.json"
        
#                 match = re.search(pattern,file)
                
#                 #if match is found
#                 if match:
#                     pattern = os.path.join(path, file) #join path with file name
#                     file_list.append(pattern) #list of json files that follow the agreed filename

#         print("Files read:",file_list)     
#         for file in file_list:
#             with open(file) as f:
#                 #flatten json into pd dataframe
#                 json_data = pd.json_normalize(json.loads(f.read()))
#                 json_data = pd.DataFrame(json_data)
#                 #label which file each row is from 
#                 json_data['file'] = file.rsplit("/", 1)[-1]

#             df = df.append(json_data)

#     return df

In [None]:
# #fix number of rows for comparison
# total_rows = 10 #total rows to consider for comparison
# threshold = 0.1 #similarity metric threshold
# column = df[["title_clean"]]

# for base in range(total_rows): 
#     print ("")
#     print ("Using index " + str(base) + " as base:") #fix one index as base
#     print(f"{'Index' : <10}{'Similarity Score' : <20}{'Title' : <500}")

#     for i in range(total_rows): #compare base with other index
#         jac_score =  round(get_jaccard_sim(column.iloc[base].values[0],column.iloc[i].values[0]),4)
#         if jac_score > threshold: #print if comparison shows that silarity metric is more than threshold
#             print(f"{i : <10}{jac_score : <20}{column.iloc[i].values[0] : <500}")

In [None]:
# #fix base_row index for comparison with all indexes
# base_row=4
# threshold = 0
# column = df[["title_clean"]]

# print ("Using index " + str(base_row) + " as base row:") #fix one index as base_row
# print(f"{'Index' : <10}{'Similarity Score' : <20}{'Title' : <500}")

# for i in range(len(column)): #compare base_row with other index
#     jac_score = round(get_jaccard_sim(column.iloc[base_row].values[0],column.iloc[i].values[0]),4)
#     if jac_score >= threshold: #print if comparison shows that silarity metric is more than threshold
#         print(f"{i : <10}{jac_score : <20}{column.iloc[i].values[0] : <500}")

In [None]:
# #feature extraction
# X = feature_extraction(column = df["title_clean"],ngram_range=(1,1),ascending=None,fe_type="tfidf")[0]
# X = X.drop(["sum"],axis = 0)
# X
#Cosine similarity
# from sklearn.metrics.pairwise import cosine_similarity
# similarity_matrix = pd.DataFrame(cosine_similarity(X))
# similarity_matrix
#user give total rows to compare
# total_rows = 10 #total rows to consider for comparison
# threshold = 0.2 #similarity metric threshold
# column = df[["title_clean"]]

# for base in range(total_rows): 
#     print ("")
#     print ("Using index " + str(base) + " as base:") #fix one index as base
#     print(f"{'Index' : <10}{'Similarity Score' : <20}{'Title' : <500}")
#     for i in range(total_rows): #compare base with other index
#         if similarity_matrix.iloc[base,i] >= threshold: #print if comparison shows that silarity metric is more than threshold
#             print(f"{i : <10}{round(similarity_matrix.iloc[base,i],4) : <20}{column.iloc[i].values[0] : <500}")
#user give base to compare
# base_row = 4 #base for comparison
# threshold = 0.2 #similarity metric threshold
# column = df[["title_clean"]]

# print ("Using index " + str(base_row) + " as base:") #fix one index as base
# print(f"{'Index' : <10}{'Similarity Score' : <20}{'Title' : <500}")
# for i in range(len(column)): #compare base_row with other index
#     if similarity_matrix.iloc[base_row,i] >= threshold: #print if comparison shows that silarity metric is more than threshold
#         print(f"{i : <10}{round(similarity_matrix.iloc[base_row,i],4) : <20}{column.iloc[i].values[0] : <500}")


In [None]:
# from sklearn.metrics.pairwise import cosine_similarity

# def cosinesimilarity(column,threshold,total_rows = None,base_row=None,ngram_range=None,fe_type=None):
    
#     #feature extraction
#     if ngram_range == None:
#         ngram_range = (1,1)
#     if fe_type == None:
#         fe_type ="tfidf"
    
       
#     X = feature_extraction(column=column,ngram_range=ngram_range,ascending=None,fe_type=fe_type)[0]
#     X = X.drop(["sum"],axis = 0)
    
#     #Get cosine similarity matrix
#     similarity_matrix = pd.DataFrame(cosine_similarity(X))
    
#     if total_rows !=None:
#         for base in range(total_rows): 
#             print ("")
#             print ("Using index " + str(base) + " as base:") #fix one index as base
#             print(f"{'Index' : <10}{'Similarity Score' : <20}{'Text' : <500}")
#             for i in range(total_rows): #compare base with other index
#                 if similarity_matrix.iloc[base,i] >= threshold: #print if comparison shows that silarity metric is more than threshold
#                     print(f"{i : <10}{round(similarity_matrix.iloc[base,i],4) : <20}{column.iloc[i] : <500}")
    
#     if base_row !=None:
#         print ("Using index " + str(base_row) + " as base:") #fix one index as base
#         print(f"{'Index' : <10}{'Similarity Score' : <20}{'Text' : <500}")
#         for i in range(len(column)): #compare base_row with other index
#             if similarity_matrix.iloc[base_row,i] >= threshold: #print if comparison shows that silarity metric is more than threshold
#                 print(f"{i : <10}{round(similarity_matrix.iloc[base_row,i],4) : <20}{column.iloc[i] : <500}")
                

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer

# def feature_extraction(column,ngram_range,ascending,fe_type):
#     """
#     Feature extraction methods - Bag of words or TF-IDF
    
#     params:
#     column [series]: column to select
#     ngram_range [tuple(min_n, max_n)]: The lower and upper boundary of the range of n-values for different n-grams to be extracted
#                                        - [default] ngram_range of (1, 1) means only unigrams, 
#                                        - ngram_range of (1, 2) means unigrams and bigrams, 
#                                        - ngram_range of (2, 2) means only bigram
#     ascending [True/False/None]: - None (words arranged in alphabetical order)
#                                  - True(words arranged in ascending order of sum), 
#                                  - False(words arranged in descending order of sum)                               
#     fe_type[string]: Feature extraction type: Choose "bagofwords" or "tfidf" method
#     """
#     if ngram_range == None:
#         ngram_range=(1,1)
    
#     if fe_type == "bagofwords":
#         vec_type = CountVectorizer(ngram_range=ngram_range, analyzer='word')
#         vectorized = vec_type.fit_transform(column)
#         df = pd.DataFrame(vectorized.toarray(), columns=vec_type.get_feature_names())
#         df.loc['sum'] = df.sum(axis=0).astype(int)

#     if fe_type == "tfidf":
#         vec_type = TfidfVectorizer(ngram_range=ngram_range, analyzer='word')
#         vectorized = vec_type.fit_transform(column)
#         df = pd.DataFrame(vectorized.toarray(), columns=vec_type.get_feature_names())
#         df.loc['sum'] = df.sum(axis=0)
    
#     if ascending != None:
            
#         df = df.sort_values(by ='sum', axis = 1,ascending=ascending)
    
    
#     return df,vec_type,vectorized

In [None]:
# cols = ['title_clean', 'desc_clean']
# df['combined'] = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
# df

In [None]:
# from configparser import ConfigParser

# # instantiate
# config = ConfigParser()

# # parse ini file
# ini_path = "C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/"
# config.read(ini_path+'default.ini')

# # read values 
# #from data loading section
# path = config.get('dataloading', 'path')

# #from data preprocessing section


# # from ML module section
# #Unsupervised
# #Supervised
# #Similarity metrics
