### Data Loading

In [1]:
#user input file path
path = 'C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/team1/'

In [None]:
import os
os.listdir(path)

In [2]:
def data_loading(path,start_date=None,stop_date=None):
    '''
    Load every JSON file in `path` whose name follows the agreed filename format and
    merge them into a single dataframe.

    Agreed filename format: (YYYY-MM-DD)<digits>_<non-digits>_<digits>.json
    e.g. (2021-08-25)1_firstSet_1.json

    User can choose to
    a) Load all json files following the agreed filename format (no dates given)
    b) Load only json files whose filename date lies between start_date and stop_date
       (both inclusive). Either bound may be given on its own.

    params:
    path [string]: path of the files, without filename

    start_date[None/string in YYYY-MM-DD format](optional,default is None):
    - None: no lower bound on the file date
    - string in YYYY-MM-DD format: only files dated on or after start_date are loaded

    stop_date[None/string in YYYY-MM-DD format](optional,default is None):
    - None: no upper bound on the file date
    - string in YYYY-MM-DD format: only files dated on or before stop_date are loaded

    returns:
    dataframe holding the flattened records of every selected file plus a "file" column
    naming the source file. Row labels are not renumbered across files. An empty
    dataframe is returned when no file qualifies.

    raises:
    ValueError: start_date/stop_date is not a valid YYYY-MM-DD string
    '''
    from datetime import datetime
    import pandas as pd
    import os, json
    import re

    # One anchored pattern serves both modes; group 1 captures the date inside the brackets.
    name_pattern = re.compile(r"^\((\d{4}-\d{2}-\d{1,2})\)\d+_\D+_\d+\.json$")

    start = datetime.strptime(start_date, "%Y-%m-%d").date() if start_date is not None else None
    stop = datetime.strptime(stop_date, "%Y-%m-%d").date() if stop_date is not None else None

    file_list = []
    for file in os.listdir(path):
        match = name_pattern.match(file)
        if match is None:
            continue

        if start is not None or stop is not None:
            try:
                file_date = datetime.strptime(match.group(1), "%Y-%m-%d").date()
            except ValueError:
                # digits that do not form a real calendar date can never fall inside a range
                continue
            if start is not None and file_date < start:
                continue
            if stop is not None and file_date > stop:
                continue

        file_list.append(os.path.join(path, file)) #join path with file name

    print("Files read:",file_list)

    frames = []
    for file in file_list:
        with open(file) as f:
            #flatten json into pd dataframe
            json_data = pd.json_normalize(json.load(f))
        #label which file each row is from (basename copes with either path separator)
        json_data['file'] = os.path.basename(file)
        frames.append(json_data)

    # pd.concat raises on an empty list, so keep the "nothing matched" result explicit
    if not frames:
        return pd.DataFrame()

    # single concat replaces the removed DataFrame.append and avoids re-copying per file
    return pd.concat(frames)

In [None]:
df = data_loading(path,start_date = "2021-08-25",stop_date = "2021-08-25")
df

In [3]:
df = data_loading(path,start_date = None,stop_date = None)
df

Files read: ['C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/team1/(2021-08-25)1_firstSet_1.json', 'C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/team1/(2021-08-25)3_secondSet_1.json', 'C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/team1/(2021-10-11)3_secondSet_1.json']


Unnamed: 0,id,title,description,comments,updated_date,hierarchy_id,rev,tenant,subject,is_current,hierarchy_path,parent_id,record_type,row_num,file
0,1308651592,provide method to update GIO fields from git r...,Please provide a way to update GIO fields from...,"++++1562123662 fbakhda\nHi @Panceac, Cornel Eu...",2021-07-21 12:30:31.387,,8,iot_platf,support,1,/1201559436/1208431055/1308651592/,1208431055,parent,1,(2021-08-25)1_firstSet_1.json
1,1308671310,Test suite execution terminates before executi...,<p>Test suite execution finished before execut...,++++1361513318 cmoala\nsys_tsdval@GL-IAF1-V-S0...,2021-05-04 09:30:00.320,,11,iot_platf,support,1,/1201559436/1208431055/1308671310/,1208431055,parent,2,(2021-08-25)1_firstSet_1.json
2,1308673361,Cloning defects from another test cycle is not...,<p>I am trying to clone defects from another t...,++++1361514315 cmoala\nObserved that only impl...,2021-05-20 11:47:18.927,,9,iot_platf,support,1,/1201559436/1208431055/1308673361/,1208431055,parent,3,(2021-08-25)1_firstSet_1.json
3,1507656633,[Testing Only] this is enhancement only,Retest some function again.,,2020-03-13 10:16:18.703,,31,iot_platf,support,1,/1201559436/1208431055/1507656633/,1208431055,parent,4,(2021-08-25)1_firstSet_1.json
4,1507656638,[Testing Only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput\nHSDES testing. Please ...,2020-06-01 09:49:55.913,,19,iot_platf,support,1,/1201559436/1208431055/1507656638/,1208431055,parent,5,(2021-08-25)1_firstSet_1.json
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899,22012641037,,"<div><span style=""font-size: 12.18px;"">Hello,&...",,2021-03-26 13:19:20.430,,11,iot_platf,support,1,/1201559436/1208431055/22012641037/,1208431055,parent,900,(2021-10-11)3_secondSet_1.json
900,22012645565,,"<p>Hi Gio Team,</p><p><br /></p><p>Thank you f...",,2021-05-20 13:03:09.327,,11,iot_platf,support,1,/1201559436/1208431055/22012645565/,1208431055,parent,901,(2021-10-11)3_secondSet_1.json
901,22012704243,,<div>The schedule test suite allow for the use...,,2021-04-26 10:04:12.410,,9,iot_platf,support,1,/1201559436/1208431055/22012704243/,1208431055,parent,902,(2021-10-11)3_secondSet_1.json
902,22012765885,,"<p>Hi Gio Team,</p><p><br /></p><p>Thank you f...",,2021-06-30 00:35:58.927,,14,iot_platf,support,1,/1201559436/1208431055/22012765885/,1208431055,parent,903,(2021-10-11)3_secondSet_1.json


### Data Pre-processing

### a) Dataframe manipulation

In [4]:
def df_manipulation(df,how,keep="first",cols_tokeep=None,cols_todrop=None,impute_value=None,subset=None):
    """
    1) Column selection: Keep or drop columns in dataframe
    2) Data impute: Impute NA values, or drop NA rows when no impute value is given
    3) Data duplication cleaning: Drop all duplicates or drop all duplicates except for the first/last occurrence

    Progress (shapes, null counts, duplicate count) is printed after each step.

    params:
    df [dataframe]: input dataframe
    how[string]: Drop rows when we have at least one NA or all NA (only used when impute_value is None). Choose
                      # - "all": Drop row with all NA
                      # - "any": Drop row with at least one NA
    keep[string/None/False]: Choose to drop all duplicates or drop all duplicates except for the first/last occurrence
                      # - "first"[DEFAULT] or None : Drop duplicates except for the first occurrence.
                      # - "last" : Drop duplicates except for the last occurrence.
                      # - False : Drop all duplicates.
    cols_tokeep [list/None]: list of columns to keep, if there is no list use None
    cols_todrop [list/None]: list of columns to drop, if there is no list use None
    impute_value [string/None]: value to be imputed (i.e "" for empty string). If no value to be imputed but there are
                        rows to be dropped use None

    subset[list/None]: Subset of columns for dropping NA and identifying duplicates, use None if no column to select

    returns:
    the manipulated dataframe (the input dataframe is not modified in place)
    """

    print("Shape of df before manipulation:",df.shape)

    #Column selection - user can select columns or drop unwanted columns
    # "is not None" (rather than "!= None") so array-likes such as pd.Index are accepted
    if cols_tokeep is not None:
        df = df[cols_tokeep]
    if cols_todrop is not None:
        df = df.drop(cols_todrop,axis=1)
    print("Shape of df after selecting columns:",df.shape)

    #---Data impute - user can impute or drop rows with NA,freq of null values before & after manipulation returned---#
    print("Number of null values in df:\n",df.isnull().sum())


    # impute NA values with user's choice of imputation value
    if impute_value is not None:
        df = df.fillna(impute_value)
        print("Number of null values in df after NA imputation:\n",df.isnull().sum())

    else: # drop rows with NA values
        df= df.dropna(axis=0, how=how,subset=subset)
        print("Number of null values in df after dropping NA rows:\n",df.isnull().sum())
        print("Shape of df after dropping NA rows:",df.shape)

    #---------Data duplication cleaning--------#
    # None is accepted as an alias for "first"
    if keep is None:
        keep = "first"

    # count with the same subset/keep as the drop below so the figure equals the rows removed
    print("Number of duplicates in the df:", df.duplicated(subset=subset, keep=keep).sum())

    #drop duplicates
    df = df.drop_duplicates(subset=subset, keep=keep)

    print("Shape of df after manipulation:",df.shape)

    return df


In [5]:
# df = df_manipulation(df,how="any",keep="first",cols_tokeep=["title","description","comments"],cols_todrop=None,impute_value="",subset=None)
df = df_manipulation(df,how="any",keep="first",cols_tokeep=["title","description"],cols_todrop=None,impute_value=None,subset=None)
df.head()

Shape of df before manipulation: (2712, 15)
Shape of df after selecting columns: (2712, 2)
Number of null values in df:
 title          1808
description       0
dtype: int64
Number of null values in df after dropping NA rows:
 title          0
description    0
dtype: int64
Shape of df after dropping NA rows: (904, 2)
Number of duplicates in the df: 0
Shape of df after manipulation: (904, 2)


Unnamed: 0,title,description
0,provide method to update GIO fields from git r...,Please provide a way to update GIO fields from...
1,Test suite execution terminates before executi...,<p>Test suite execution finished before execut...
2,Cloning defects from another test cycle is not...,<p>I am trying to clone defects from another t...
3,[Testing Only] this is enhancement only,Retest some function again.
4,[Testing Only] this is consultation only,enter the support needed at here ...



### b) Text Normalization

### 2) Expand contractions 

In [6]:
import contractions

def word_contractions(text):
    """
    Expand word contractions (i.e. "isn't" to "is not").

    The string is split on whitespace, each token is passed through
    contractions.fix, and the results are joined back with single spaces.

    params:
    text[string]: input string
    """
    expanded_tokens = []
    for token in text.split():
        expanded_tokens.append(contractions.fix(token))
    return " ".join(expanded_tokens)


In [None]:
df["title_cont"] = [word_contractions(text) for text in df["title"]]
df["desc_cont"]=  [word_contractions(text) for text in df["description"]]
df["comments_cont"]=  [word_contractions(text) for text in df["comments"]]
df.head()

In [None]:
df.iloc[149,1]

In [None]:
df.iloc[149,4]

### 3) Convert all characters into lowercase 

In [7]:
def lowercase(text):
    """
    Lower-case a string; any value that is not exactly a str is handed back untouched.
    param:
    text[string]: input string
    """
    if type(text) != str:
        return text
    return text.lower()
    

In [None]:
df["title_lower"] = [lowercase(text) for text in df["title_cont"]]
df["desc_lower"]= [lowercase(text) for text in df["desc_cont"]]
df["comments_lower"]= [lowercase(text) for text in df["comments_cont"]]
df.head()

In [None]:
# df = df[["title_lower","desc_lower","comments_lower"]]


### 4) Stemming/Lemmatization

### Stemming

In [None]:
df= df[["title_rare","desc_rare","comments_rare"]]

In [8]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

def stem_words(text,stemmer_type=None):
    """
    Stemming words. Default option is Porter Stemmer, alternative option is Lancaster Stemmer.
    The string is split on whitespace, every token is stemmed, and the stems are joined
    back with single spaces.

    params:
    text[string]: input string
    stemmer_type[None/string]: input stemming method
                                - None (or "Porter") for Porter Stemmer
                                - "Lancaster" for Lancaster Stemmer

    raises:
    ValueError: stemmer_type is not one of the supported options
    """
    if stemmer_type is None or stemmer_type == "Porter":
        stemmer = PorterStemmer()
    elif stemmer_type == "Lancaster":
        stemmer = LancasterStemmer()
    else:
        # previously fell through with `stemmer` unbound and failed with UnboundLocalError
        raise ValueError(
            "Unknown stemmer_type %r: use None/'Porter' or 'Lancaster'" % (stemmer_type,)
        )
    return " ".join([stemmer.stem(word) for word in text.split()])
    

In [None]:
df1 = df.copy()

In [None]:
df1["title_stem_por"] = [stem_words(text,stemmer_type=None) for text in df1["title_rare"]]
df1["desc_stem_por"] = [stem_words(text,stemmer_type=None) for text in df1["desc_rare"]]
df1["comments_stem_por"]= [stem_words(text,stemmer_type=None) for text in df1["comments_rare"]]
df1.head()

In [None]:
df1["title_stem_lan"] = [stem_words(text,stemmer_type = "Lancaster") for text in df1["title_rare"]]
df1["desc_stem_lan"] = [stem_words(text,stemmer_type = "Lancaster") for text in df1["desc_rare"]]
df1["comments_stem_lan"]= [stem_words(text,stemmer_type = "Lancaster") for text in df1["comments_rare"]]
df1.head()

### Lemmatization

In [None]:
df2 = df.copy()
df2.head()

In [None]:
import spacy
import nltk
from nltk.stem import WordNetLemmatizer

def lemmatize_words(column,lemma_type=None):
    """
    Lemmatize words: Default option is WordNetLemmatizer, alternative option is Spacy
    params:
    column[series]: input series/column to be lemmatized
    lemma_type[None/string]: input lemmatization method
                            - None for WordNetLemmatizer
                            - "Spacy" for Spacy

    returns:
    series of lemmatized strings (tokens joined with single spaces)

    raises:
    ValueError: lemma_type is not one of the supported options
    """
    if lemma_type is None:

        lemmatizer = WordNetLemmatizer()
        return column.apply(lambda text: " ".join([lemmatizer.lemmatize(word) for word in text.split()]))


    if lemma_type == "Spacy":
        # NOTE: the model is loaded on every call
        nlp = spacy.load("en_core_web_sm")
        column = column.apply(lambda text: " ".join([w.lemma_ for w in nlp(text)]))
        #convert to lower case as spacy will convert pronouns to upper case
        column = column.apply(lambda text: text.lower() if type(text) == str else text )

        return column

    # previously an unknown option returned None, which callers then assigned as a column
    raise ValueError("Unknown lemma_type %r: use None or 'Spacy'" % (lemma_type,))
        


In [None]:
#Spacy
df2["title_lemma_spacy"] = lemmatize_words(column= df2["title_rare"],lemma_type="Spacy")
df2["desc_lemma_spacy"] = lemmatize_words(column= df2["desc_rare"],lemma_type="Spacy")
df2["comments_lemma_spacy"] = lemmatize_words(column= df2["comments_rare"],lemma_type="Spacy")
df2

In [None]:
#WordNetLemmatizer
df2["title_lemma_word"] = lemmatize_words(column= df2["title_rare"],lemma_type=None)
df2["desc_lemma_word"] = lemmatize_words(column= df2["desc_rare"],lemma_type=None)
df2["comments_lemma_word"] = lemmatize_words(column= df2["comments_rare"],lemma_type=None)
df2

### b) Noise filtering



### 1) Remove html tag and url

In [8]:
from bs4 import BeautifulSoup
import re
def remove_htmltag_url(text):
    """
    Remove html tag and url

    HTML is parsed with BeautifulSoup's 'html.parser' and only the text content is kept
    (fragments joined by a single space, each stripped). Afterwards every run of
    non-whitespace characters that starts with "http"/"https" is replaced by a space.

    Side effect: sets pandas' global `mode.chained_assignment` option to None on
    every call.

    params:
    text [string]: input string

    returns:
    the cleaned string
    """
    import pandas as pd
    pd.options.mode.chained_assignment = None
    #remove html tag
    text = BeautifulSoup(text, 'html.parser').get_text(separator= " ",strip=True)
    #remove url (raw string: '\S' in a normal literal is an invalid escape sequence)
    text_clean = re.sub(r'https?[://%]*\S+', ' ',text)
    return text_clean

In [None]:
df["title_tag"] = [remove_htmltag_url(text) for text in df["title_lower"]]
df["desc_tag"]= [remove_htmltag_url(text) for text in df["desc_lower"]]
df["comments_tag"]= [remove_htmltag_url(text) for text in df["comments_lower"]]
df.head()

In [None]:
df.iloc[10,1]

In [None]:
df.iloc[10,4]

### 2) Remove irrelevant characters, punctuation, special characters

In [None]:
df = df[["title_tag","desc_tag","comments_tag"]]

In [9]:
import re
def remove_irrchar_punc(text,char=None):
    """
    Remove irrelevant characters and punctuation. Optional: User can specify special characters to be removed in regex
    format.

    Every removed span is replaced by a single space, so the result may contain runs of spaces.

    params:
    text[string]: input string
    char[None/string]: regex of extra characters to be removed, applied before the built-in rules;
                       None to skip this step

    returns:
    the cleaned string
    """
    if char is not None:
        #Remove special characters given by user
        text = re.sub(char, ' ',text)

    # Remove utf-8 literals (i.e. \\xe2\\x80\\x8): backslash(es), "x", then any two characters
    text = re.sub(r'\\+x[\d\D][\d\D]', ' ',text)

    #Remove special characters and punctuation (everything that is neither a word character nor whitespace)
    text = re.sub(r'[^\w\s]', ' ',text)
    # "_" counts as a word character, so it has to be removed separately
    text = re.sub(r'_', ' ',text)

    return text


In [None]:
df["title_rem"] = [remove_irrchar_punc(text,char=None) for text in df["title_tag"]]
df["desc_rem"]= [remove_irrchar_punc(text,char=None) for text in df["desc_tag"]]
df["comments_rem"]= [remove_irrchar_punc(text,char=None) for text in df["comments_tag"]]
df.head()

In [None]:
df.iloc[10,1] #desc before rem

In [None]:
df.iloc[10,4] #desc rem

In [None]:
#special character removal added by user
char = '\++\d+'
df["title_rem"] = [remove_irrchar_punc(text,char=char) for text in df["title_tag"]]
df["desc_rem"]= [remove_irrchar_punc(text,char=char) for text in df["desc_tag"]]
df["comments_rem"]= [remove_irrchar_punc(text,char=char) for text in df["comments_tag"]]
df.head()

### 3) Remove numeric data

In [None]:
df = df[["title_rem","desc_rem","comments_rem"]]

In [10]:
def remove_num(text):
    """
    Remove numeric data: every run of consecutive digits is replaced by a single space.
    params:
    text[string]: input string

    returns:
    the string without digits
    """
    # raw string: '\d' in a normal literal is an invalid escape sequence
    text = re.sub(r'\d+', ' ',text)

    return text

In [None]:
df["title_num"] = [remove_num(text) for text in df["title_rem"]]
df["desc_num"]= [remove_num(text) for text in df["desc_rem"]]
df["comments_num"]= [remove_num(text) for text in df["comments_rem"]]
df.head()

### 4) Remove multiple whitespaces

In [None]:
df = df[["title_num","desc_num","comments_num"]]

In [11]:
def remove_multwhitespace(text):
    """
    Collapse each run of consecutive space characters into a single space.
    Other whitespace (tabs, newlines) is left as it is.
    params:
    text[string]: input string

    """
    space_run = re.compile(' +')
    return space_run.sub(' ', text)

In [None]:
df["title_white"] = [remove_multwhitespace(text) for text in df["title_num"]]
df["desc_white"]= [remove_multwhitespace(text) for text in df["desc_num"]]
df["comments_white"]= [remove_multwhitespace(text) for text in df["comments_num"]]
df.head()

In [None]:
df.iloc[10,1]

In [None]:
df.iloc[10,4]

### 4) Remove stopwords

In [None]:
df = df[["title_white","desc_white","comments_white"]]

In [12]:
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

def remove_stopwords(text,extra_sw=None,remove_sw=None):
    """
    Removes English stopwords. Optional: user can add own stopwords or remove words from English stopwords.
    Each whole-word occurrence of a stopword is replaced by a single space.

    params:
    text[string]: input string
    extra_sw [list] (optional): list of words/phrase to be added to the stop words
    remove_sw [list] (optional): list of words to be removed from the stop words
                                 (applied after extra_sw has been added)

    returns:
    the string with the stopwords blanked out
    """
    # work on a copy so the extension below can never touch a shared list
    all_stopwords = list(stopwords.words('english'))

    # add more stopwords
    if extra_sw is not None:
        all_stopwords.extend(extra_sw)

    # remove stopwords from existing sw list
    if remove_sw is not None:
        all_stopwords = [e for e in all_stopwords if e not in remove_sw]

    for w in all_stopwords:
        # escape so that words/phrases holding regex metacharacters (e.g. "c++") are matched literally
        pattern = r'\b'+re.escape(w)+r'\b'
        text = re.sub(pattern,' ', text)

    return text

In [None]:
#list of words/phrase to be added to the stop words 
# extra_sw = ['hsdes',"testing"]
#list of words/phrase to be removed from stop words
# remove_sw = ["i","am"]

df["title_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df["title_white"]]
df["desc_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df["desc_white"]]
df["comments_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df["comments_white"]]
df.head()

### 5) Remove frequent words

In [None]:
df = df[["title_stop","desc_stop","comments_stop"]]

In [14]:
def remove_freqwords(column,n):
    """
    Drop the n most frequent words of a column from every entry of that column.

    Word counts are taken over the whitespace-split entries of the whole column; the
    removed (word, count) pairs are printed.

    params:
    column[series]: input column to remove frequent words
    n [integer]: input number of frequent words to be removed
    """
    from collections import Counter

    word_counts = Counter()
    for entry in column.values:
        word_counts.update(entry.split())

    top_pairs = word_counts.most_common(n)
    frequent = {word for word, _count in top_pairs}

    print("Frequent words that are removed from column:", set(top_pairs))

    def drop_frequent(text):
        kept = [token for token in str(text).split() if token not in frequent]
        return " ".join(kept)

    return column.apply(drop_frequent)



In [None]:
n=10
df["title_freq"] = remove_freqwords(df["title_stop"],n)
df["desc_freq"] = remove_freqwords(df["desc_stop"],n)
df["comments_freq"] = remove_freqwords(df["comments_stop"],n)
df.head()

In [None]:
df.iloc[2,0]

In [None]:
df.iloc[2,3]

### 6) Remove rare words

In [15]:
def remove_rarewords(column,n):
    """
    Drop the n least frequent words of a column from every entry of that column.

    Word counts are taken over the whitespace-split entries of the whole column; the
    removed (word, count) pairs are printed.

    params:
    column[series]: input column to remove rare words
    n [integer]: input number of rare words to be removed
    """
    from collections import Counter

    word_counts = Counter()
    for entry in column.values:
        word_counts.update(entry.split())

    # walk the frequency ranking from its tail and keep the first n pairs
    rare_pairs = word_counts.most_common()[::-1][:n]
    rare = {word for word, _count in rare_pairs}

    print("Rare words that are removed from column:", set(rare_pairs))

    def drop_rare(text):
        kept = [token for token in str(text).split() if token not in rare]
        return " ".join(kept)

    return column.apply(drop_rare)


In [None]:
n=10
df["title_rare"] = remove_rarewords(df["title_freq"],n)
df["desc_rare"] = remove_rarewords(df["desc_stop"],n)
df["comments_rare"] = remove_rarewords(df["comments_stop"],n)
df.head()

In [None]:
df.iloc[903,1] #converting is rare word

In [None]:
df.iloc[903,7]

### c) Custom tokenization

In [None]:
def cust_tokenization_split(column,delim =None):
    """
    Tokenize every entry of a column with str.split and report the delimiter used.

    params:
    column[series]: input column
    delim[None/string],default delimiter (delim=None) is whitespace: specify delimiter to separate strings
                        - None: delimiter is white space
                        - string: delimiter is the string specified

    returns:
    series whose entries are lists of tokens
    """
    use_whitespace = delim is None

    if use_whitespace:
        print("Text is split by whitespace") #default delimiter is space if not specified
    else:
        print("Text is split by:", delim) #can accept one or more delimiter

    def split_text(text):
        if use_whitespace:
            return text.split()
        return text.split(delim)

    return column.apply(split_text)


In [None]:
#use split
df["title_token"]= cust_tokenization_split(column = df["title_rare"],delim= None) 
df["desc_token"]= cust_tokenization_split(column = df["desc_rare"],delim= None)
df["comments_token"]= cust_tokenization_split(column = df["comments_rare"],delim= None)
df

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import WordPunctTokenizer

def cust_tokenization_nltk(column,token_type):
    """
    Custom tokenization using NLTK
    params:
    column[series]: input column
    token_type["string"]: type of nltk tokenization
    a) token_type = "WordToken" tokenizes a string into a list of words
    b) token_type = "SentToken" tokenizes a string containing sentences into a list of sentences
    c) token_type = "WhiteSpaceToken" tokenizes a string on whitespace (space, tab, newline)
    d) token_type = "WordPunctTokenizer" tokenizes a string on punctuations

    returns:
    series whose entries are lists of tokens

    raises:
    ValueError: token_type is not one of the four supported options
    """
    if token_type == "WordToken":
        tokenizer = word_tokenize
    elif token_type == "SentToken":
        tokenizer = sent_tokenize
    elif token_type == "WhiteSpaceToken":
        tokenizer = WhitespaceTokenizer().tokenize
    elif token_type == "WordPunctTokenizer":
        tokenizer = WordPunctTokenizer().tokenize
    else:
        # previously fell through with `tokenizer` unbound and failed with UnboundLocalError
        raise ValueError(
            "Unknown token_type %r: use 'WordToken', 'SentToken', 'WhiteSpaceToken' or 'WordPunctTokenizer'"
            % (token_type,)
        )

    return column.apply(tokenizer)

In [None]:
#use nltk
df["title_token"]= cust_tokenization_nltk(column = df["title_rare"],token_type= "WordToken") 
df["desc_token"]= cust_tokenization_nltk(column = df["desc_rare"],token_type="WordToken")
df["comments_token"]= cust_tokenization_nltk(column = df["comments_rare"],token_type= "WordToken")
df

## d) Custom taxonomy

### i) Configurability for user to provide taxonomy mapping (to remove/remain)

In [None]:
df = df[["title_rare","desc_rare","comments_rare"]]
df.head()

In [18]:
def custom_taxo(df,remove_taxo,include_taxo):
    """
    User provides taxonomy to be removed or remained in the text.
    a) user wants to remove taxonomies only -> input a list of taxonomies to be removed in remove_taxo
    b) user wants to remove taxonomies but wants the same taxonomy to remain in certain phrases
    (i.e remove taxo "test" but  "test" remains in "test cycle") -> input a list of taxonomies to be removed in remove_taxo and list of
    phrases for the taxonomy to remain in include_taxo

    Every removed whole-word occurrence is replaced by a single space. Terms are matched
    literally (regex metacharacters inside a term are escaped).

    params:
    df [dataframe]: input dataframe; every cell is expected to be a string
    remove_taxo[list/regex/None]: list of taxonomy to be removed from text, or a regex string.
                 A regex is first expanded into the list of distinct strings it matches anywhere
                 in df (printed as "Remove_taxo_list"); it should not contain capturing groups,
                 because re.findall then returns the groups instead of the whole match.
                 None leaves the text untouched.
    include_taxo[list/None]: list of taxonomy to be maintained in text

    returns:
    new dataframe with the cleaned text; every column name gets the suffix "_taxo"
    """
    import re

    #if remove_taxo is regex, collect all distinct matches as a list (first-seen order, column by column)
    if isinstance(remove_taxo, str):
        finder = re.compile(remove_taxo)
        found = {}
        for i in range(len(df.columns)):
            for text in df.iloc[:,i]:
                for hit in finder.findall(text):
                    found.setdefault(hit, None)

        # a regex without any match now yields an empty list instead of KeyError: 'Match'
        remove_taxo = list(found)
        print("Remove_taxo_list:", remove_taxo)

    def strip_term(text,term):
        """Replace whole-word occurrences of the literal term with a space."""
        return re.sub(r'\b'+re.escape(term)+r'\b',' ', text)

    def taxo(text):
        """Apply the remove/include rules to a single cell."""
        if remove_taxo is None:
            return text

        for w in remove_taxo:
            if include_taxo is None:
                #user wants to remove taxonomies only
                text = strip_term(text,w)
            elif all(phrase not in text for phrase in include_taxo):
                #row without any item from include_taxo -> every remove_taxo item is removed
                #(checked against the current text, i.e. after the removals done so far)
                text = strip_term(text,w)
            elif all(w not in phrase for phrase in include_taxo):
                #row with an item from include_taxo (i.e. remove "test" but keep "test" in "test cycle")
                #-> only remove terms that are not part of any include_taxo phrase
                text = strip_term(text,w)

        return text

    # column-wise Series.map is the element-wise equivalent of the deprecated DataFrame.applymap
    df = df.apply(lambda col: col.map(taxo))
    df = df.add_suffix('_taxo')

    return df

In [19]:
custom_taxo(df1,remove_taxo = r'test \w+',include_taxo=["test suite execution"])

Remove_taxo_list: ['test suite', 'test cycle', 'test execution', 'test case', 'test entry', 'test result', 'test cases', 'test schedule', 'test build', 'test recipe', 'test to', 'test schedular', 'test stuck', 'test tools', 'test components', 'test sut', 'test cycles', 'test procedure', 'test occasionally', 'test report', 'test please', 'test group', 'test page', 'test script', 'test component', 'test id', 'test excution', 'test never', 'test sst', 'test automation', 'test results', 'test type', 'test did', 'test center', 'test run', 'test in', 'test reporting', 'test list', 'test completed', 'test ended', 'test as', 'test show', 'test is', 'test python', 'test request', 'test which', 'test customization', 'test and', 'test planning', 'test after', 'test configuration', 'test through', 'test gio', 'test modification', 'test client', 'test test', 'test passing', 'test executed', 'test host', 'test command', 'test only', 'test project', 'test hang', 'test config', 'test groups', 'test sk

Unnamed: 0,title_clean_taxo,desc_clean_taxo
0,provide method to update gio fields from git r...,please provide a way to update gio fields from...
1,test suite execution terminates before executi...,test suite execution finished before executing...
2,cloning defects from another is not working,i am trying to clone defects from another i ...
3,testing only this is enhancement only,retest some function again
4,testing only this is consultation only,enter the support needed at here
...,...,...
899,import gc time global domain artifact in gio f...,hello please import time global domain time kp...
900,kpi metric phase extract kpi metric trend acr...,hi gio team thank you for providing kpi metric...
901,ability to clone schedule from other programs,the schedule allow for the user to clone w...
902,kpi metric enhance kpi feature to plot graphs...,hi gio team thank you for providing kpi featur...


In [20]:
custom_taxo(df1,remove_taxo = r'test \w+',include_taxo=None)

Remove_taxo_list: ['test suite', 'test cycle', 'test execution', 'test case', 'test entry', 'test result', 'test cases', 'test schedule', 'test build', 'test recipe', 'test to', 'test schedular', 'test stuck', 'test tools', 'test components', 'test sut', 'test cycles', 'test procedure', 'test occasionally', 'test report', 'test please', 'test group', 'test page', 'test script', 'test component', 'test id', 'test excution', 'test never', 'test sst', 'test automation', 'test results', 'test type', 'test did', 'test center', 'test run', 'test in', 'test reporting', 'test list', 'test completed', 'test ended', 'test as', 'test show', 'test is', 'test python', 'test request', 'test which', 'test customization', 'test and', 'test planning', 'test after', 'test configuration', 'test through', 'test gio', 'test modification', 'test client', 'test test', 'test passing', 'test executed', 'test host', 'test command', 'test only', 'test project', 'test hang', 'test config', 'test groups', 'test sk

Unnamed: 0,title_clean_taxo,desc_clean_taxo
0,provide method to update gio fields from git r...,please provide a way to update gio fields from...
1,execution terminates before executing all tests,execution finished before executing all test...
2,cloning defects from another is not working,i am trying to clone defects from another i ...
3,testing only this is enhancement only,retest some function again
4,testing only this is consultation only,enter the support needed at here
...,...,...
899,import gc time global domain artifact in gio f...,hello please import time global domain time kp...
900,kpi metric phase extract kpi metric trend acr...,hi gio team thank you for providing kpi metric...
901,ability to clone schedule from other programs,the schedule allow for the user to clone w...
902,kpi metric enhance kpi feature to plot graphs...,hi gio team thank you for providing kpi featur...


In [15]:
df1=df.copy()
# df1 = df1[["title","description"]]
df1.head()

Unnamed: 0,title,description
0,provide method to update GIO fields from git r...,Please provide a way to update GIO fields from...
1,Test suite execution terminates before executi...,<p>Test suite execution finished before execut...
2,Cloning defects from another test cycle is not...,<p>I am trying to clone defects from another t...
3,[Testing Only] this is enhancement only,Retest some function again.
4,[Testing Only] this is consultation only,enter the support needed at here ...


In [16]:
df1["desc_cont"] = [word_contractions(text) for text in df1["description"]]
df1["desc_lower"] = [lowercase(text) for text in df1["desc_cont"]]
df1["desc_tag"] = [remove_htmltag_url(text) for text in df1["desc_lower"]]
df1["desc_rem"] = [remove_irrchar_punc(text,char=None) for text in df1["desc_tag"]]
df1["desc_num"] = [remove_num(text) for text in df1["desc_rem"]]
df1["desc_clean"] = [remove_multwhitespace(text) for text in df1["desc_num"]]

df1["title_cont"] = [word_contractions(text) for text in df1["title"]]
df1["title_lower"] = [lowercase(text) for text in df1["title_cont"]]
df1["title_tag"] = [remove_htmltag_url(text) for text in df1["title_lower"]]
df1["title_rem"] = [remove_irrchar_punc(text,char=None) for text in df1["title_tag"]]
df1["title_num"] = [remove_num(text) for text in df1["title_rem"]]
df1["title_clean"] = [remove_multwhitespace(text) for text in df1["title_num"]]



In [17]:
df1 = df1[["title_clean","desc_clean"]]
df1.head()

Unnamed: 0,title_clean,desc_clean
0,provide method to update gio fields from git r...,please provide a way to update gio fields from...
1,test suite execution terminates before executi...,test suite execution finished before executing...
2,cloning defects from another test cycle is not...,i am trying to clone defects from another test...
3,testing only this is enhancement only,retest some function again
4,testing only this is consultation only,enter the support needed at here


In [None]:
#list of words to remove
# remove_taxo = ["gio","fields","test"]
# #list of words to maintain
# include_taxo = ["test suite execution","kpi metric"]
# df1["desc_taxo"]=  [custom_taxo(text,remove_taxo,include_taxo) for text in df1["desc_white"]]


### ii) Custom Named Entity Recognition (Methodology to recommend potential taxonomy)
1) Split the text data into train, validation and test sets.

2) Create custom entity annotations for the train and validation sets.

3) Download the base_config.cfg file from the spaCy website and save it in the same folder as this Jupyter notebook.

4) The functions below will:

    i) convert the train and validation data into .spacy format
    
    ii) build and save the NER model in the given path, or load a previously built NER model
    
    iii) label entities in the test data to recommend potential taxonomy terms to the user


In [None]:
# df = df[["title_stop","desc_stop","comments_stop"]]

In [None]:
import pandas as pd
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
import numpy as np

def convert_spacy(DATA):
    """
    Pack annotated examples into a spaCy DocBin.

    params:
    DATA: iterable of (text, annotation) pairs, where annotation["entities"] is an iterable of
          (start, end, label) character offsets into text

    returns:
    DocBin holding one Doc per example, with its entity spans set
    """
    blank_pipeline = spacy.blank("en")  # tokenizer-only English pipeline
    doc_bin = DocBin()

    for raw_text, annotation in tqdm(DATA):
        document = blank_pipeline.make_doc(raw_text)
        kept_spans = []
        for begin, stop, tag in annotation["entities"]:
            candidate = document.char_span(begin, stop, label=tag, alignment_mode="contract")
            if candidate is not None:
                kept_spans.append(candidate)
                continue
            # char_span gave no span for these offsets, so the annotation is dropped
            print("Skipping entity")
        document.ents = kept_spans
        doc_bin.add(document)

    return doc_bin

    
def custom_ner(TRAIN_DATA,VAL_DATA,path):
    """
    Convert the train/validation data to .spacy files, then train a custom spaCy NER model
    through the spaCy command line.

    params:
    TRAIN_DATA: training examples, passed unchanged to convert_spacy
    VAL_DATA: validation examples, passed unchanged to convert_spacy
    path [string]: prefix that 'train.spacy' and 'val.spacy' are appended to by plain string
                   concatenation, so it needs to end with a path separator

    Returns nothing; the results are files on disk plus printed progress messages.

    NOTE(review): the .spacy files are written under `path`, while the shell commands below read
    base_config.cfg, ./train.spacy and ./val.spacy and write config.cfg and ./output relative to
    the current working directory. The two only line up when `path` is the working directory.
    NOTE(review): the "built and saved" message is printed whether or not the shell commands
    succeeded; their exit status is not checked.
    """
    #convert train and validation data into .spacy format
    db_train = convert_spacy(TRAIN_DATA) 
    db_val = convert_spacy(VAL_DATA) 
    
    #save train and validation data in .spacy format in path
    db_train.to_disk(path +'train.spacy')
    db_val.to_disk(path +'val.spacy')
    
    print("Train and validation converted to .spacy format and saved")
    
    #autofill base_config file saved by user from spacy website
    # (IPython shell escape: only valid inside a notebook/IPython session)
    !python -m spacy init fill-config base_config.cfg config.cfg
    
    #Model building and saving in path
    # (IPython shell escape; output goes to ./output relative to the working directory)
    !python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./val.spacy
    
    print("Custom NER model built and saved!")
    
def check_ents(path,column):
    """
    Load the trained model and print the distinct entities it finds in a text column.

    params:
    path [string]: folder prefix; the model is loaded from path + "/output/model-best/"
    column [series]: texts to run the model on

    Prints the unique "<entity text> - <label>" strings; returns nothing.
    """
    model_dir = path + "/output/model-best/"
    trained_pipeline = spacy.load(model_dir)
    print("Best model loaded!")

    # one "<text> - <label>" string per entity found, over every text in the column
    labelled = [
        found.text + ' - ' + found.label_
        for sentence in column.tolist()
        for found in trained_pipeline(sentence).ents
    ]
    print(np.unique(np.array(labelled)))

def ner_wrapper(TRAIN_DATA,VAL_DATA,path,column,train_model):
    """
    Entry point for the custom NER step: optionally train the spaCy model, then list the
    entities the best model finds in `column`.

    params:
    TRAIN_DATA[NER format]: train data for model building
    VAL_DATA[NER format]: validation data for model building
    path[string]: input path to store model. Path has to be the same as the folder holding the
                  base_config.cfg file downloaded from the spaCy website and the jupyter notebook.
    column[series]: column for entities to be checked
    train_model[True/False]: True to train the model first. False to only load the saved model.
                             Any other value does nothing.
    """
    wants_training = train_model == True
    wants_loading_only = train_model == False

    if wants_training:
        custom_ner(TRAIN_DATA,VAL_DATA,path)

    # entities are checked in both modes; only the training step is optional
    if wants_training or wants_loading_only:
        check_ents(path,column)

In [None]:
# Custom entity data for the train and validation sets.
# Each example is [text, {"entities": [[start, end, label], ...]}] where start/end are character
# offsets into text (end exclusive), so text[start:end] must be exactly the entity.
TRAIN_DATA = [
["jchun wai kit is working on this to enable in new tcp", {"entities": [[0, 13, "NAME"]]}],
["siewlita pending release", {"entities": [[0, 8, "NAME"]]}],
["hi lim chih quanx per our communication i still have one more question", {"entities": [[3, 17, "NAME"]]}],
["yeetheng the auto test trigger after build complete is working fine today", {"entities": [[0, 8, "NAME"]]}],
["hi jon here is the recipe link weichuan hi can you try to reproduce the issue once more", {"entities": [[3, 6, "NAME"],[31, 39, "NAME"]]}]
]

VAL_DATA = [
["wei chuan has updated me with the sample of test execution by automation manual chart", {"entities": [[0, 9, "NAME"]]}],
# "jonathan" ends at offset 54; the previous end of 55 included the following space
["subject gio logs and gio installation hi ajay jonathan i just noticed that star is directing all the logs to gio folder", {"entities": [[41, 45, "NAME"],[46, 54, "NAME"]]}],
# "jenkins" ends at offset 34; the previous end of 35 included the following space
# NOTE(review): "jenkins" is labelled NAME here - confirm this is the intended label
["hi firesh final verdict in jenkins coming as fail even after all the triggered tests are passed", {"entities": [[3, 9, "NAME"],[27, 34, "NAME"]]}],
["wai kit below is the requirement needed from gio product defect detection", {"entities": [[0, 7, "NAME"]]}],
["just string field regards robert nowicki", {"entities": [[26, 40, "NAME"]]}]
]

#jupyter notebook and base_config.cfg path have to be the same
# NOTE(review): this rebinds the notebook-wide `path` variable (first set in the data-loading
# section) to a hard-coded, user-specific folder; later cells that use `path` see this value.
path = "C:/Users/nchong/"

#load and clean test data
# NOTE(review): df_manipulation is defined elsewhere in the notebook; presumably it keeps only the
# listed columns and fills missing values with "" - confirm against its definition.
df_test = pd.read_excel("C:/Users/nchong/test.xlsx",index_col=0)
df_test = df_manipulation(df_test,how="any",keep="first",cols_tokeep=["title","description","comments"],cols_todrop=None,impute_value="",subset=None)



In [None]:
ner_wrapper(TRAIN_DATA,VAL_DATA,path,column=df_test["comments"],train_model=True)

In [None]:
ner_wrapper(TRAIN_DATA,VAL_DATA,path,column=df_test["comments"],train_model=False)

### Feature extraction

In [None]:
# data preprocessing
# Each helper's output is stored in a new column, so every intermediate stage stays in df.
# NOTE(review): the helpers are defined elsewhere in the notebook; this cell only chains them.
#title
df["title_cont"] = [word_contractions(text) for text in df["title"]]
df["title_lower"] = [lowercase(text) for text in df["title_cont"]]
df["title_tag"] = [remove_htmltag_url(text) for text in df["title_lower"]]
df["title_rem"] = [remove_irrchar_punc(text,char=None) for text in df["title_tag"]]
df["title_num"] = [remove_num(text) for text in df["title_rem"]]
df["title_white"] = [remove_multwhitespace(text) for text in df["title_num"]]
df["title_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df["title_white"]]
# n is passed to both remove_freqwords and remove_rarewords
# (presumably the number of most frequent / rarest words to drop - confirm against their definitions)
n=10
df["title_freq"] = remove_freqwords(df["title_stop"],n)
df["title_rare"] = remove_rarewords(df["title_freq"],n)
df["title_lemma"] = lemmatize_words(column= df["title_rare"],lemma_type=None)
df["title_clean"] = df["title_lemma"]

#description
# Same sequence of helpers as for the title.
df["desc_cont"] = [word_contractions(text) for text in df["description"]]
df["desc_lower"] = [lowercase(text) for text in df["desc_cont"]]
df["desc_tag"] = [remove_htmltag_url(text) for text in df["desc_lower"]]
df["desc_rem"] = [remove_irrchar_punc(text,char=None) for text in df["desc_tag"]]
df["desc_num"] = [remove_num(text) for text in df["desc_rem"]]
df["desc_white"] = [remove_multwhitespace(text) for text in df["desc_num"]]
df["desc_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df["desc_white"]]
n=10
df["desc_freq"] = remove_freqwords(df["desc_stop"],n)
df["desc_rare"] = remove_rarewords(df["desc_freq"],n)
df["desc_lemma"] = lemmatize_words(column= df["desc_rare"],lemma_type=None)
df["desc_clean"] = df["desc_lemma"]

df.head()

In [None]:
df = df[["title_clean","desc_clean"]]
df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def feature_extraction(column,ngram_range=None,ascending=None,fe_type=None):
    """
    Feature extraction methods - TF-IDF(default choice) or Bag of words

    params:
    column [series/DataFrame]: column selected for feature extraction
                        - series: only one column is selected for feature extraction (e.g. df["title_clean"])
                        - DataFrame: more than one column is selected for feature extraction (e.g. df[["title_clean","desc_clean"]])
    ngram_range [tuple(min_n, max_n)]: The lower and upper boundary of the range of n-values for different n-grams to be extracted
                                       - [default] ngram_range of (1, 1) means only unigrams,
                                       - ngram_range of (1, 2) means unigrams and bigrams,
                                       - ngram_range of (2, 2) means only bigram
    ascending [True/False/None]: - [default] None (words kept in the vectorizer's own order)
                                 - True(words arranged in ascending order of sum),
                                 - False(words arranged in descending order of sum)
    fe_type[string/None]: Feature extraction type: Choose "bagofwords" for bow or None for default tfidf method

    returns:
    tuple (df, vec_type, vectorized)
        df [DataFrame]: one row per document and one column per term, plus a last row labelled 'sum'
                        holding the column totals
        vec_type: the fitted CountVectorizer / TfidfVectorizer
        vectorized: the document-term matrix returned by fit_transform (without the 'sum' row)

    raises:
    ValueError: if fe_type is neither None nor "bagofwords"
    """
    # several columns: join each row's values into one space-separated string
    if isinstance(column, pd.DataFrame):
        column = column.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

    if ngram_range is None: #set ngram range as unigram by default
        ngram_range=(1,1)

    if fe_type == "bagofwords":
        vec_type = CountVectorizer(ngram_range=ngram_range, analyzer='word')
    elif fe_type is None: #tfidf
        vec_type = TfidfVectorizer(ngram_range=ngram_range, analyzer='word')
    else:
        # previously an unknown value fell through and failed later with an UnboundLocalError
        raise ValueError('fe_type must be None (tfidf) or "bagofwords", got %r' % (fe_type,))

    vectorized = vec_type.fit_transform(column)

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only on old versions
    if hasattr(vec_type, "get_feature_names_out"):
        feature_names = vec_type.get_feature_names_out()
    else:
        feature_names = vec_type.get_feature_names()

    df = pd.DataFrame(vectorized.toarray(), columns=feature_names)
    totals = df.sum(axis=0)
    # bag-of-words totals are counts, so they are stored as integers
    df.loc['sum'] = totals.astype(int) if fe_type == "bagofwords" else totals

    if ascending is not None:
        df = df.sort_values(by ='sum', axis = 1,ascending=ascending)

    return df,vec_type,vectorized

In [None]:
feature_extraction(column=df[["title_clean","desc_clean"]],ngram_range=None,ascending=False,fe_type=None)[0]

In [None]:
feature_extraction(column=df["title_clean"],ngram_range=None,ascending=False,fe_type=None)[0]

In [None]:
df

In [None]:
# NOTE(review): an earlier cell rebinds df to only the "title_clean" and "desc_clean" columns; if
# that cell has been run, "id" is no longer present and this drop raises a KeyError. Confirm the
# intended execution order (this cell appears to expect the freshly loaded df).
df1 = df.drop(["id"],axis=1)
df1

In [None]:
#description
# Clean the description up to the whitespace step, then apply the custom taxonomy helper.
# NOTE(review): needs a "description" column in df1, and custom_taxo plus the cleaning helpers
# defined elsewhere in the notebook.
df1["desc_cont"] = [word_contractions(text) for text in df1["description"]]
df1["desc_lower"] = [lowercase(text) for text in df1["desc_cont"]]
df1["desc_tag"] = [remove_htmltag_url(text) for text in df1["desc_lower"]]
df1["desc_rem"] = [remove_irrchar_punc(text,char=None) for text in df1["desc_tag"]]
df1["desc_num"] = [remove_num(text) for text in df1["desc_rem"]]
df1["desc_white"] = [remove_multwhitespace(text) for text in df1["desc_num"]]
#list of words to remove
remove_taxo = ["gio","fields","test"]
#list of words to maintain
include_taxo = ["test suite execution","kpi metric"]
df1["desc_taxo"]=  [custom_taxo(text,remove_taxo,include_taxo) for text in df1["desc_white"]]

# Remaining cleaning stages (stopwords, frequent/rare words, lemmatisation) are disabled here.
# df1["desc_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df1["desc_white"]]
# n=10
# df1["desc_freq"] = remove_freqwords(df1["desc_stop"],n)
# df1["desc_rare"] = remove_rarewords(df1["desc_freq"],n)
# df1["desc_lemma"] = lemmatize_words(column= df1["desc_rare"],lemma_type=None)
# df1["desc_clean"] = df1["desc_lemma"]

df1.head()

### Unsupervised Learning
### i) K-means clustering

In [None]:
df1 = df.copy()
df1.head()

In [None]:
# Keep only the title column, then run the full title cleaning chain; each stage gets its own column.
# NOTE(review): df_manipulation and the cleaning helpers are defined elsewhere in the notebook.
df1 = df_manipulation(df1,how="any",keep="first",cols_tokeep=["title"],cols_todrop=None,impute_value=None,subset=["title"])
df1["title_cont"] = [word_contractions(text) for text in df1["title"]]
df1["title_lower"] = [lowercase(text) for text in df1["title_cont"]]
df1["title_tag"] = [remove_htmltag_url(text) for text in df1["title_lower"]]
df1["title_rem"] = [remove_irrchar_punc(text,char=None) for text in df1["title_tag"]]
df1["title_num"] = [remove_num(text) for text in df1["title_rem"]]
df1["title_white"] = [remove_multwhitespace(text) for text in df1["title_num"]]
df1["title_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df1["title_white"]]
# n is passed to both remove_freqwords and remove_rarewords
n=10

df1["title_freq"] = remove_freqwords(df1["title_stop"],n)
df1["title_rare"] = remove_rarewords(df1["title_freq"],n)
df1["title_lemma_word"] = lemmatize_words(column= df1["title_rare"],lemma_type=None)
df1["title_clean"] = df1["title_lemma_word"]
df1.head()

In [None]:
df1 = df1[["title_clean"]]

In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics
import pandas as pd

silhouette_avg_list = []
n_clusters_list = []
dicts = {}

def kmeans_clustering(column,top_n_terms,ngram_range=None,fe_type=None,n_clusters=None,max_n_clusters=None):
    """
    K-means clustering for unsupervised learning. User can choose either option:
    (1) provide the number of clusters or
    (2) provide the max number of clusters for kmeans to iterate through; the number of clusters with the highest
    silhouette score will be chosen. Min number of clusters is fixed as 2.
    If both are given, the score for n_clusters is printed first and the result of option (2) is returned.

    params:
    column [series/DataFrame]: column(s) selected for clustering
                        - series: only one column is selected for clustering (e.g. df["title_clean"])
                        - DataFrame: more than one column is selected for clustering (e.g. df[["title_clean","desc_clean"]])
    top_n_terms[int]: the top n terms in each cluster to be printed out
    ngram_range [tuple(min_n, max_n)]: The lower and upper boundary of the range of n-values for different n-grams to be extracted
                                   - [default] ngram_range of (1, 1) means only unigrams,
                                   - ngram_range of (1, 2) means unigrams and bigrams,
                                   - ngram_range of (2, 2) means only bigram
    fe_type[string/None]: Feature extraction type: Choose "bagofwords" for bow or None for default tfidf method
    n_clusters[None/int]: number of clusters. Choose None for option (2)
    max_n_clusters[None/int]: max number of clusters. Choose None for option (1)

    returns:
    labels [array]: cluster label of every input row, in input order

    raises:
    ValueError: if neither n_clusters nor max_n_clusters is given, or if max_n_clusters is below 2
    """
    if n_clusters is None and max_n_clusters is None:
        raise ValueError("Provide either n_clusters or max_n_clusters")
    if max_n_clusters is not None and max_n_clusters < 2:
        raise ValueError("max_n_clusters must be at least 2")

    # Vectorise once: feature_extraction returns (frame with a trailing 'sum' row, fitted vectorizer, matrix)
    X, vec_type, _ = feature_extraction(column,ngram_range,None,fe_type)
    X = X.drop(index='sum')

    #user provides the number of clusters
    if n_clusters is not None:
        model = KMeans(n_clusters = n_clusters, random_state=42)
        labels = model.fit_predict(X)

        score = metrics.silhouette_score(X, labels,random_state=42)
        print("Silhouette score for",n_clusters,"clusters is",round(score,3))

    #user provides the maximum number of clusters
    if max_n_clusters is not None:
        # Scores are tracked locally. The module-level silhouette_avg_list / n_clusters_list / dicts
        # kept results from earlier calls, so a stale score could decide the cluster count.
        best_score = None
        for candidate_k in range(2,max_n_clusters+1):
            candidate = KMeans(n_clusters = candidate_k, random_state=42)
            candidate_labels = candidate.fit_predict(X)

            silhouette_avg = metrics.silhouette_score(X, candidate_labels,random_state=42)
            print("For n_clusters =", candidate_k,"The silhouette_score is :", round(silhouette_avg,3))

            # strict '>' keeps the smallest cluster count when scores tie
            if best_score is None or silhouette_avg > best_score:
                best_score = silhouette_avg
                model, labels, n_clusters = candidate, candidate_labels, candidate_k

        print("\nThe optimal number of clusters selected is",n_clusters,"with silhouette_score of",round(best_score,3),"\n")

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only on old versions
    if hasattr(vec_type, "get_feature_names_out"):
        terms = vec_type.get_feature_names_out()
    else:
        terms = vec_type.get_feature_names()

    print("Top",top_n_terms,"terms per cluster:")
    order_centroids = model.cluster_centers_.argsort()[:, ::-1] #sort by descending order
    for i in range(n_clusters):
        print("Cluster %d:" % i)
        print(['%s' % terms[ind] for ind in order_centroids[i, :top_n_terms]]) #top n terms in each cluster
        print("\n")

    return labels

In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics
import pandas as pd

silhouette_avg_list = []
n_clusters_list = []
dicts = {}

def kmeans_clustering(column,top_n_terms,ngram_range=None,fe_type=None,n_clusters=None,max_n_clusters=None):
    """
    K-means clustering for unsupervised learning, with the silhouette scores of the cluster-count
    search also written to a text file. User can choose either option:
    (1) provide the number of clusters or
    (2) provide the max number of clusters for kmeans to iterate through; the number of clusters with the highest
    silhouette score will be chosen. Min number of clusters is fixed as 2.
    If both are given, the score for n_clusters is printed first and the result of option (2) is returned.

    params:
    column [series/DataFrame]: column(s) selected for clustering
                        - series: only one column is selected for clustering (e.g. df["title_clean"])
                        - DataFrame: more than one column is selected for clustering (e.g. df[["title_clean","desc_clean"]])
    top_n_terms[int]: the top n terms in each cluster to be printed out
    ngram_range [tuple(min_n, max_n)]: The lower and upper boundary of the range of n-values for different n-grams to be extracted
                                   - [default] ngram_range of (1, 1) means only unigrams,
                                   - ngram_range of (1, 2) means unigrams and bigrams,
                                   - ngram_range of (2, 2) means only bigram
    fe_type[string/None]: Feature extraction type: Choose "bagofwords" for bow or None for default tfidf method
    n_clusters[None/int]: number of clusters. Choose None for option (2)
    max_n_clusters[None/int]: max number of clusters. Choose None for option (1)

    returns:
    labels [array]: cluster label of every input row, in input order

    raises:
    ValueError: if neither n_clusters nor max_n_clusters is given, or if max_n_clusters is below 2

    side effects:
    In option (2) the per-cluster-count score lines are written to path + 'my_file.txt', replacing
    any previous content. `path` is the notebook-level variable, not a parameter.
    """
    if n_clusters is None and max_n_clusters is None:
        raise ValueError("Provide either n_clusters or max_n_clusters")
    if max_n_clusters is not None and max_n_clusters < 2:
        raise ValueError("max_n_clusters must be at least 2")

    # Vectorise once: feature_extraction returns (frame with a trailing 'sum' row, fitted vectorizer, matrix)
    X, vec_type, _ = feature_extraction(column,ngram_range,None,fe_type)
    X = X.drop(index='sum')

    #user provides the number of clusters
    if n_clusters is not None:
        model = KMeans(n_clusters = n_clusters, random_state=42)
        labels = model.fit_predict(X)

        score = round(metrics.silhouette_score(X, labels,random_state=42),3)
        print("Silhouette score for",n_clusters,"clusters is",score)

    #user provides the maximum number of clusters
    if max_n_clusters is not None:
        # Scores are tracked locally; the module-level lists/dict kept results from earlier calls.
        best_score = None
        # The file is opened once for the whole search. Re-opening it in 'w' mode inside the loop
        # truncated it on every iteration, and the print never targeted the file.
        with open(path+'my_file.txt','w') as f:
            for candidate_k in range(2,max_n_clusters+1):
                candidate = KMeans(n_clusters = candidate_k, random_state=42)
                candidate_labels = candidate.fit_predict(X)

                silhouette_avg = metrics.silhouette_score(X, candidate_labels,random_state=42)

                message = ("For n_clusters =", candidate_k,"The silhouette_score is :", round(silhouette_avg,3))
                print(*message)
                print(*message,file=f)

                # strict '>' keeps the smallest cluster count when scores tie
                if best_score is None or silhouette_avg > best_score:
                    best_score = silhouette_avg
                    model, labels, n_clusters = candidate, candidate_labels, candidate_k

        print("\nThe optimal number of clusters selected is",n_clusters,"with silhouette_score of",round(best_score,3),"\n")

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only on old versions
    if hasattr(vec_type, "get_feature_names_out"):
        terms = vec_type.get_feature_names_out()
    else:
        terms = vec_type.get_feature_names()

    print("Top",top_n_terms,"terms per cluster:")
    order_centroids = model.cluster_centers_.argsort()[:, ::-1] #sort by descending order
    for i in range(n_clusters):
        print("Cluster %d:" % i)
        print(['%s' % terms[ind] for ind in order_centroids[i, :top_n_terms]]) #top n terms in each cluster
        print("\n")

    return labels

In [None]:
# Scratch check of file logging: appends one hard-coded score line to my_file.txt under `path`.
# Note that this rebinds the notebook-level names silhouette_avg and n_clusters.
with open(path+'my_file.txt','a') as f:
    silhouette_avg=1
    n_clusters=2
    print("For n_clusters =", n_clusters,"the silhouette_score is :", round(silhouette_avg,3),file=f)
    

In [None]:
# Scratch check of writelines: mode 'w' replaces any existing content of my_file.txt under `path`
# with the three lines below (writelines adds no separators, hence the explicit "\n" items).
line1 = "First line"
line2 = "Second line"
line3 = "Third line"
with open(path+'my_file.txt','w') as out:
    out.writelines([line1,"\n",line2,"\n",line3])

#### Case 1: user provides the number of clusters ####

In [None]:
column = df1["title_clean"]

#k means clustering
df1["cluster"] = kmeans_clustering(column,top_n_terms=10,ngram_range=None,fe_type="bagofwords",n_clusters=5,max_n_clusters=None)
df1

In [None]:
#feature extraction
# NOTE(review): an earlier cell in this section narrows df1 to the single column "title_clean"; if
# that cell has run, selecting "desc_clean" here raises a KeyError. Confirm which frame this cell expects.
column = df1[["title_clean","desc_clean"]]

#k means clustering
# fixed number of clusters (n_clusters=5) on bag-of-words features; labels are stored in df1["cluster"]
df1["cluster"] = kmeans_clustering(column,top_n_terms=10,ngram_range=None,fe_type = "bagofwords",n_clusters=5,max_n_clusters=None)
df1

#### Case 2: user provides max number of clusters ### 

In [None]:
# NOTE(review): needs both "title_clean" and "desc_clean" in df1; an earlier cell in this section
# keeps only "title_clean" - confirm which frame this cell expects.
column = df1[["title_clean","desc_clean"]]

#k means clustering
# cluster count is searched from 2 up to max_n_clusters=20; labels are stored in df1["cluster"]
df1["cluster"] = kmeans_clustering(column,top_n_terms=10,ngram_range=None,fe_type ="bagofwords",n_clusters=None,max_n_clusters=20)
df1


In [None]:
#### Case 2: user provides max number of clusters ### 

column = df1["title_clean"]

#k means clustering
df1["cluster"] = kmeans_clustering(column,top_n_terms=10,ngram_range=None,fe_type ="bagofwords",n_clusters=None,max_n_clusters=20)
df1

### ii) LDA

In [None]:
df1 = df1.drop("cluster",axis=1)
df1.head()

In [None]:
# Implementation of LDA:
from sklearn.decomposition import LatentDirichletAllocation

def lda(column,n_components,top_n_terms,ngram_range=None):
    """
    LDA for unsupervised learning. Bag of words is selected for feature extraction
    params:
    column [series/DataFrame]: column(s) selected for lda
                        - series: only one column is selected for lda (e.g. df["title_clean"])
                        - DataFrame: more than one column is selected for lda (e.g. df[["title_clean","desc_clean"]])
    n_components[int]: the number of topics/clusters used in the lda_model
    top_n_terms[int]: the top n terms in each topic/cluster to be printed out
    ngram_range [tuple(min_n, max_n)]: The lower and upper boundary of the range of n-values for different n-grams to be extracted
                                   - [default] ngram_range of (1, 1) means only unigrams,
                                   - ngram_range of (1, 2) means unigrams and bigrams,
                                   - ngram_range of (2, 2) means only bigram

    returns:
    array with, for every input row, the index of the topic with the highest score for that row
    """
    #feature extraction (bag of words), done once; returns (frame, fitted vectorizer, matrix)
    _, vec_type, vectorized = feature_extraction(column,ngram_range,None,"bagofwords")

    # Create object for the LDA class
    lda_model = LatentDirichletAllocation(n_components=n_components, random_state = 42)
    lda_model.fit(vectorized)

    # Components_ gives us our topic distribution
    topic_words = lda_model.components_

    # Look the vocabulary up once instead of once per printed term.
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only on old versions.
    if hasattr(vec_type, "get_feature_names_out"):
        terms = vec_type.get_feature_names_out()
    else:
        terms = vec_type.get_feature_names()

    # Top n words for a topic, listed from lowest to highest weight
    for i,topic in enumerate(topic_words):
        print(f"The top {top_n_terms} words for topic #{i}")
        # explicit start index: a plain [-top_n_terms:] slice would return every term when top_n_terms is 0
        first = max(len(topic) - top_n_terms, 0)
        print([str(terms[index]) for index in topic.argsort()[first:]])
        print("\n")

    topic_results = lda_model.transform(vectorized) #probabilities of doc belonging to particular topic

    return topic_results.argmax(axis=1)

In [None]:
#user provides number of component and top n terms in each cluster/topic
column = df1["title_clean"]

#LDA
df1["topic"] = lda(column,n_components=5,top_n_terms=10,ngram_range=None)
df1

In [None]:
column = df1[["title_clean","desc_clean"]]

#LDA
df1["topic"] = lda(column,n_components=5,top_n_terms=10,ngram_range=None)
df1

### iii) NMF (Non-negative Matrix Factorization)

In [None]:
df1 = df1.drop(["topic"],axis=1)
df1.head()

In [None]:
from sklearn.decomposition import NMF

def nmf(column,n_components,top_n_terms,fe_type,ngram_range=None):
    """
    Non-negative matrix factorization for unsupervised learning.
    params:
    column [series/DataFrame]: column(s) selected for NMF
                        - series: only one column is selected for NMF (e.g. df["title_clean"])
                        - DataFrame: more than one column is selected for NMF (e.g. df[["title_clean","desc_clean"]])
    n_components[int]: the number of topics/clusters used in NMF
    top_n_terms[int]: the top n terms in each topic/cluster to be printed out
    fe_type[string/None]: Feature extraction type: Choose "bagofwords" for bow or None for default tfidf method
    ngram_range [tuple(min_n, max_n)]: The lower and upper boundary of the range of n-values for different n-grams to be extracted
                                   - [default] ngram_range of (1, 1) means only unigrams,
                                   - ngram_range of (1, 2) means unigrams and bigrams,
                                   - ngram_range of (2, 2) means only bigram

    returns:
    array with, for every input row, the index of the topic with the highest weight for that row
    """
    #feature extraction, done once; returns (frame, fitted vectorizer, matrix).
    # The caller's ngram_range is passed through (it used to be reset to None here and ignored).
    _, vec_type, vectorized = feature_extraction(column,ngram_range,None,fe_type)

    # Create object for the NMF class
    nmf_model = NMF(n_components=n_components,random_state=42)
    nmf_model.fit(vectorized)

    # Components_ gives us our topic distribution
    topic_words = nmf_model.components_

    # Look the vocabulary up once instead of once per printed term.
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only on old versions.
    if hasattr(vec_type, "get_feature_names_out"):
        terms = vec_type.get_feature_names_out()
    else:
        terms = vec_type.get_feature_names()

    # Top n words for a topic, listed from lowest to highest weight
    for i,topic in enumerate(topic_words):
        print(f"The top {top_n_terms} words for topic #{i}")
        # explicit start index: a plain [-top_n_terms:] slice would return every term when top_n_terms is 0
        first = max(len(topic) - top_n_terms, 0)
        print([str(terms[index]) for index in topic.argsort()[first:]])
        print("\n")

    topic_results = nmf_model.transform(vectorized)

    return topic_results.argmax(axis=1)

In [None]:
#user provides number of component and top n terms in each cluster/topic
column = df1["title_clean"]

#NMF
df1["topic"] = nmf(column,n_components=5,top_n_terms=10,fe_type="bagofwords",ngram_range=None)
df1

In [None]:
#user provides number of component and top n terms in each cluster/topic
# #feature extraction
column = df1[["title_clean","desc_clean"]]

#NMF
df1["topic"] = nmf(column,n_components=5,top_n_terms=10,fe_type="bagofwords",ngram_range=None)
df1

### Supervised Learning

In [None]:
#VICE dataset
import pandas as pd
path = "C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/VICE/python_ir/"
df= pd.read_csv(path+"sip_sighting_usb_duplicate_ai.csv")
df.head()

In [None]:
df["problem_area"].value_counts()

In [None]:
df = df_manipulation(df,how="any",keep="first",cols_tokeep=["title","description","problem_area"],cols_todrop=None,impute_value=None,subset=None)
df

In [None]:
# data preprocessing
# Title first, then description; each helper's output is stored in a new column of df.
# NOTE(review): the helpers are defined elsewhere in the notebook; this cell only chains them.
df["title_cont"] = [word_contractions(text) for text in df["title"]]
df["title_lower"] = [lowercase(text) for text in df["title_cont"]]
df["title_tag"] = [remove_htmltag_url(text) for text in df["title_lower"]]
df["title_rem"] = [remove_irrchar_punc(text,char=None) for text in df["title_tag"]]
df["title_num"] = [remove_num(text) for text in df["title_rem"]]
df["title_white"] = [remove_multwhitespace(text) for text in df["title_num"]]
df["title_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df["title_white"]]
# n is passed to both remove_freqwords and remove_rarewords
n=10
df["title_freq"] = remove_freqwords(df["title_stop"],n)
df["title_rare"] = remove_rarewords(df["title_freq"],n)
df["title_lemma"] = lemmatize_words(column= df["title_rare"],lemma_type=None)
df["title_clean"] = df["title_lemma"]

# Same sequence of helpers for the description.
df["desc_cont"] = [word_contractions(text) for text in df["description"]]
df["desc_lower"] = [lowercase(text) for text in df["desc_cont"]]
df["desc_tag"] = [remove_htmltag_url(text) for text in df["desc_lower"]]
df["desc_rem"] = [remove_irrchar_punc(text,char=None) for text in df["desc_tag"]]
df["desc_num"] = [remove_num(text) for text in df["desc_rem"]]
df["desc_white"] = [remove_multwhitespace(text) for text in df["desc_num"]]
df["desc_stop"]=  [remove_stopwords(text,extra_sw=None,remove_sw=None) for text in df["desc_white"]]
n=10
df["desc_freq"] = remove_freqwords(df["desc_stop"],n)
df["desc_rare"] = remove_rarewords(df["desc_freq"],n)
df["desc_lemma"] = lemmatize_words(column= df["desc_rare"],lemma_type=None)
df["desc_clean"] = df["desc_lemma"]

df.head()

In [None]:
df = df[["title_clean","desc_clean","problem_area"]]
df

In [None]:
json_path ='C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/ML_Testing/'
user_outpath = json_path + 'user_outpath/'

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import metrics
import joblib
import numpy as np 


def supervised_lng(df,user_outpath,target,test_size,ngram_range=None,fe_type=None,model_type=None,ascend=None,save_path=None):

    """
    Train and evaluate a supervised text classifier: RandomForest (default), Naive Bayes (optional) or SVM (optional).

    The text columns are split into train/test sets, vectorised with feature_extraction (the vectorizer is
    fitted on the training split and only applied to the test split), the chosen model is fitted and saved,
    and three evaluation reports are written as CSV files into user_outpath:
    Overall_Accuracy.csv, Classification_Report.csv and Confusion_Matrix.csv.

    params:
    df[DataFrame]: the target column plus one or more text columns. Every column other than `target` is used
                   as text input; when there is more than one, they are joined into one string per row.
    user_outpath[string]: folder that receives the CSV reports (created if it does not exist)
    target[string]: name of the label column in df
    test_size[float/int]: If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.
                          If int, represents the absolute number of test samples.
    ngram_range [None/tuple(min_n, max_n)]: n-gram range, passed through unchanged to feature_extraction
    fe_type[None/string]: feature extraction type, passed through unchanged to feature_extraction
    model_type[None/string]: Choose ML algorithm
                            - None (Default algorithm is Random Forest)
                            - 'NB'(To choose Naive Bayes as ML algorithm),
                            - 'SVM'(To choose Support Vector Machine as ML algorithm)
    ascend[True/False/None]:  - None (Default: Confusion matrix is arranged in alphabetical order)
                              - True(Confusion matrix arranged in ascending order of accuracy % per label),
                              - False(Confusion matrix arranged in descending order of accuracy % per label)
    save_path[None/string]: Folder in which the fitted model is saved as model.joblib
                            - None (Default - the model is saved in user_outpath)
                            - String (the model is saved in this folder)

    raises:
    ValueError: model_type is not None, 'NB' or 'SVM'
    """
    import os  # local import so the function does not rely on another cell having imported os

    # Pick the estimator first so that an unsupported model_type fails immediately
    # instead of leaving `model` unbound after the split and vectorisation have run.
    if model_type is None:
        #random forest is chosen by default
        model = RandomForestClassifier(random_state = 42)
    elif model_type == "NB":
        model = MultinomialNB()
    elif model_type == "SVM":
        model = svm.SVC(random_state = 42)
    else:
        raise ValueError("model_type must be None, 'NB' or 'SVM', got " + repr(model_type))

    # The model used to be written to whatever the notebook-level variable `path` pointed at;
    # it now goes to an explicit folder.
    if save_path is None:
        save_path = user_outpath
    for folder in (user_outpath, save_path):
        if folder:
            os.makedirs(folder, exist_ok=True)

    def _join_text_columns(frame):
        # concat the columns into one string per row if there is more than one column
        if isinstance(frame, pd.DataFrame):
            return frame.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
        return frame

    X = df.drop([target],axis=1)
    y = df[target]

    #TRAIN-TEST SPLIT
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = 42)
    # Report the split from the actual sizes so the message is also right when test_size is an int.
    n_train, n_test = len(X_train), len(X_test)
    n_total = n_train + n_test
    print("Train-test split completed with",round(n_train/n_total*100,2),"-",round(n_test/n_total*100,2),"split in train-test")
    print("Shape of X_train is:", X_train.shape)
    print("Shape of X_test is:",X_test.shape)
    print("Shape of y_train is:",y_train.shape)
    print("Shape of y_test is:",y_test.shape)

    X_train = _join_text_columns(X_train)
    X_test = _join_text_columns(X_test)

    #FEATURE EXTRACTION
    # Fit once on the training text; element [1] is the fitted vectorizer and [2] the vectorised training data.
    fe_result = feature_extraction(X_train,ngram_range,None,fe_type)
    vec_type = fe_result[1]
    X_train = fe_result[2]
    #only transform X_test
    X_test = vec_type.transform(X_test)

    print("Shape of X_train after feature extraction:",X_train.shape)
    print("Shape of X_test after feature extraction:",X_test.shape)

    #MODEL BUILDING
    model.fit(X_train, y_train)

    #MODEL SAVING
    joblib.dump(model, os.path.join(save_path, "model.joblib"))
    print("Model saved!")

    # predicting test set results
    y_pred = model.predict(X_test)

    # MODEL EVALUATION

    #overall accuracy
    overall_acc = round(metrics.accuracy_score(y_test, y_pred)*100,2)
    overall_acc = pd.DataFrame([{'Overall Acc %':overall_acc}])
    overall_acc.to_csv(os.path.join(user_outpath, "Overall_Accuracy.csv"))

    #classification report
    report = metrics.classification_report(y_test, y_pred,zero_division=0,output_dict=True)
    report = pd.DataFrame(report).transpose()
    report.to_csv(os.path.join(user_outpath, "Classification_Report.csv"))

    #confusion matrix with accuracies for each label
    labels = y_test.sort_values(ascending= True).unique()
    y_true = y_test.to_numpy()
    # share of the test rows of each label that were predicted as that label
    class_accuracies = [round(np.mean(y_pred[y_true == label] == label)*100,2) for label in labels]
    class_acc = pd.DataFrame(class_accuracies,index=labels,columns= ["Accuracy %"])

    cf_matrix = pd.DataFrame(
        metrics.confusion_matrix(y_test, y_pred, labels= labels),
        index=labels,
        columns=labels
    )
    cf_matrix = pd.concat([cf_matrix,class_acc],axis=1)
    if ascend is not None:
        cf_matrix = cf_matrix.sort_values(by=['Accuracy %'], ascending=ascend)

    # Keep the row labels: without them the rows cannot be identified once they are sorted by accuracy.
    cf_matrix.to_csv(os.path.join(user_outpath, "Confusion_Matrix.csv"),index_label="actual_label")

In [None]:
# Title-only variant of the modelling frame: everything in df except the cleaned description.
df1 = df.drop(columns=["desc_clean"])
df1

In [None]:
# Default classifier on df1 (the frame without desc_clean), bag-of-words features.
target = "problem_area"
test_size = 0.3
ngram_range = None
fe_type = "bagofwords"
model_type = None
ascend = None  # must be assigned here: it is passed below and no earlier cell defines it
supervised_lng(df1,user_outpath,target,test_size,ngram_range,fe_type,model_type,ascend)


In [None]:
# Same run on the full frame `df`.
# `target` is not assigned in this cell; it keeps the value an earlier cell left in the kernel.
save_path = "C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/VICE/python_ir/"
fe_type = "bagofwords"
test_size = 0.3
ngram_range = None
model_type = None
ascend = None
supervised_lng(
    df,
    user_outpath,
    target,
    test_size,
    ngram_range=ngram_range,
    fe_type=fe_type,
    model_type=model_type,
    ascend=ascend,
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
import joblib
import numpy as np 

def deep_lng(df,user_outpath,target,test_size,ngram_range,fe_type,hidden_layer_sizes=None,activation=None,solver=None,learning_rate=None,max_iter=None,ascend=None,save_path=None):
    """
    Deep learning method: train and evaluate a MultiLayer Perceptron text classifier.

    The text columns are split into train/test sets, vectorised with feature_extraction (the vectorizer is
    fitted on the training split and only applied to the test split), the MLP is fitted and saved,
    and three evaluation reports are written as CSV files into user_outpath:
    Overall_Accuracy.csv, Classification_Report.csv and Confusion_Matrix.csv.

    params:
    df[DataFrame]: the target column plus one or more text columns. Every column other than `target` is used
                   as text input; when there is more than one, they are joined into one string per row.
    user_outpath[string]: folder that receives the CSV reports (created if it does not exist)
    target[string]: name of the label column in df
    test_size[float/int]: If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.
                          If int, represents the absolute number of test samples.
    ngram_range [None/tuple(min_n, max_n)]: n-gram range, passed through unchanged to feature_extraction
    fe_type[None/string]: feature extraction type, passed through unchanged to feature_extraction
    hidden_layer_sizes[tuple],default = (100,): To set the number of layers and the number of nodes.
                                               Each element in the tuple represents the number of nodes,
                                               length of tuple denotes the total number of hidden layers in the network
    activation["identity", "logistic", "tanh","relu"], default="relu": Activation function for the hidden layer.
    solver["lbfgs", "sgd", "adam"], default="adam": The solver for weight optimization.
    learning_rate["constant", "invscaling", "adaptive"], default="constant": Learning rate schedule for weight updates
    max_iter[int], default=200: Maximum number of iterations. The solver iterates until convergence or this number of iterations.
    ascend [True/False/None]: - None (Default: Confusion matrix is arranged in alphabetical order)
                                 - True(Confusion matrix arranged in ascending order of accuracy % per label),
                                 - False(Confusion matrix arranged in descending order of accuracy % per label)
    save_path[None/string]: Folder in which the fitted model is saved as mlpmodel.joblib
                            - None (Default - the model is saved in user_outpath)
                            - String (the model is saved in this folder)
    """
    import os  # local import so the function does not rely on another cell having imported os

    # The model used to be written to whatever the notebook-level variable `path` pointed at;
    # it now goes to an explicit folder.
    if save_path is None:
        save_path = user_outpath
    for folder in (user_outpath, save_path):
        if folder:
            os.makedirs(folder, exist_ok=True)

    def _join_text_columns(frame):
        # concat the columns into one string per row if there is more than one column
        if isinstance(frame, pd.DataFrame):
            return frame.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
        return frame

    X = df.drop([target],axis=1)
    y = df[target]

    #train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = 42)
    # Report the split from the actual sizes so the message is also right when test_size is an int.
    n_train, n_test = len(X_train), len(X_test)
    n_total = n_train + n_test
    print("Train-test split completed with",round(n_train/n_total*100,2),"-",round(n_test/n_total*100,2),"split in train-test")
    print("Shape of X_train is:", X_train.shape)
    print("Shape of X_test is:",X_test.shape)
    print("Shape of y_train is:",y_train.shape)
    print("Shape of y_test is:",y_test.shape)

    X_train = _join_text_columns(X_train)
    X_test = _join_text_columns(X_test)

    #FEATURE EXTRACTION
    # Fit once on the training text; element [1] is the fitted vectorizer and [2] the vectorised training data.
    fe_result = feature_extraction(X_train,ngram_range,None,fe_type)
    vec_type = fe_result[1]
    X_train = fe_result[2]
    #only transform X_test
    X_test = vec_type.transform(X_test)
    print("Shape of X_train after feature extraction:",X_train.shape)
    print("Shape of X_test after feature extraction:",X_test.shape)

    #MODEL BUILDING
    #default hyperparameters
    if hidden_layer_sizes is None:
        hidden_layer_sizes = (100,)  # a real one-element tuple; "(100)" is just the int 100
    if activation is None:
        activation = "relu"
    if solver is None:
        solver = "adam"
    if learning_rate is None:
        learning_rate = "constant"
    if max_iter is None:
        max_iter = 200

    print("Hidden layer sizes: ", hidden_layer_sizes,", Activation: ",activation,", Solver: ",solver,", Learning rate: ",learning_rate,", Max iteration: ",max_iter)

    # learning_rate is now forwarded; previously it was printed but never reached the estimator
    model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, learning_rate=learning_rate, max_iter=max_iter,verbose = False,random_state=42)
    model.fit(X_train,y_train)

    #MODEL SAVING
    joblib.dump(model, os.path.join(save_path, "mlpmodel.joblib"))
    print("Model saved!")

    # predicting test set results
    y_pred = model.predict(X_test)

    # MODEL EVALUATION

    #overall accuracy
    overall_acc = round(metrics.accuracy_score(y_test, y_pred)*100,2)
    overall_acc = pd.DataFrame([{'Overall Acc %':overall_acc}])
    overall_acc.to_csv(os.path.join(user_outpath, "Overall_Accuracy.csv"))

    #classification report
    report = metrics.classification_report(y_test, y_pred,zero_division=0,output_dict=True)
    report = pd.DataFrame(report).transpose()
    report.to_csv(os.path.join(user_outpath, "Classification_Report.csv"))

    #confusion matrix with accuracies for each label
    labels = y_test.sort_values(ascending= True).unique()
    y_true = y_test.to_numpy()
    # share of the test rows of each label that were predicted as that label
    class_accuracies = [round(np.mean(y_pred[y_true == label] == label)*100,2) for label in labels]
    class_acc = pd.DataFrame(class_accuracies,index=labels,columns= ["Accuracy %"])

    cf_matrix = pd.DataFrame(
        metrics.confusion_matrix(y_test, y_pred, labels= labels),
        index=labels,
        columns=labels
    )
    cf_matrix = pd.concat([cf_matrix,class_acc],axis=1)
    if ascend is not None:
        cf_matrix = cf_matrix.sort_values(by=['Accuracy %'], ascending=ascend)

    # Keep the row labels: without them the rows cannot be identified once they are sorted by accuracy.
    cf_matrix.to_csv(os.path.join(user_outpath, "Confusion_Matrix.csv"),index_label="actual_label")
    

In [None]:
# MLP on the full frame `df`; None leaves a hyperparameter at deep_lng's default.
test_size = 0.3
ngram_range = None
fe_type = None
hidden_layer_sizes = (5,)  # one hidden layer of 5 nodes; "(5)" is the int 5, not a tuple
activation= None
solver=None
learning_rate=None
max_iter= None
ascend= False

deep_lng(df,user_outpath,target,test_size,ngram_range,fe_type,hidden_layer_sizes,activation,solver,learning_rate,max_iter,ascend)


In [None]:
# MLP on df1 (the frame without desc_clean); None leaves a hyperparameter at deep_lng's default.
test_size = 0.3
ngram_range = None
fe_type = None
hidden_layer_sizes = (5,)  # one hidden layer of 5 nodes; "(5)" is the int 5, not a tuple
activation= None
solver=None
learning_rate=None
max_iter= None
ascend= False

deep_lng(df1,user_outpath,target,test_size,ngram_range,fe_type,hidden_layer_sizes,activation,solver,learning_rate,max_iter,ascend)


In [None]:
X = df[["title_clean","desc_clean"]]
y= df["problem_area"]
test_size = 0.3
ngram_range = None
fe_type = None
hidden_layer_sizes = (5,5)
activation= None
solver=None
learning_rate=None
max_iter= None
ascend= False
save_path = "C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/VICE/python_ir/"  # not passed to the call below

# deep_lng expects one frame holding text and label columns, the output folder and the label column name.
# The previous positional call (X, y, test_size, ...) shifted every argument onto the wrong parameter.
deep_lng(
    df=pd.concat([X, y], axis=1),
    user_outpath=user_outpath,
    target=y.name,
    test_size=test_size,
    ngram_range=ngram_range,
    fe_type=fe_type,
    hidden_layer_sizes=hidden_layer_sizes,
    activation=activation,
    solver=solver,
    learning_rate=learning_rate,
    max_iter=max_iter,
    ascend=ascend,
)

### Similarity metrics

In [None]:
# VICE dataset: from this cell on, `path` and `df` refer to the VICE folder and its CSV.
import pandas as pd

path = "C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/VICE/python_ir/"
df = pd.read_csv(filepath_or_buffer=path + "sip_sighting_usb_duplicate_ai.csv")
df.head()

In [None]:
# Reduce the frame to the title/description columns via df_manipulation.
# NOTE(review): how/keep presumably control NA and duplicate dropping - confirm against df_manipulation.
df = df_manipulation(
    df,
    how="any",
    keep="first",
    cols_tokeep=["title","description"],
    cols_todrop=None,
    impute_value=None,
    subset=None,
)
df

In [None]:
# data preprocessing
# Title: apply the cleaning helpers one after another, keeping every intermediate column.
df["title_cont"] = list(map(word_contractions, df["title"]))
df["title_lower"] = list(map(lowercase, df["title_cont"]))
df["title_tag"] = list(map(remove_htmltag_url, df["title_lower"]))
df["title_rem"] = list(map(lambda text: remove_irrchar_punc(text, char=None), df["title_tag"]))
df["title_num"] = list(map(remove_num, df["title_rem"]))
df["title_white"] = list(map(remove_multwhitespace, df["title_num"]))
df["title_stop"] = list(map(lambda text: remove_stopwords(text, extra_sw=None, remove_sw=None), df["title_white"]))
n = 10  # handed to remove_freqwords / remove_rarewords below

df["title_freq"] = remove_freqwords(df["title_stop"], n)
df["title_rare"] = remove_rarewords(df["title_freq"], n)

df["title_lemma"] = lemmatize_words(column=df["title_rare"], lemma_type=None)

df["title_clean"] = df["title_lemma"]

# Description: identical sequence of steps, reading from the "description" column.
df["desc_cont"] = list(map(word_contractions, df["description"]))
df["desc_lower"] = list(map(lowercase, df["desc_cont"]))
df["desc_tag"] = list(map(remove_htmltag_url, df["desc_lower"]))
df["desc_rem"] = list(map(lambda text: remove_irrchar_punc(text, char=None), df["desc_tag"]))
df["desc_num"] = list(map(remove_num, df["desc_rem"]))
df["desc_white"] = list(map(remove_multwhitespace, df["desc_num"]))
df["desc_stop"] = list(map(lambda text: remove_stopwords(text, extra_sw=None, remove_sw=None), df["desc_white"]))
n = 10
df["desc_freq"] = remove_freqwords(df["desc_stop"], n)
df["desc_rare"] = remove_rarewords(df["desc_freq"], n)
df["desc_lemma"] = lemmatize_words(column=df["desc_rare"], lemma_type=None)
df["desc_clean"] = df["desc_lemma"]

df.head()


In [None]:
# Only the two cleaned text columns are needed for the similarity metrics.
df = df.loc[:, ["title_clean","desc_clean"]]
df

### Cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def cosinesimilarity(column,threshold=None,total_rows = None,base_row=None,ngram_range=None,fe_type=None,ascending=None):
    """
    Compute the cosine similarity between rows of texts and display, per base row, the rows whose
    similarity is at or above the threshold. User can
    a) fix number of rows for comparison, each row will be taken as base and compared with the rest
    b) fix one row as base, comparison will be done with all the other rows
    If both total_rows and base_row are given, both reports are produced; if neither is given, nothing is displayed.
    Nothing is returned.

    params:

    column[series/DataFrame]: column(s) of text for row wise similarity comparison
                        - series: only one column is selected (e.g. df["title_clean"])
                        - DataFrame: more than one column is selected(e.g. df[["title_clean","desc_clean"]]),
                          the columns are joined into one string per row
    threshold[None/float]: cut off value for the cosine similarity, only texts with values above or equal to threshold
                           will be displayed
                        - None: Default threshold is 0.5
                        - float: any value between 0 and 1
    total_rows[None/int]: Number of rows for comparison, choose None for option b.
                          Values larger than the number of rows are capped at the number of rows.
    base_row[None/int]: Positional index of the row fixed as base, choose None for option a
    ngram_range [None/tuple(min_n, max_n)]: n-gram range, passed through unchanged to feature_extraction
    fe_type[None/string]: feature extraction type, passed through unchanged to feature_extraction
    ascending [True/False/None]: - [default] None (matches are listed in row order)
                                 - True(matches arranged in ascending order of similarity score),
                                 - False(matches arranged in descending order of similarity score)

    """
    # `display` is only a builtin inside IPython/Jupyter; import it explicitly and fall back to print elsewhere.
    try:
        from IPython.display import display
    except ImportError:
        display = print

    if isinstance(column, pd.DataFrame): #concat the columns into one string if there is more than one column
        column = column.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

    #feature extraction
    X = feature_extraction(column=column,ngram_range=ngram_range,ascending=None,fe_type=fe_type)[0]
    X = X.drop(["sum"],axis = 0) # the "sum" row is not a document, so it must not take part in the comparison

    #Get cosine similarity matrix (square array, one row/column per text)
    similarity_matrix = cosine_similarity(X)

    #threshold
    if threshold is None:
        threshold = 0.5

    def _matches(base, candidates):
        # Collect all matching rows first and build the frame once; DataFrame.append no longer exists in pandas 2.x.
        records = [
            {'Index':i, 'Similarity Score':round(similarity_matrix[base,i],4), 'Text':column.iloc[i]}
            for i in candidates
            if similarity_matrix[base,i] >= threshold
        ]
        results = pd.DataFrame(records, columns = ["Index", "Similarity Score", "Text"])
        if ascending is not None:
            results = results.sort_values(by ='Similarity Score', axis = 0,ascending=ascending)
        return results

    if total_rows is not None: #fix number of rows for comparison, each row will be taken as base and compared with the rest
        n_rows = min(total_rows, len(column)) # never index past the available rows
        for base in range(n_rows):
            print ("")
            print ("Using index " + str(base) + " as base:") #fix one index as base
            display(_matches(base, range(n_rows)))

    if base_row is not None: #fix base_row index for comparison with all indexes
        print ("Using index " + str(base_row) + " as base:") #fix one index as base
        display(_matches(base_row, range(len(column))))

In [None]:
# Option a on the cleaned titles: each of the first 10 rows is used as base, default threshold.
cosinesimilarity(
    column=df["title_clean"],
    threshold=None,
    total_rows=10,
    base_row=None,
    ngram_range=None,
    fe_type=None,
    ascending=None,
)

In [None]:
# Option b on the cleaned titles: row 4 is the base and is compared with every row.
cosinesimilarity(
    column=df["title_clean"],
    threshold=0.5,
    total_rows=None,
    base_row=4,
    ngram_range=None,
    fe_type=None,
    ascending=None,
)

In [None]:
# Option a on title and description together: each of the first 10 rows is used as base.
cosinesimilarity(
    column=df[["title_clean","desc_clean"]],
    threshold=None,
    total_rows=10,
    base_row=None,
    ngram_range=None,
    fe_type=None,
    ascending=None,
)

In [None]:
# Option b on title and description together: row 4 is the base; threshold 0 keeps every row with a score >= 0.
cosinesimilarity(
    column=df[["title_clean","desc_clean"]],
    threshold=0,
    total_rows=None,
    base_row=4,
    ngram_range=None,
    fe_type=None,
    ascending=None,
)

### Jaccard similarity

In [None]:
def jaccard_similarity(column,threshold=None,total_rows = None,base_row=None,ascending=None):
    """
    Compute the jaccard similarity between texts (shared whitespace-separated tokens divided by all distinct
    tokens of the two texts) and display, per base row, the rows whose score is at or above the threshold. User can
    a) fix number of rows for comparison, each row will be taken as base and compared with the rest
    b) fix one row as base, comparison will be done with all the other rows
    If both total_rows and base_row are given, both reports are produced; if neither is given, nothing is displayed.
    Nothing is returned.

    params:
    column[series/DataFrame]: column(s) of text for row wise similarity comparison
                        - series: only one column is selected (e.g. df["title_clean"])
                        - DataFrame: more than one column is selected(e.g. df[["title_clean","desc_clean"]]),
                          the columns are joined into one string per row
    threshold[None/float]: cut off value for the jaccard similarity, only texts with values above or equal to threshold
                           will be displayed (the same rule applies to option a and option b)
                        - None: Default threshold is 0.5
                        - float: any value between 0 and 1
    total_rows[None/int]: Number of rows for comparison, choose None for option b.
                          Values larger than the number of rows are capped at the number of rows.
    base_row[None/int]: Positional index of the row fixed as base, choose None for option a
    ascending [True/False/None]: - [default] None (matches are listed in row order)
                                 - True(matches arranged in ascending order of similarity score),
                                 - False(matches arranged in descending order of similarity score)

    """
    # `display` is only a builtin inside IPython/Jupyter; import it explicitly and fall back to print elsewhere.
    try:
        from IPython.display import display
    except ImportError:
        display = print

    #jaccard score computation
    def get_jaccard_sim(str1, str2):
        a = set(str1.split())
        b = set(str2.split())
        union_size = len(a | b)
        if union_size == 0:
            # both texts are empty (e.g. everything was removed during cleaning): score them 0 instead of dividing by zero
            return 0.0
        return len(a & b) / union_size

    if isinstance(column, pd.DataFrame): #concat the columns into one string if there is more than one column
        column = column.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

    #threshold
    if threshold is None:
        threshold = 0.5

    def _matches(base, candidates):
        # Collect all matching rows first and build the frame once; DataFrame.append no longer exists in pandas 2.x.
        base_text = column.iloc[base]
        records = []
        for i in candidates:
            jac_score = round(get_jaccard_sim(base_text,column.iloc[i]),4)
            if jac_score >= threshold:
                records.append({'Index':i, 'Similarity Score':jac_score, 'Text':column.iloc[i]})
        results = pd.DataFrame(records, columns = ["Index", "Similarity Score", "Text"])
        if ascending is not None:
            results = results.sort_values(by ='Similarity Score', axis = 0,ascending=ascending)
        return results

    if total_rows is not None: #fix number of rows for comparison, each row will be taken as base and compared with the rest
        n_rows = min(total_rows, len(column)) # never index past the available rows
        for base in range(n_rows):
            print ("")
            print ("Using index " + str(base) + " as base:") #fix one index as base
            display(_matches(base, range(n_rows)))

    if base_row is not None: #fix base_row index for comparison with all indexes
        print ("Using index " + str(base_row) + " as base row:") #fix one index as base_row
        display(_matches(base_row, range(len(column))))

In [None]:
# Option a on the cleaned titles: each of the first 10 rows is used as base, matches sorted descending.
jaccard_similarity(
    column=df["title_clean"],
    threshold=0.5,
    total_rows=10,
    base_row=None,
    ascending=False,
)

In [None]:
# Option b on the cleaned titles: row 4 is the base and is compared with every row.
jaccard_similarity(
    column=df["title_clean"],
    threshold=0.5,
    total_rows=None,
    base_row=4,
    ascending=None,
)

In [None]:
# Option a on title and description together: first 10 rows, default threshold, matches sorted descending.
jaccard_similarity(
    column=df[["title_clean","desc_clean"]],
    threshold=None,
    total_rows=10,
    base_row=None,
    ascending=False,
)

In [None]:
# Option b on title and description together: row 4 is the base, default threshold, matches sorted descending.
jaccard_similarity(
    column=df[["title_clean","desc_clean"]],
    threshold=None,
    total_rows=None,
    base_row=4,
    ascending=False,
)

In [None]:
# #user provides number of component and top n terms in each cluster/topic
# #feature extraction
# column = df3["title_lemma_word"]
# ngram_range = (1,1)
# ascending = False
# fe_type = "bagofwords"
# vec_type = feature_extraction(column,ngram_range,ascending,fe_type)[1]
# vectorized = feature_extraction(column,ngram_range,ascending,fe_type)[2]

# #NMF
# df3["topic"] = nmf(vectorized,vec_type,n_components=17,top_n_terms=10)
# df3

In [None]:
#convert and save train/validation data as .spacy
# out_path = "C:/Users/nchong/"
# db_train = convert_spacy(TRAIN_DATA)
# db_train.to_disk(out_path +'train.spacy') # save the docbin object
# db_val = convert_spacy(VAL_DATA)
# db_val.to_disk(out_path +'val.spacy') # save the docbin object

In [None]:
# !python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
# !python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./val.spacy

In [None]:
#load best model
# nlp1 = spacy.load("C:/Users/nchong/output/model-best/") #load the best model

In [None]:
# doc = nlp1("waikitcx hi arisha please provide us the") # input sample text

# spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [None]:
# def show_ents(text):
#     doc= nlp1(text)
#     if doc.ents:
#         for ent in doc.ents:
#             return(ent.text+' - '+ent.label_)
#     else:
#         return('No named entities found.')

In [None]:
# def data_loading(path,df=None,date=None):
#     '''
#     Load only files that follow agreed filename format, merge files as single dataframe.
#     Can support incremental aggregation of dataset, by setting arg df as the existing dataframe
#     Returns a single dataframe.
    
#     params:
#     path [string]: path of the files, without filename
#     df [dataframe] (optional,default is None): input existing dataframe to merge with new files
#     date ["string"](optional,default is None): user can choose to load only files from specific date in YYYY-MM-DD format
#     '''
#     filenames = os.listdir(path)
#     file_list=[]
#     dfs = []

#     if df is None: #no existing dataframe
        
#         for file in filenames:
#             # search agreed file format pattern in the filename
#             if date == None:
#                 pattern = r"^\(\d{4}-\d{2}-\d{1,2}\)\d+\_\D+\_\d+\.json$"
                
#             else:
# #              
#                 pattern = r"\("+date+r"\)\d+\_\D+\_\d+\.json"
    
#             match = re.search(pattern,file)
#             #if match is found
#             if match:
#                 pattern = os.path.join(path, file) #join path with file name
#                 file_list.append(pattern) #list of json files that follow the agreed filename

#                 for file in file_list:
#                     with open(file) as f:
#                         #flatten json into pd dataframe
#                         json_data = pd.json_normalize(json.loads(f.read()))
#                         #label which file each row is from 
#                         json_data['file'] = file.rsplit("/", 1)[-1]

#                     dfs.append(json_data)
#                 df = pd.concat(dfs)
                
#     else: #existing dataframe exists and want to append new files to existing dataframe
             
#         for file in filenames:

#             if file not in df["file"].unique(): #check if file is new - to support merging of new dataset with previously read ones

#                 # search agreed file format pattern in the filename
                
#                 if date == None:
#                     pattern = r"^\(\d{4}-\d{2}-\d{1,2}\)\d+\_\D+\_\d+\.json$"

#                 else:
#                     pattern = r"\("+date+r"\)\d+\_\D+\_\d+\.json"
                     
#                 match = re.search(pattern,file)

#                 #if match is found
#                 if match:
#                     json_pattern = os.path.join(path, file) #join path with file name
#                     file_list.append(json_pattern) #list of json files 

#                     for file in file_list:
#                         with open(file) as f:
#                             #flatten json into pd dataframe
#                             json_data = pd.json_normalize(json.loads(f.read()))
#                             #label which file each row is from 
#                             json_data['file'] = file.rsplit("/", 1)[-1]

#                         dfs.append(json_data)
#                     new_df = pd.concat(dfs)           
#                     df=pd.concat([df,new_df])
    
#     return df

In [None]:
# import nltk
# from nltk.tokenize import word_tokenize
# from nltk.tokenize import sent_tokenize
# from nltk.tokenize import WhitespaceTokenizer
# from nltk.tokenize import WordPunctTokenizer
# import re
# #remove token method - separate nltk and split functions 
# def cust_tokenization(column,token_met,token_type,delim =None):
#     """
#     Custom tokenization, 2 options are available: split() or nltk 
#     params:
#     df [dataframe]: input dataframe 
#     token_met["string"]: input tokenization method ("split" or "nltk")
    
#     token_type["string"](use only if token_met= "nltk"): type of nltk tokenization
#     a) token_type = "WordToken" tokenizes a string into a list of words
#     b) token_type = "SentToken" tokenizes a string containing sentences into a list of sentences
#     c) token_type = "WhiteSpaceToken" tokenizes a string on whitespace (space, tab, newline)
#     d) token_type = "WordPunctTokenizer" tokenizes a string on punctuations
         
#     delim["string"](use only if token_met = "split"): specify delimiter to separate strings,
#     default delimiter (delim=None) is whitespace,  an alternate option for token_type = "WhiteSpaceToken"
    
#     """
#     if token_met == "split":
#         if delim==None:
#             print("Text is split by space") #default delimiter is space if not specified 

#         else:
#             print("Text is split by:", delim) #can accept one or more delimiter

#         return column.apply(lambda text: text.split() if delim==None else text.split(delim))
    

#     if token_met == "nltk":
    
#         if token_type == "WordToken":
#             tokenizer = word_tokenize
#         if token_type == "SentToken":
#             tokenizer = sent_tokenize
#         if token_type == "WhiteSpaceToken":
#             tokenizer = WhitespaceTokenizer().tokenize
#         if token_type == "WordPunctTokenizer":
#             tokenizer = WordPunctTokenizer().tokenize

#         return column.apply(lambda text: tokenizer(text))
        
        
    

In [None]:
# from datetime import datetime,timedelta
# def data_loading(path,date_list=None):
#     '''
#     Load only files that follow agreed filename format, merge files as single dataframe.
#     User can choose to load only files from specific date
    
#     params:
#     path [string]: path of the files, without filename
#     date_list ["list"](optional,default is None): user can choose to load only files from specific dates, each given in YYYY-MM-DD format
#     '''
    
#     filenames = os.listdir(path)
#     file_list=[]
#     df = pd.DataFrame()
    
#     if date_list == None:
#         for file in filenames:
#             # search agreed file format pattern in the filename

#             pattern = r"^\(\d{4}-\d{2}-\d{1,2}\)\d+\_\D+\_\d+\.json$"

#             match = re.search(pattern,file)
                
#             #if match is found
#             if match:
#                 pattern = os.path.join(path, file) #join path with file name
#                 file_list.append(pattern) #list of json files that follow the agreed filename
            
#         print("Files read:",file_list)                   
#         for file in file_list:
#             with open(file) as f:
#                 #flatten json into pd dataframe
#                 json_data = pd.json_normalize(json.loads(f.read()))
#                 json_data = pd.DataFrame(json_data)
#                 #label which file each row is from 
#                 json_data['file'] = file.rsplit("/", 1)[-1]

#             df = df.append(json_data)              
                
#     else:
#         for file in filenames: 
            
#             # search agreed file format pattern in the filename
#             for date in date_list: 
#                 pattern = r"\("+date+r"\)\d+\_\D+\_\d+\.json"
        
#                 match = re.search(pattern,file)
                
#                 #if match is found
#                 if match:
#                     pattern = os.path.join(path, file) #join path with file name
#                     file_list.append(pattern) #list of json files that follow the agreed filename

#         print("Files read:",file_list)     
#         for file in file_list:
#             with open(file) as f:
#                 #flatten json into pd dataframe
#                 json_data = pd.json_normalize(json.loads(f.read()))
#                 json_data = pd.DataFrame(json_data)
#                 #label which file each row is from 
#                 json_data['file'] = file.rsplit("/", 1)[-1]

#             df = df.append(json_data)

#     return df

In [None]:
# #fix number of rows for comparison
# total_rows = 10 #total rows to consider for comparison
# threshold = 0.1 #similarity metric threshold
# column = df[["title_clean"]]

# for base in range(total_rows): 
#     print ("")
#     print ("Using index " + str(base) + " as base:") #fix one index as base
#     print(f"{'Index' : <10}{'Similarity Score' : <20}{'Title' : <500}")

#     for i in range(total_rows): #compare base with other index
#         jac_score =  round(get_jaccard_sim(column.iloc[base].values[0],column.iloc[i].values[0]),4)
#         if jac_score > threshold: #print if comparison shows that similarity metric is more than threshold
#             print(f"{i : <10}{jac_score : <20}{column.iloc[i].values[0] : <500}")

In [None]:
# #fix base_row index for comparison with all indexes
# base_row=4
# threshold = 0
# column = df[["title_clean"]]

# print ("Using index " + str(base_row) + " as base row:") #fix one index as base_row
# print(f"{'Index' : <10}{'Similarity Score' : <20}{'Title' : <500}")

# for i in range(len(column)): #compare base_row with other index
#     jac_score = round(get_jaccard_sim(column.iloc[base_row].values[0],column.iloc[i].values[0]),4)
#     if jac_score >= threshold: #print if comparison shows that similarity metric is at least the threshold
#         print(f"{i : <10}{jac_score : <20}{column.iloc[i].values[0] : <500}")

In [None]:
# #feature extraction
# X = feature_extraction(column = df["title_clean"],ngram_range=(1,1),ascending=None,fe_type="tfidf")[0]
# X = X.drop(["sum"],axis = 0)
# X
#Cosine similarity
# from sklearn.metrics.pairwise import cosine_similarity
# similarity_matrix = pd.DataFrame(cosine_similarity(X))
# similarity_matrix
#user specifies the total number of rows to compare
# total_rows = 10 #total rows to consider for comparison
# threshold = 0.2 #similarity metric threshold
# column = df[["title_clean"]]

# for base in range(total_rows): 
#     print ("")
#     print ("Using index " + str(base) + " as base:") #fix one index as base
#     print(f"{'Index' : <10}{'Similarity Score' : <20}{'Title' : <500}")
#     for i in range(total_rows): #compare base with other index
#         if similarity_matrix.iloc[base,i] >= threshold: #print if comparison shows that similarity metric is at least the threshold
#             print(f"{i : <10}{round(similarity_matrix.iloc[base,i],4) : <20}{column.iloc[i].values[0] : <500}")
#user specifies the base row to compare against
# base_row = 4 #base for comparison
# threshold = 0.2 #similarity metric threshold
# column = df[["title_clean"]]

# print ("Using index " + str(base_row) + " as base:") #fix one index as base
# print(f"{'Index' : <10}{'Similarity Score' : <20}{'Title' : <500}")
# for i in range(len(column)): #compare base_row with other index
#     if similarity_matrix.iloc[base_row,i] >= threshold: #print if comparison shows that similarity metric is at least the threshold
#         print(f"{i : <10}{round(similarity_matrix.iloc[base_row,i],4) : <20}{column.iloc[i].values[0] : <500}")


In [None]:
# from sklearn.metrics.pairwise import cosine_similarity

# def cosinesimilarity(column,threshold,total_rows = None,base_row=None,ngram_range=None,fe_type=None):
    
#     #feature extraction
#     if ngram_range == None:
#         ngram_range = (1,1)
#     if fe_type == None:
#         fe_type ="tfidf"
    
       
#     X = feature_extraction(column=column,ngram_range=ngram_range,ascending=None,fe_type=fe_type)[0]
#     X = X.drop(["sum"],axis = 0)
    
#     #Get cosine similarity matrix
#     similarity_matrix = pd.DataFrame(cosine_similarity(X))
    
#     if total_rows !=None:
#         for base in range(total_rows): 
#             print ("")
#             print ("Using index " + str(base) + " as base:") #fix one index as base
#             print(f"{'Index' : <10}{'Similarity Score' : <20}{'Text' : <500}")
#             for i in range(total_rows): #compare base with other index
#                 if similarity_matrix.iloc[base,i] >= threshold: #print if comparison shows that similarity metric is at least the threshold
#                     print(f"{i : <10}{round(similarity_matrix.iloc[base,i],4) : <20}{column.iloc[i] : <500}")
    
#     if base_row !=None:
#         print ("Using index " + str(base_row) + " as base:") #fix one index as base
#         print(f"{'Index' : <10}{'Similarity Score' : <20}{'Text' : <500}")
#         for i in range(len(column)): #compare base_row with other index
#             if similarity_matrix.iloc[base_row,i] >= threshold: #print if comparison shows that similarity metric is at least the threshold
#                 print(f"{i : <10}{round(similarity_matrix.iloc[base_row,i],4) : <20}{column.iloc[i] : <500}")
                

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer

# def feature_extraction(column,ngram_range,ascending,fe_type):
#     """
#     Feature extraction methods - Bag of words or TF-IDF
    
#     params:
#     column [series]: column to select
#     ngram_range [tuple(min_n, max_n)]: The lower and upper boundary of the range of n-values for different n-grams to be extracted
#                                        - [default] ngram_range of (1, 1) means only unigrams, 
#                                        - ngram_range of (1, 2) means unigrams and bigrams, 
#                                        - ngram_range of (2, 2) means only bigrams
#     ascending [True/False/None]: - None (words arranged in alphabetical order)
#                                  - True(words arranged in ascending order of sum), 
#                                  - False(words arranged in descending order of sum)                               
#     fe_type[string]: Feature extraction type: Choose "bagofwords" or "tfidf" method
#     """
#     if ngram_range == None:
#         ngram_range=(1,1)
    
#     if fe_type == "bagofwords":
#         vec_type = CountVectorizer(ngram_range=ngram_range, analyzer='word')
#         vectorized = vec_type.fit_transform(column)
#         df = pd.DataFrame(vectorized.toarray(), columns=vec_type.get_feature_names())
#         df.loc['sum'] = df.sum(axis=0).astype(int)

#     if fe_type == "tfidf":
#         vec_type = TfidfVectorizer(ngram_range=ngram_range, analyzer='word')
#         vectorized = vec_type.fit_transform(column)
#         df = pd.DataFrame(vectorized.toarray(), columns=vec_type.get_feature_names())
#         df.loc['sum'] = df.sum(axis=0)
    
#     if ascending != None:
            
#         df = df.sort_values(by ='sum', axis = 1,ascending=ascending)
    
    
#     return df,vec_type,vectorized

In [None]:
# cols = ['title_clean', 'desc_clean']
# df['combined'] = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
# df

In [None]:
# from configparser import ConfigParser

# # instantiate
# config = ConfigParser()

# # parse ini file
# ini_path = "C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/"
# config.read(ini_path+'default.ini')

# # read values 
# #from data loading section
# path = config.get('dataloading', 'path')

# #from data preprocessing section


# # from ML module section
# #Unsupervised
# #Supervised
# #Similarity metrics
