### Data Loading

In [1]:
#import library
import pandas as pd
import glob, os, json
import re

#user input file path
path = 'C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/team1/'

In [2]:
def data_loading(path,df=None):
    '''
    Load only files that follow agreed filename format, merge files as single dataframe.
    Can support incremental aggregation of dataset, by setting arg df as the existing dataframe
    Returns a single dataframe.
    
    params:
    path [string]: path of the files, without filename
    df [dataframe] (optional,default is None): input existing dataframe to merge with new files
    '''
    filenames = os.listdir(path)
    file_list=[]
    dfs = []

    if df is None: #no existing dataframe
        
        for file in filenames:
            # search agreed file format pattern in the filename
            match = re.search(r"^\(\d{4}-\d{2}-\d{1,2}\)\d+\_\D+\_\d+\.json$",file)

            #if match is found
            if match:
                pattern = os.path.join(path, file) #join path with file name
                file_list.append(pattern) #list of json files that follow the agreed filename

                for file in file_list:
                    with open(file) as f:
                        #flatten json into pd dataframe
                        json_data = pd.json_normalize(json.loads(f.read()))
                        #label which file each row is from 
                        json_data['file'] = file.rsplit("/", 1)[-1]

                    dfs.append(json_data)
                df = pd.concat(dfs)
                
    else: #existing dataframe exists and want to append new files to existing dataframe
             
        for file in filenames:

            if file not in df["file"].unique(): #check if file is new - to support merging of new dataset with previously read ones

                # search agreed file format pattern in the filename
                match = re.search(r"^\(\d{4}-\d{2}-\d{1,2}\)\d+\_\D+\_\d+\.json$",file)

                #if match is found
                if match:
                    json_pattern = os.path.join(path, file) #join path with file name
                    file_list.append(json_pattern) #list of json files 

                    for file in file_list:
                        with open(file) as f:
                            #flatten json into pd dataframe
                            json_data = pd.json_normalize(json.loads(f.read()))
                            #label which file each row is from 
                            json_data['file'] = file.rsplit("/", 1)[-1]

                        dfs.append(json_data)
                    new_df = pd.concat(dfs)           
                    df=pd.concat([df,new_df])
    
    return df

In [3]:
os.listdir(path)

['(2021-08-25)1_firstSet_1.json', '(2021-08-25)3_secondSet_1.json']

In [4]:
df = data_loading(path,df=None)
df.head()

Unnamed: 0,id,title,description,comments,updated_date,hierarchy_id,rev,tenant,subject,is_current,hierarchy_path,parent_id,record_type,row_num,file
0,1308651592,provide method to update GIO fields from git r...,Please provide a way to update GIO fields from...,"++++1562123662 fbakhda\nHi @Panceac, Cornel Eu...",2021-07-21 12:30:31.387,,8,iot_platf,support,1,/1201559436/1208431055/1308651592/,1208431055,parent,1,(2021-08-25)1_firstSet_1.json
1,1308671310,Test suite execution terminates before executi...,<p>Test suite execution finished before execut...,++++1361513318 cmoala\nsys_tsdval@GL-IAF1-V-S0...,2021-05-04 09:30:00.320,,11,iot_platf,support,1,/1201559436/1208431055/1308671310/,1208431055,parent,2,(2021-08-25)1_firstSet_1.json
2,1308673361,Cloning defects from another test cycle is not...,<p>I am trying to clone defects from another t...,++++1361514315 cmoala\nObserved that only impl...,2021-05-20 11:47:18.927,,9,iot_platf,support,1,/1201559436/1208431055/1308673361/,1208431055,parent,3,(2021-08-25)1_firstSet_1.json
3,1507656633,[Testing Only] this is enhancement only,Retest some function again.,,2020-03-13 10:16:18.703,,31,iot_platf,support,1,/1201559436/1208431055/1507656633/,1208431055,parent,4,(2021-08-25)1_firstSet_1.json
4,1507656638,[Testing Only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput\nHSDES testing. Please ...,2020-06-01 09:49:55.913,,19,iot_platf,support,1,/1201559436/1208431055/1507656638/,1208431055,parent,5,(2021-08-25)1_firstSet_1.json


In [None]:
df.to_csv('C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/'+'data_original.csv')

### Data Pre-processing
### a) Text Normalization

### 1) Filter df by the text columns selected by user, fill NA with empty string

In [5]:
# 1) Fill NAN with empty string
def columns_req(text_list,df):
    """
    Filters df to only include the string columns provided by user
    Fills NAN with empty string
    
    params:
    text_list[list]: list of columns names  
    df [dataframe]: input dataframe 
    """
    df= df[text_list] 
    df = df.fillna('')

    return df

In [6]:
# Take only 'title','description','comments'
text_list = ['title','description','comments']
df = columns_req(text_list,df)
df

Unnamed: 0,title,description,comments
0,provide method to update GIO fields from git r...,Please provide a way to update GIO fields from...,"++++1562123662 fbakhda\nHi @Panceac, Cornel Eu..."
1,Test suite execution terminates before executi...,<p>Test suite execution finished before execut...,++++1361513318 cmoala\nsys_tsdval@GL-IAF1-V-S0...
2,Cloning defects from another test cycle is not...,<p>I am trying to clone defects from another t...,++++1361514315 cmoala\nObserved that only impl...
3,[Testing Only] this is enhancement only,Retest some function again.,
4,[Testing Only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput\nHSDES testing. Please ...
...,...,...,...
899,,"<div><span style=""font-size: 12.18px;"">Hello,&...",
900,,"<p>Hi Gio Team,</p><p><br /></p><p>Thank you f...",
901,,<div>The schedule test suite allow for the use...,
902,,"<p>Hi Gio Team,</p><p><br /></p><p>Thank you f...",


In [10]:
df.iloc[10,1]

'<p><span style="font-size: 1em;">Observation</span><br /></p><p>Error while importing excel to GIO TCM</p><p>\n\n\n\n\n\n\n\n</p><div class="WordSection1">\n\n<p class="MsoNormal" style="margin-bottom:0in;margin-bottom:.0001pt;line-height:\nnormal;text-autospace:none"><i><b><span style="font-size: 12px; font-family: &quot;Times New Roman&quot;, serif;">Error:&nbsp;</span></b><span style="font-size: 12px; font-family: &quot;Times New Roman&quot;, serif;">&nbsp;Error while\ninserting data under test case TRTL-F008-M id:26332 An exception occurred while\nexecuting \'UPDATE `official_test_cases` SET `master_ref_id` = ?, `auditt_check`\n= ?, `is_parent` = ?, `only_main` = ?, `updated_on` = ? WHERE `id` = ?\' with\nparams [null, 0, 0, 0, &quot;2020-01-06 10:57:58&quot;, 1776689]:\nSQLSTATE[HY000]: General error: 1366 Incorrect string value:\n\'\\xE2\\x80\\x8Bnon...\' for column \'_description\' at row 3</span></i></p><p class="MsoNormal" style="margin-bottom:0in;margin-bottom:.0001pt;line-h

In [8]:
df.iloc[149,1]

"<p>Facing an issue when trying to clone test result from one test cycle to another. Tried to directly clone all the results but failed as it left few test cases behind. </p><p>Manually selected the test cases to clone, but now I'm left with two test cases failed to be cloned - whether automatically or manually.</p>"

In [None]:
df.to_excel('C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/20210920/'+'data_3cols.xlsx')

### 2) Expand contractions 

In [11]:
import contractions

def word_contractions(df):
    """
    Expand word contractions (i.e. "isn't" to "is not")
    params:
    df [dataframe]: input dataframe 
    """
    df = df.applymap(lambda text: " ".join([contractions.fix(word) for word in text.split()]))
    return df


In [None]:
df = word_contractions(df)

In [14]:
df.iloc[149,1]

'<p>Facing an issue when trying to clone test result from one test cycle to another. Tried to directly clone all the results but failed as it left few test cases behind. </p><p>Manually selected the test cases to clone, but now I am left with two test cases failed to be cloned - whether automatically or manually.</p>'

### 3) Convert all characters into lowercase 

In [15]:
def lowercase(df):
    """
    Convert all characters to lower case
    param:
    df[dataframe]: input dataframe
    """
    df = df.applymap(lambda s:s.lower() if type(s) == str else s)
    return df 

In [16]:
df = lowercase(df)
df

Unnamed: 0,title,description,comments
0,provide method to update gio fields from git r...,please provide a way to update gio fields from...,"++++1562123662 fbakhda hi @panceac, cornel eug..."
1,test suite execution terminates before executi...,<p>test suite execution finished before execut...,++++1361513318 cmoala sys_tsdval@gl-iaf1-v-s04...
2,cloning defects from another test cycle is not...,<p>i am trying to clone defects from another t...,++++1361514315 cmoala observed that only imple...
3,[testing only] this is enhancement only,retest some function again.,
4,[testing only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput hsdes testing. please i...
...,...,...,...
899,,"<div><span style=""font-size: 12.18px;"">hello,&...",
900,,"<p>hi gio team,</p><p><br /></p><p>thank you f...",
901,,<div>the schedule test suite allow for the use...,
902,,"<p>hi gio team,</p><p><br /></p><p>thank you f...",


In [17]:
df.iloc[149,1]

'<p>facing an issue when trying to clone test result from one test cycle to another. tried to directly clone all the results but failed as it left few test cases behind. </p><p>manually selected the test cases to clone, but now i am left with two test cases failed to be cloned - whether automatically or manually.</p>'

In [18]:
df.iloc[10,1]

'<p><span style="font-size: 1em;">observation</span><br /></p><p>error while importing excel to gio tcm</p><p> </p><div class="wordsection1"> <p class="msonormal" style="margin-bottom:0in;margin-bottom:.0001pt;line-height: normal;text-autospace:none"><i><b><span style="font-size: 12px; font-family: &quot;times new roman&quot;, serif;">error:&nbsp;</span></b><span style="font-size: 12px; font-family: &quot;times new roman&quot;, serif;">&nbsp;error while inserting data under test case trtl-f008-m id:26332 an exception occurred while executing \'update `official_test_cases` set `master_ref_id` = ?, `auditt_check` = ?, `is_parent` = ?, `only_main` = ?, `updated_on` = ? where `id` = ?\' with params [null, 0, 0, 0, &quot;2020-01-06 10:57:58&quot;, 1776689]: sqlstate[hy000]: general error: 1366 incorrect string value: \'\\xe2\\x80\\x8bnon...\' for column \'_description\' at row 3</span></i></p><p class="msonormal" style="margin-bottom:0in;margin-bottom:.0001pt;line-height: normal;text-autosp

### 4) Stemming/Lemmatization

### Stemming

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

def stem_words(df,stemmer_type):
    """
    Stemming words, 2 options available: Porter Stemmer or Lancaster Stemmer 
    params:
    df [dataframe]: input dataframe 
    stemmer_type[string]: input stemming method ("Porter" or "Lancaster")
    """
    if stemmer_type == "Porter":
        stemmer = PorterStemmer()
    if stemmer_type == "Lancaster":
        stemmer=LancasterStemmer()
    df = df.applymap(lambda text: " ".join([stemmer.stem(word) for word in text.split()]))
    return df

In [None]:
df = stem_words(df,stemmer_type = "Lancaster")
df.iloc[10,1]

### Lemmatization

In [19]:
import spacy
import nltk
from nltk.stem import WordNetLemmatizer

def lemmatize_words(df,lemma_type):
    """
    Lemmatize words, 2 options available: WordNetLemmatizer or Spacy 
    params:
    df [dataframe]: input dataframe 
    lemma_type[string]: input lemmatization method ("WordNet" or "Spacy")
    """
    if lemma_type == "WordNet":
        lemmatizer = WordNetLemmatizer()
        df = df.applymap(lambda text: " ".join([lemmatizer.lemmatize(word) for word in text.split()]))
    if lemma_type == "Spacy":
        nlp = spacy.load("en_core_web_sm")
        df = df.applymap(lambda text: " ".join([word.lemma_ for word in nlp(text)]))
        #convert to lower case as spacy will convert pronouns to upper case
        df = df.applymap(lambda s:s.lower() if type(s) == str else s) 
    return df


In [51]:
df = lemmatize_words(df,lemma_type = "Spacy")
df

Unnamed: 0,title,description,comments
0,provide method update fields git repo automati...,provide way update field git repo file mean wh...,fbakhda panceac cornel eugen little background...
1,suite execution terminate execute test,suite execution finish execute test case error...,cmoala sys tsdval gl iaf v usr local log gvlog...
2,cloning defect another working,try clone defect another get message cloning s...,cmoala observe implement defect clone waikitc ...
3,testing enhancement,ret function,
4,testing consultation,enter support need,prajput hsde test ignore change siewlita hsde ...
...,...,...,...
899,,hello import time global domain time kpis ehl ...,
900,,team thank provide kpi metric feature store tr...,
901,,schedule suite allow user clone suit recipe al...,
902,,team thank provide kpi feature plot kpi metric...,


In [52]:
df.iloc[10,1]

'observation error import excel tcm error error insert data trtl f i d exception occur execute update official case set master ref i d auditt check parent main update i d param null sqlstate hy general error incorrect string value non column description row tcm manageability sota fota excel file attach'

In [53]:
df.iloc[149,1]

'face try clone result one another try directly clone result fail leave case behind manually select case clone leave two case fail clone whether automatically manually'

In [None]:
df = lemmatize_words(df,lemma_type = "WordNet")
df.iloc[10,1]

### b) Noise filtering



### 1) Remove html tag and url

In [20]:
from bs4 import BeautifulSoup
def remove_htmltag_url(df):
    """
    Remove html tag and url
    params:
    df [dataframe]: input dataframe 
    
    """
    #remove html tag
    df = df.applymap(lambda text:BeautifulSoup(text, 'html.parser').get_text(separator= " ",strip=True))
    #remove url
    df = df.replace('https?[://%]*\S+',' ', regex=True) 
    return df

In [22]:
df = remove_htmltag_url(df)
df

Unnamed: 0,title,description,comments
0,provide method to update gio fields from git r...,please provide a way to update gio fields from...,"++++1562123662 fbakhda hi @panceac, cornel eug..."
1,test suite execution terminates before executi...,test suite execution finished before executing...,++++1361513318 cmoala sys_tsdval@gl-iaf1-v-s04...
2,cloning defects from another test cycle is not...,i am trying to clone defects from another test...,++++1361514315 cmoala observed that only imple...
3,[testing only] this is enhancement only,retest some function again.,
4,[testing only] this is consultation only,enter the support needed at here ...,++++1661488832 prajput hsdes testing. please i...
...,...,...,...
899,,"hello, please import time global domain: time ...",
900,,"hi gio team, thank you for providing kpi_metri...",
901,,the schedule test suite allow for the user to ...,
902,,"hi gio team, thank you for providing kpi featu...",


In [25]:
df.iloc[10,1]

'observation error while importing excel to gio tcm error: error while inserting data under test case trtl-f008-m id:26332 an exception occurred while executing \'update `official_test_cases` set `master_ref_id` = ?, `auditt_check` = ?, `is_parent` = ?, `only_main` = ?, `updated_on` = ? where `id` = ?\' with params [null, 0, 0, 0, "2020-01-06 10:57:58", 1776689]: sqlstate[hy000]: general error: 1366 incorrect string value: \'\\xe2\\x80\\x8bnon...\' for column \'_description\' at row 3 tcm:   (manageability (sota/fota)) excel file is attached'

In [26]:
df.iloc[149,1]

'facing an issue when trying to clone test result from one test cycle to another. tried to directly clone all the results but failed as it left few test cases behind. manually selected the test cases to clone, but now i am left with two test cases failed to be cloned - whether automatically or manually.'

### 3) Remove irrelevant characters, punctuation, special characters

In [27]:
def remove_irrchar_punc(df):
    """
    Remove irrelevant characters and punctuation
    params:
    df [dataframe]: input dataframe 
    
    """
    #remove &nbsp; &quot; and &gt; - anything that starts wth $ and ends with ;
    df = df.replace('\&.+?\;',' ',regex = True)
    #Remove comment id number+name  "++++1562123662 fbakhda\n"  in comment field since not relevant
    df = df.replace('\++.*\\n',' ', regex=True)
    #Remove "image.png\"
    df = df.replace('image.png\\\\',' ', regex=True)
    # Remove eg: cid:image004.jpg@01D66BEC.314074D0\
    df = df.replace('cid:image.*\\\\',' ', regex=True)
    # Remove utf-8 literals
    df = df.replace(r'\\+x[\d\D][\d\D]',' ', regex=True)
    #Remove special characters and punctuation
    df = df.replace('[^\w\s]',' ', regex=True)
    df = df.replace(r"_", " ", regex=True)
    return df


In [28]:
df = remove_irrchar_punc(df)

In [29]:
df.iloc[10,1]

'observation error while importing excel to gio tcm error  error while inserting data under test case trtl f008 m id 26332 an exception occurred while executing  update  official test cases  set  master ref id        auditt check        is parent        only main        updated on      where  id       with params  null  0  0  0   2020 01 06 10 57 58   1776689   sqlstate hy000   general error  1366 incorrect string value      non     for column   description  at row 3 tcm     manageability  sota fota   excel file is attached'

In [30]:
df.iloc[149,1]

'facing an issue when trying to clone test result from one test cycle to another  tried to directly clone all the results but failed as it left few test cases behind  manually selected the test cases to clone  but now i am left with two test cases failed to be cloned   whether automatically or manually '

In [None]:
df.to_excel('C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/20210920/'+'data_rem_irrchars_punc.xlsx')

### 3) Remove numeric data

In [31]:
def remove_num(df):
    """
    Remove numeric data
    params:
    df [dataframe]: input dataframe 
    
    """
    df=df.replace('\d+',' ', regex=True) 

    return df 

In [32]:
df = remove_num(df)
df

Unnamed: 0,title,description,comments
0,provide method to update gio fields from git r...,please provide a way to update gio fields from...,fbakhda hi panceac cornel eugen can ...
1,test suite execution terminates before executi...,test suite execution finished before executing...,cmoala sys tsdval gl iaf v s usr lo...
2,cloning defects from another test cycle is not...,i am trying to clone defects from another test...,cmoala observed that only implemented de...
3,testing only this is enhancement only,retest some function again,
4,testing only this is consultation only,enter the support needed at here,prajput hsdes testing please ignore any...
...,...,...,...
899,,hello please import time global domain time ...,
900,,hi gio team thank you for providing kpi metri...,
901,,the schedule test suite allow for the user to ...,
902,,hi gio team thank you for providing kpi featu...,


In [33]:
df.iloc[10,1]

'observation error while importing excel to gio tcm error  error while inserting data under test case trtl f  m id   an exception occurred while executing  update  official test cases  set  master ref id        auditt check        is parent        only main        updated on      where  id       with params  null                              sqlstate hy    general error    incorrect string value      non     for column   description  at row   tcm     manageability  sota fota   excel file is attached'

In [34]:
df.iloc[149,1]

'facing an issue when trying to clone test result from one test cycle to another  tried to directly clone all the results but failed as it left few test cases behind  manually selected the test cases to clone  but now i am left with two test cases failed to be cloned   whether automatically or manually '

In [None]:
df.to_csv('C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/20210920/'+'rem_puncs_withspace.csv')

### 4) Remove multiple whitespaces

In [35]:
def remove_multwhitespace(df):
    """
    Remove multiple white spaces
    params:
    df [dataframe]: input dataframe 
    
    """
    df = df.replace(' +',' ', regex=True)
    return df

In [36]:
df = remove_multwhitespace(df)

In [37]:
df.iloc[10,1]

'observation error while importing excel to gio tcm error error while inserting data under test case trtl f m id an exception occurred while executing update official test cases set master ref id auditt check is parent only main updated on where id with params null sqlstate hy general error incorrect string value non for column description at row tcm manageability sota fota excel file is attached'

In [38]:
df.iloc[149,1]

'facing an issue when trying to clone test result from one test cycle to another tried to directly clone all the results but failed as it left few test cases behind manually selected the test cases to clone but now i am left with two test cases failed to be cloned whether automatically or manually '

### 4) Remove stopwords

In [None]:
# print(stopwords.words('english'))

In [39]:
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

def remove_stopwords(df):
    """
    Remove English stopwords 
    params:
    df [dataframe]: input dataframe 
    
    """
    STOPWORDS = set(stopwords.words('english'))
    df = df.applymap(lambda text: " ".join([word for word in str(text).split() if word not in STOPWORDS]))
                     
    return df

In [40]:
df = remove_stopwords(df)

In [41]:
df.iloc[149,1]

'facing issue trying clone test result one test cycle another tried directly clone results failed left test cases behind manually selected test cases clone left two test cases failed cloned whether automatically manually'

In [42]:
df.iloc[10,1]

'observation error importing excel gio tcm error error inserting data test case trtl f id exception occurred executing update official test cases set master ref id auditt check parent main updated id params null sqlstate hy general error incorrect string value non column description row tcm manageability sota fota excel file attached'

### 5) Remove frequent words

In [43]:
def remove_freqwords(df,n):
    """
    Remove n frequent words
    params:
    df [dataframe]: input dataframe 
    n [integer]: input number of frequent words to be removed
    """
    from collections import Counter
    cnt = Counter()
    for i in df:
    
        for text in df[i].values:
            for word in text.split():
                cnt[word] += 1
           
    #custom function to remove the frequent words             
    FREQWORDS = set([w for (w, wc) in cnt.most_common(n)])
    
    print("Frequent words that are removed:", set([(w, wc) for (w, wc) in cnt.most_common(n)]))
    df = df.applymap(lambda text: " ".join([word for word in str(text).split() if word not in FREQWORDS]))
    
    return df


In [44]:
df = remove_freqwords(df,10)

Frequent words that are removed: {('project', 1340), ('case', 1520), ('hi', 3353), ('jchun', 1558), ('cycle', 1858), ('client', 1428), ('please', 1959), ('issue', 1851), ('test', 7500), ('gio', 6544)}


In [45]:
df.iloc[149,1]

'facing trying clone result one another tried directly clone results failed left cases behind manually selected cases clone left two cases failed cloned whether automatically manually'

In [46]:
df.iloc[10,1]

'observation error importing excel tcm error error inserting data trtl f id exception occurred executing update official cases set master ref id auditt check parent main updated id params null sqlstate hy general error incorrect string value non column description row tcm manageability sota fota excel file attached'

### 6) Remove rare words

In [47]:
def remove_rarewords(df,n):
    """
    Remove n rare words
    params:
    df [dataframe]: input dataframe 
    n [integer]: input number of rare words to be removed
    """
    from collections import Counter
    cnt = Counter()
    for i in df:
    
        for text in df[i].values:
            for word in text.split():
                cnt[word] += 1
           
    #custom function to remove the frequent words             
    RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n-1:-1]])
    
    print("Rare words that are removed:", set([(w,wc) for (w, wc) in cnt.most_common()[:-n-1:-1]]))
    df = df.applymap(lambda text: " ".join([word for word in str(text).split() if word not in RAREWORDS]))
    
    return df


In [48]:
df = remove_rarewords(df,10)

Rare words that are removed: {('hopefully', 2), ('piie', 2), ('misunderstand', 2), ('gosse', 2), ('cqn', 2), ('sqba', 2), ('katheine', 2), ('sq', 2), ('enhancment', 2), ('super', 2)}


In [49]:
df.iloc[149,1]

'facing trying clone result one another tried directly clone results failed left cases behind manually selected cases clone left two cases failed cloned whether automatically manually'

In [50]:
df.iloc[10,1]

'observation error importing excel tcm error error inserting data trtl f id exception occurred executing update official cases set master ref id auditt check parent main updated id params null sqlstate hy general error incorrect string value non column description row tcm manageability sota fota excel file attached'

In [None]:
df.to_excel('C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/20210920/'+'final_withspacylemma.xlsx')

### c) Custom tokenization

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import WordPunctTokenizer
import re

def cust_tokenization(df,token_met,token_type,delim =None):
    """
    Custom tokenization, 2 options are available: split() or nltk 
    params:
    df [dataframe]: input dataframe 
    token_met["string"]: input tokenization method ("split" or "nltk")
    
    token_type["string"](use only if token_met= "nltk"): type of nltk tokenization
    a) token_type = "WordToken" tokenizes a string into a list of words
    b) token_type = "SentToken" tokenizes a string containing sentences into a list of sentences
    c) token_type = "WhiteSpaceToken" tokenizes a string on whitespace (space, tab, newline)
    d) token_type = "WordPunctTokenizer" tokenizes a string on punctuations
         
    delim["string"](use only if token_met = "split"): specify delimiter to separate strings,
    default delimiter (delim=None) is whitespace,  an alternate option for token_type = "WhiteSpaceToken"
    
    """
    if token_met == "split":
        if delim==None:
            print("Text is split by space") #default delimiter is space if not specified 

        else:
            print("Text is split by:", delim) #can accept one or more delimiter

        df = df.applymap(lambda text: text.split() if delim==None else text.split(delim))

    if token_met == "nltk":
    
        if token_type == "WordToken":
            tokenizer = word_tokenize
        if token_type == "SentToken":
            tokenizer = sent_tokenize
        if token_type == "WhiteSpaceToken":
            tokenizer = WhitespaceTokenizer().tokenize
        if token_type == "WordPunctTokenizer":
            tokenizer = WordPunctTokenizer().tokenize

        df = df.applymap(lambda text: tokenizer(text))
        
    return df

In [None]:
#use split
df = cust_tokenization(df,token_met="split",token_type=None,delim = '.')

In [None]:
#use nltk
df = cust_tokenization(df,token_met="nltk",token_type="WordToken",delim = None)

In [None]:
#remove html tag
# from bs4 import BeautifulSoup
# df["title"] = [BeautifulSoup(text).get_text() for text in df["title"]]
# df["description"] = [BeautifulSoup(text).get_text() for text in df["description"]]
# df["comments"] = [BeautifulSoup(text).get_text() for text in df["comments"]]
# df

In [None]:
# Define the function to remove the punctuation
#### Remove punctuation including underscore
# \w =  word character i.e. characters which are from a to z, A to Z, 0 to 9, _
# \s = matches whitespace (spaces, tabs and new lines)
# import string 
# def remove_punctuation(df):
#     df = df.replace('[^\w\s_]', ' ',regex=True)
#     return df

In [None]:
# from bs4 import BeautifulSoup
# def remove_htmltag(html):
  
#     # parse html content
#     soup = BeautifulSoup(html, "html.parser")
  
#     for data in soup(['style', 'script']):
#         # Remove tags
#         data.decompose()
  
#     # return data by retrieving the tag content
#     return ' '.join(soup.stripped_strings)

# df = df.applymap(remove_htmltag)
# df

In [None]:
# import string
# from pandas import DataFrame
# def remove_punctuations(text):
#     for punctuation in string.punctuation:
#         text = text.replace(punctuation, ' ')
#     return text

# # df = df.applymap lambda text:[text.replace(punctuation, ' ') for punctuation in string.punctuation])
# #  df = df.applymap(lambda s:s.lower() if type(s) == str else s)
# df = df.applymap(remove_punctuations)
# df