### Data Loading

In [None]:
#import library
import pandas as pd
import glob, os, json
import re

#user input file path
path = 'C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/team1/'

In [None]:
def data_loading(path,df=None):
    '''
    Load only files that follow agreed filename format, merge files as single dataframe.
    Can support incremental aggregation of dataset, by setting arg df as the existing dataframe
    Returns a single dataframe.
    
    params:
    path [string]: path of the files, without filename
    df [dataframe] (optional,default is None): input existing dataframe to merge with new files
    '''
    filenames = os.listdir(path)
    file_list=[]
    dfs = []

    if df is None: #no existing dataframe
        
        for file in filenames:
            # search agreed file format pattern in the filename
            match = re.search(r"^\(\d{4}-\d{2}-\d{1,2}\)\d+\_\D+\_\d+\.json$",file)

            #if match is found
            if match:
                pattern = os.path.join(path, file) #join path with file name
                file_list.append(pattern) #list of json files that follow the agreed filename

                for file in file_list:
                    with open(file) as f:
                        #flatten json into pd dataframe
                        json_data = pd.json_normalize(json.loads(f.read()))
                        #label which file each row is from 
                        json_data['file'] = file.rsplit("/", 1)[-1]

                    dfs.append(json_data)
                df = pd.concat(dfs)
                
    else: #existing dataframe exists and want to append new files to existing dataframe
             
        for file in filenames:

            if file not in df["file"].unique(): #check if file is new - to support merging of new dataset with previously read ones

                # search agreed file format pattern in the filename
                match = re.search(r"^\(\d{4}-\d{2}-\d{1,2}\)\d+\_\D+\_\d+\.json$",file)

                #if match is found
                if match:
                    json_pattern = os.path.join(path, file) #join path with file name
                    file_list.append(json_pattern) #list of json files 

                    for file in file_list:
                        with open(file) as f:
                            #flatten json into pd dataframe
                            json_data = pd.json_normalize(json.loads(f.read()))
                            #label which file each row is from 
                            json_data['file'] = file.rsplit("/", 1)[-1]

                        dfs.append(json_data)
                    new_df = pd.concat(dfs)           
                    df=pd.concat([df,new_df])
    
    return df

In [None]:
os.listdir(path)

In [None]:
df = data_loading(path,df=None)
df.head()

In [None]:
df.to_csv('C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/'+'data_original.csv')

### Data Pre-processing

### a) Dataframe manipulation

In [5]:
def df_manipulation(df,how,keep,cols_tokeep=None,cols_todrop=None,impute_value=None,subset=None):
    """
    1) Column selection: Keep or drop columns in dataframe
    2) Data impute: Impute or drop NA rows 
    3) Data duplication cleaning: Drop all duplicates or drop all duplicates except for the first/last occurrence
    params:
    df [dataframe]: input dataframe 
    cols_tokeep [list/None]: list of columns to keep, if there is no list use None
    cols_todrop [list/None]: list of columns to drop, if there is no list use None
    impute_value [string/None]: value to be imputed (i.e "" for empty string). If no value to be imputed but there are 
                        rows to be dropped use None
    how[string]: Drop rows when we have at least one NA or all NA. Choose
                      # - "all": Drop row with all NA
                      # - "any": Drop row with at least one NA
                  
    subset[list/None]: Subset of columns for dropping NA and identifying duplicates, use None if no column to select
    keep[string/False]: Choose to drop all duplicates or drop all duplicates except for the first/last occurrence
                        # - "first" : Drop duplicates except for the first occurrence. 
                        # - "last" : Drop duplicates except for the last occurrence. 
                        # - False : Drop all duplicates.
    """
    
    print("Shape of df before manipulation:",df.shape)

    #Column selection - user can select columns or drop unwanted columns
    if cols_tokeep != None:
        df = df[cols_tokeep]
    if cols_todrop != None:
        df = df.drop(cols_todrop,axis=1)
    print("Shape of df after selecting columns:",df.shape)

    #---Data impute - user can impute or drop rows with NA,freq of null values before & after manipulation returned---#
    print("Number of null values in df:\n",df.isnull().sum())
  

    # impute NA values with user's choice of imputation value
    if impute_value != None:
        df = df.fillna(impute_value)
        print("Number of null values in df after NA imputation:\n",df.isnull().sum())
        
    else: # drop rows with NA values
        df= df.dropna(axis=0, how=how,subset=subset)
        print("Number of null values in df after dropping NA rows:\n",df.isnull().sum())
        print("Shape of df after dropping NA rows:",df.shape)

    #---------Data duplication cleaning--------#
    print("Number of duplicates in the df:", df.duplicated().sum())

    #drop duplicates
    df = df.drop_duplicates(subset=subset, keep=keep)

    print("Shape of df after manipulation:",df.shape)

    return df


In [None]:
df = df_manipulation(df,how="any",keep="first",cols_tokeep=["title","description","comments"],cols_todrop=None,impute_value="",subset=None)
df


### b) Text Normalization

### 2) Expand contractions 

In [None]:
import contractions

def word_contractions(df):
    """
    Expand word contractions (i.e. "isn't" to "is not")
    params:
    df [dataframe]: input dataframe 
    """
    df = df.applymap(lambda text: " ".join([contractions.fix(word) for word in text.split()]))
    return df


In [None]:
df = word_contractions(df)

In [None]:
df.iloc[149,1]

### 3) Convert all characters into lowercase 

In [None]:
def lowercase(df):
    """
    Convert all characters to lower case
    param:
    df[dataframe]: input dataframe
    """
    df = df.applymap(lambda s:s.lower() if type(s) == str else s)
    return df 

In [None]:
df = lowercase(df)
df

In [None]:
df.iloc[149,1]

In [None]:
df.iloc[10,1]

### 4) Stemming/Lemmatization

### Stemming

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

def stem_words(df,stemmer_type):
    """
    Stemming words, 2 options available: Porter Stemmer or Lancaster Stemmer 
    params:
    df [dataframe]: input dataframe 
    stemmer_type[string]: input stemming method ("Porter" or "Lancaster")
    """
    if stemmer_type == "Porter":
        stemmer = PorterStemmer()
    if stemmer_type == "Lancaster":
        stemmer=LancasterStemmer()
    df = df.applymap(lambda text: " ".join([stemmer.stem(word) for word in text.split()]))
    return df

In [None]:
df = stem_words(df,stemmer_type = "Lancaster")
df.iloc[10,1]

### Lemmatization

In [None]:
import spacy
import nltk
from nltk.stem import WordNetLemmatizer

def lemmatize_words(df,lemma_type):
    """
    Lemmatize words, 2 options available: WordNetLemmatizer or Spacy 
    params:
    df [dataframe]: input dataframe 
    lemma_type[string]: input lemmatization method ("WordNet" or "Spacy")
    """
    if lemma_type == "WordNet":
        lemmatizer = WordNetLemmatizer()
        df = df.applymap(lambda text: " ".join([lemmatizer.lemmatize(word) for word in text.split()]))
    if lemma_type == "Spacy":
        nlp = spacy.load("en_core_web_sm")
        df = df.applymap(lambda text: " ".join([word.lemma_ for word in nlp(text)]))
        #convert to lower case as spacy will convert pronouns to upper case
        df = df.applymap(lambda s:s.lower() if type(s) == str else s) 
    return df


In [None]:
df = lemmatize_words(df,lemma_type = "Spacy")
df

In [None]:
df.iloc[10,1]

In [None]:
df.iloc[149,1]

In [None]:
df = lemmatize_words(df,lemma_type = "WordNet")
df.iloc[10,1]

### b) Noise filtering



### 1) Remove html tag and url

In [None]:
from bs4 import BeautifulSoup
def remove_htmltag_url(df):
    """
    Remove html tag and url
    params:
    df [dataframe]: input dataframe 
    
    """
    #remove html tag
    df = df.applymap(lambda text:BeautifulSoup(text, 'html.parser').get_text(separator= " ",strip=True))
    #remove url
    df = df.replace('https?[://%]*\S+',' ', regex=True) 
    return df

In [None]:
df = remove_htmltag_url(df)
df

In [None]:
df.iloc[10,1]

In [None]:
df.iloc[149,1]

### 3) Remove irrelevant characters, punctuation, special characters

In [None]:
def remove_irrchar_punc(df):
    """
    Remove irrelevant characters and punctuation
    params:
    df [dataframe]: input dataframe 
    
    """
    #remove &nbsp; &quot; and &gt; - anything that starts wth $ and ends with ;
    df = df.replace('\&.+?\;',' ',regex = True)
    #Remove comment id number+name  "++++1562123662 fbakhda\n"  in comment field since not relevant
    df = df.replace('\++.*\\n',' ', regex=True)
    #Remove "image.png\"
    df = df.replace('image.png\\\\',' ', regex=True)
    # Remove eg: cid:image004.jpg@01D66BEC.314074D0\
    df = df.replace('cid:image.*\\\\',' ', regex=True)
    # Remove utf-8 literals
    df = df.replace(r'\\+x[\d\D][\d\D]',' ', regex=True)
    #Remove special characters and punctuation
    df = df.replace('[^\w\s]',' ', regex=True)
    df = df.replace(r"_", " ", regex=True)
    return df


In [None]:
df = remove_irrchar_punc(df)

In [None]:
df.iloc[10,1]

In [None]:
df.iloc[149,1]

In [None]:
df.to_excel('C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/20210920/'+'data_rem_irrchars_punc.xlsx')

### 3) Remove numeric data

In [None]:
def remove_num(df):
    """
    Remove numeric data
    params:
    df [dataframe]: input dataframe 
    
    """
    df=df.replace('\d+',' ', regex=True) 

    return df 

In [None]:
df = remove_num(df)
df

In [None]:
df.iloc[10,1]

In [None]:
df.iloc[149,1]

In [None]:
df.to_csv('C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/20210920/'+'rem_puncs_withspace.csv')

### 4) Remove multiple whitespaces

In [None]:
def remove_multwhitespace(df):
    """
    Remove multiple white spaces
    params:
    df [dataframe]: input dataframe 
    
    """
    df = df.replace(' +',' ', regex=True)
    return df

In [None]:
df = remove_multwhitespace(df)

In [None]:
df.iloc[10,1]

In [None]:
df.iloc[149,1]

### 4) Remove stopwords

In [None]:
# print(stopwords.words('english'))

In [None]:
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

def remove_stopwords(text,extra_sw=None,remove_sw=None):
    """
    Removes English stopwords. Optional: user can add own stopwords or remove words from English stopwords  
    params:
    text[string]: input string
    extra_sw [list] (optional): list of words/phrase to be added to the stop words 
    remove_sw [list] (optional): list of words to be removed from the stop words 
    """
    all_stopwords = stopwords.words('english')
    
    #default list of stopwords
    if extra_sw == None and remove_sw==None:
        all_stopwords = all_stopwords
        
    # add more stopwords
    elif remove_sw == None:
        all_stopwords.extend(extra_sw) #add to existing stop words list
        
    # remove stopwords from existing sw list
    elif extra_sw == None:
        all_stopwords = [e for e in all_stopwords if e not in remove_sw] #remove from existing stop words list
        
    # remove and add stopwords to existing sw list
    else:
        all_stopwords.extend(extra_sw) #add to existing stop words list
        all_stopwords = [e for e in all_stopwords if e not in remove_sw] #remove from existing stop words list
         
  
    for w in all_stopwords:
        pattern = r'\b'+w+r'\b'
        text = re.sub(pattern,' ', text)
                   
    return text 

In [None]:
#list of words/phrase to be added to the stop words 
extra_sw = ["gio","defects",'hsdes',"test cycle","testing"]
#list of words/phrase to be removed from stop words
remove_sw = ["i","am"]
arg1 = extra_sw
arg2 = remove_sw

df["title_stop"]=  [remove_stopwords(text,extra_sw=arg1,remove_sw=arg2) for text in df["title"]]
df["desc_stop"]=  [remove_stopwords(text,extra_sw=arg1,remove_sw=arg2) for text in df["description"]]
df["comments_stop"]=  [remove_stopwords(text,extra_sw=arg1,remove_sw=arg2) for text in df["comments"]]
df.head()

In [None]:
df.iloc[149,1]

In [None]:
df.iloc[10,1]

### 5) Remove frequent words

In [None]:
def remove_freqwords(df,n):
    """
    Remove n frequent words
    params:
    df [dataframe]: input dataframe 
    n [integer]: input number of frequent words to be removed
    """
    from collections import Counter
    cnt = Counter()
    for i in df:
    
        for text in df[i].values:
            for word in text.split():
                cnt[word] += 1
           
    #custom function to remove the frequent words             
    FREQWORDS = set([w for (w, wc) in cnt.most_common(n)])
    
    print("Frequent words that are removed:", set([(w, wc) for (w, wc) in cnt.most_common(n)]))
    df = df.applymap(lambda text: " ".join([word for word in str(text).split() if word not in FREQWORDS]))
    
    return df


In [None]:
df = remove_freqwords(df,10)

In [None]:
df.iloc[149,1]

In [None]:
df.iloc[10,1]

### 6) Remove rare words

In [None]:
def remove_rarewords(df,n):
    """
    Remove n rare words
    params:
    df [dataframe]: input dataframe 
    n [integer]: input number of rare words to be removed
    """
    from collections import Counter
    cnt = Counter()
    for i in df:
    
        for text in df[i].values:
            for word in text.split():
                cnt[word] += 1
           
    #custom function to remove the frequent words             
    RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n-1:-1]])
    
    print("Rare words that are removed:", set([(w,wc) for (w, wc) in cnt.most_common()[:-n-1:-1]]))
    df = df.applymap(lambda text: " ".join([word for word in str(text).split() if word not in RAREWORDS]))
    
    return df


In [None]:
df = remove_rarewords(df,10)

In [None]:
df.iloc[149,1]

In [None]:
df.iloc[10,1]

In [None]:
df.to_excel('C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/20210920/'+'final_withspacylemma.xlsx')

### c) Custom tokenization

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import WordPunctTokenizer
import re

def cust_tokenization(df,token_met,token_type,delim =None):
    """
    Custom tokenization, 2 options are available: split() or nltk 
    params:
    df [dataframe]: input dataframe 
    token_met["string"]: input tokenization method ("split" or "nltk")
    
    token_type["string"](use only if token_met= "nltk"): type of nltk tokenization
    a) token_type = "WordToken" tokenizes a string into a list of words
    b) token_type = "SentToken" tokenizes a string containing sentences into a list of sentences
    c) token_type = "WhiteSpaceToken" tokenizes a string on whitespace (space, tab, newline)
    d) token_type = "WordPunctTokenizer" tokenizes a string on punctuations
         
    delim["string"](use only if token_met = "split"): specify delimiter to separate strings,
    default delimiter (delim=None) is whitespace,  an alternate option for token_type = "WhiteSpaceToken"
    
    """
    if token_met == "split":
        if delim==None:
            print("Text is split by space") #default delimiter is space if not specified 

        else:
            print("Text is split by:", delim) #can accept one or more delimiter

        df = df.applymap(lambda text: text.split() if delim==None else text.split(delim))

    if token_met == "nltk":
    
        if token_type == "WordToken":
            tokenizer = word_tokenize
        if token_type == "SentToken":
            tokenizer = sent_tokenize
        if token_type == "WhiteSpaceToken":
            tokenizer = WhitespaceTokenizer().tokenize
        if token_type == "WordPunctTokenizer":
            tokenizer = WordPunctTokenizer().tokenize

        df = df.applymap(lambda text: tokenizer(text))
        
    return df

In [None]:
#use split
df = cust_tokenization(df,token_met="split",token_type=None,delim = '.')

In [None]:
#use nltk
df = cust_tokenization(df,token_met="nltk",token_type="WordToken",delim = None)

## d) Custom taxonomy

### i) Configurability for user to provide taxonomy mapping (to remove/remain)

In [None]:
#list of words to remove
remove_tax = ["gio","fields","test"]
#list of words to maintain
include_tax = ["test suite execution","clone defects"]

import re

def custom_tax(text,remove_tax,include_tax):
    for w in remove_tax:
        #row without any item from include_tax -> replace all remove_tax items with empty string
        if all(phrase not in text for phrase in include_tax): 
            pattern = r'\b'+w+r'\b'
            text = re.sub(pattern,' ', text) 
        #row with any item from include_tax -> only replace remove_tax item that is not in include_tax
        else: 
            if all(w not in phrase for phrase in include_tax):
                pattern = r'\b'+w+r'\b'
                text = re.sub(pattern,' ', text) 
    return text    

In [None]:
df["title_tax"]=  [custom_tax(text,remove_tax,include_tax) for text in df["title"]]
df["description_tax"]=  [custom_tax(text,remove_tax,include_tax) for text in df["description"]]
df["comments_tax"]=  [custom_tax(text,remove_tax,include_tax) for text in df["comments"]]
df.head()


### ii)  Named Entity Recognition (Methodology to recommend potential taxonomy)

### a) Run existing Spacy Model

In [None]:
import spacy
#load pre existing spacy model
nlp = spacy.load('en_core_web_sm')

In [None]:
# check pipeline components
nlp.pipe_names 

In [None]:
# Write a function to display basic entity info:
def show_ents_spacy(text):
    doc = nlp(text)
    if doc.ents:
        for ent in doc.ents:
            return(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
    else:
        return('No named entities found.')

In [None]:
df["title_ner"]=  [show_ents_spacy(text)for text in df["title"]]
df["description_ner"]=  [show_ents_spacy(text) for text in df["description"]]
df["comments_ner"]=  [show_ents_spacy(text) for text in df["comments"]]
df

In [None]:
df.to_excel('C:/Users/nchong/OneDrive - Intel Corporation/Documents/Debug Similarity Analytics and Bucketization Framework/General/Sample json output/HSD ES Raw Data/'+'ner_spacy.xlsx')

In [None]:
df.iloc[23]

In [None]:
df.iloc[25]

### 2) Train custom NER model

In [2]:
import pandas as pd
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin

def convert_spacy(DATA):
    """
    Convert data into .spacy format
    """
    nlp = spacy.blank("en") # load a new spacy model
    db = DocBin() # create a DocBin object

    for text, annot in tqdm(DATA): # data in previous format
        doc = nlp.make_doc(text) # create doc object from text
        ents = []
        for start, end, label in annot["entities"]: # add character indexes
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents # label the text with the ents
        db.add(doc)
        
    return db

    
def custom_ner(TRAIN_DATA,VAL_DATA,path):
    """
    Build custom NER model
    """
    #convert train and validation data into .spacy format
    db_train = convert_spacy(TRAIN_DATA) 
    db_val = convert_spacy(VAL_DATA) 
    
    #save train and validation data in .spacy format in path
    db_train.to_disk(path +'train.spacy')
    db_val.to_disk(path +'val.spacy')
    
    #autofill base_config file saved by user from spacy website
    !python -m spacy init fill-config base_config.cfg config.cfg
    
    #Model building and saving in path
    !python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./val.spacy
    
    print("Custom NER model built and saved!")
 

def entities(TRAIN_DATA,VAL_DATA,path,text):
    print(text)
    custom_ner(TRAIN_DATA,VAL_DATA,path)
     #Load best model
    nlp = spacy.load(path + "/output/model-best/")     
    print("Best model loaded!")
    
    doc= nlp(text) #create doc object
    if doc.ents:
        for ent in doc.ents:
            return(ent.text+' - '+ent.label_)
    else:
        return('No named entities found.')


    
    

# def ner_wrapper(TRAIN_DATA,VAL_DATA,path,text):
#     nlp = custom_ner(TRAIN_DATA,VAL_DATA,path)
#     text = show_ents(text)
    
#     return text

In [6]:
#train data
TRAIN_DATA = [
["jchun wai kit is working on this to enable in new tcp", {"entities": [[0, 13, "NAME"]]}], 
["siewlita pending release", {"entities": [[0, 8, "NAME"]]}],
["hi lim chih quanx per our communication i still have one more question", {"entities": [[3, 17, "NAME"]]}],
["yeetheng the auto test trigger after build complete is working fine today", {"entities": [[0, 8, "NAME"]]}],
["hi jon here is the recipe link weichuan hi can you try to reproduce the issue once more", {"entities": [[3, 6, "NAME"],[31, 39, "NAME"]]}]
]

VAL_DATA = [
["wei chuan has updated me with the sample of test execution by automation manual chart", {"entities": [[0, 9, "NAME"]]}],
["subject gio logs and gio installation hi ajay jonathan i just noticed that star is directing all the logs to gio folder", {"entities": [[41, 45, "NAME"],[46, 55, "NAME"]]}],
["hi firesh final verdict in jenkins coming as fail even after all the triggered tests are passed", {"entities": [[3, 9, "NAME"],[27, 35, "NAME"]]}],
["wai kit below is the requirement needed from gio product defect detection", {"entities": [[0, 7, "NAME"]]}],
["just string field regards robert nowicki", {"entities": [[26, 40, "NAME"]]}]
]

#jupyter notebook and base_config.cfg path have to be the same
path = "C:/Users/nchong/"

#load and clean test data
df_test = pd.read_excel("C:/Users/nchong/test.xlsx",index_col=0)
df_test = df_manipulation(df_test,how="any",keep="first",cols_tokeep=["title","description","comments"],cols_todrop=None,impute_value="",subset=None)
df_test.head()



Shape of df before manipulation: (600, 3)
Shape of df after selecting columns: (600, 3)
Number of null values in df:
 title          297
description      2
comments       335
dtype: int64
Number of null values in df after NA imputation:
 title          0
description    0
comments       0
dtype: int64
Number of duplicates in the df: 0
Shape of df after manipulation: (600, 3)


Unnamed: 0,title,description,comments
601,gio planning test skipping,i have an issue using the test case api py sc...,fbakhda story has been planned for this sprin...
602,gio stop schedule does not actually stop the ...,we had multiple test recipes queued up in the ...,waikitcx hi arisha please provide us the sche...
603,create new repo under seg tbh dse piv pse,repo name hspe thb kpi it should preferably be...,siewlita hi project is created in gio
604,de cannot delete recipes even they are not li...,trying to delete test recipes yielded the foll...,soonhenx hi ken s ng can you verify the issue...
605,unchecked heartbeatd and gvd logs is eating sp...,after each run the sut has no more space left ...,wteh hi ken s ng thanks for bringing up this ...


In [None]:
df_test["title_ner"]=  [entities(TRAIN_DATA,VAL_DATA,path,text) for text in df_test["title"]]
df_test["description_ner"]= [entities(TRAIN_DATA,VAL_DATA,path,text)for text in df_test["description"]]
df_test["comments_ner"]=  [entities(TRAIN_DATA,VAL_DATA,path,text) for text in df_test["comments"]]
df_test

100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 1651.95it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 1238.43it/s]


[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[i] Saving to output directory: outputCustom NER model built and saved!

[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     25.83   10.81    5.97   57.14    0.11
200     200         48.06    711.39   44.44  100.00   28.57    0.44
400     400          0.00      0.00   44.44  100.00   28.57    0.44
600     600          0.00      0.00   44.44  100.00   28.57    0.44
800     800          0.00      0.00   44.44  100.00   28.57    0.44
1000    1000          0.00      0.00   44.44  100.00   28.57    0.44
1200    1200          0.00      0.00   44.44  100.00   28.57    0.4

[2021-10-07 14:45:36,194] [INFO] Set up nlp object from config
[2021-10-07 14:45:36,202] [INFO] Pipeline: ['tok2vec', 'ner']
[2021-10-07 14:45:36,206] [INFO] Created vocabulary
[2021-10-07 14:45:36,207] [INFO] Finished initializing nlp object
[2021-10-07 14:45:36,283] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 1237.18it/s]

Best model loaded!



100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 710.39it/s]


[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


[2021-10-07 14:47:41,365] [INFO] Set up nlp object from config
[2021-10-07 14:47:41,376] [INFO] Pipeline: ['tok2vec', 'ner']
[2021-10-07 14:47:41,381] [INFO] Created vocabulary
[2021-10-07 14:47:41,382] [INFO] Finished initializing nlp object
[2021-10-07 14:47:41,475] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


[i] Saving to output directory: outputCustom NER model built and saved!
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     25.83   10.81    5.97   57.14    0.11
200     200         48.06    711.39   44.44  100.00   28.57    0.44
400     400          0.00      0.00   44.44  100.00   28.57    0.44
600     600          0.00      0.00   44.44  100.00   28.57    0.44
800     800          0.00      0.00   44.44  100.00   28.57    0.44
1000    1000          0.00      0.00   44.44  100.00   28.57    0.44
1200    1200          0.00      0.00   44.44  100.00   28.57    0.44
1400    1400          0.00      0.00   44.44  100.00   28.57    0.44
1600    1600          0.00      0.00   44.44  100.00   28.57    0.44
1800    1800          0.00      0.00   44.44  100.00   28.57    

100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 1000.12it/s]

Best model loaded!



100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 999.98it/s]


[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


[2021-10-07 14:49:41,544] [INFO] Set up nlp object from config
[2021-10-07 14:49:41,560] [INFO] Pipeline: ['tok2vec', 'ner']
[2021-10-07 14:49:41,568] [INFO] Created vocabulary
[2021-10-07 14:49:41,570] [INFO] Finished initializing nlp object
[2021-10-07 14:49:41,725] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


[i] Saving to output directory: output
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     25.83   10.81    5.97   57.14    0.11
200     200         48.06    711.39   44.44  100.00   28.57    0.44
400     400          0.00      0.00   44.44  100.00   28.57    0.44
600     600          0.00      0.00   44.44  100.00   28.57    0.44
800     800          0.00      0.00   44.44  100.00   28.57    0.44
1000    1000          0.00      0.00   44.44  100.00   28.57    0.44
1200    1200          0.00      0.00   44.44  100.00   28.57    0.44
1400    1400          0.00      0.00   44.44  100.00   28.57    0.44
1600    1600          0.00      0.00   44.44  100.00   28.57    0.44
1800    1800          0.00      0.00   44.44  100.00   28.57    0.44
[+] Saved pipeline to output

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Best model loaded!


100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 714.34it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 833.39it/s]


[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


[2021-10-07 14:51:53,912] [INFO] Set up nlp object from config
[2021-10-07 14:51:53,923] [INFO] Pipeline: ['tok2vec', 'ner']
[2021-10-07 14:51:53,927] [INFO] Created vocabulary
[2021-10-07 14:51:53,928] [INFO] Finished initializing nlp object
[2021-10-07 14:51:54,058] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


[i] Saving to output directory: output
Custom NER model built and saved![i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     25.83   10.81    5.97   57.14    0.11
200     200         48.06    711.39   44.44  100.00   28.57    0.44
400     400          0.00      0.00   44.44  100.00   28.57    0.44
600     600          0.00      0.00   44.44  100.00   28.57    0.44
800     800          0.00      0.00   44.44  100.00   28.57    0.44
1000    1000          0.00      0.00   44.44  100.00   28.57    0.44
1200    1200          0.00      0.00   44.44  100.00   28.57    0.44
1400    1400          0.00      0.00   44.44  100.00   28.57    0.44
1600    1600          0.00      0.00   44.44  100.00   28.57    0.44
1800    1800          0.00      0.00   44.44  100.00   28.57    

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 999.64it/s]

Best model loaded!



100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 713.63it/s]


[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
#convert and save train/validation data as .spacy
# out_path = "C:/Users/nchong/"
# db_train = convert_spacy(TRAIN_DATA)
# db_train.to_disk(out_path +'train.spacy') # save the docbin object
# db_val = convert_spacy(VAL_DATA)
# db_val.to_disk(out_path +'val.spacy') # save the docbin object

In [None]:
# !python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
# !python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./val.spacy

In [None]:
#load best model
# nlp1 = spacy.load("C:/Users/nchong/output/model-best/") #load the best model

In [None]:
# doc = nlp1("waikitcx hi arisha please provide us the") # input sample text

# spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [None]:
# doc = nlp1("weichuan hi hashim the feature has been released please verified if the feature work correctly thank you very much") # input sample text
# spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [None]:
# def show_ents(text):
#     doc= nlp1(text)
#     if doc.ents:
#         for ent in doc.ents:
#             return(ent.text+' - '+ent.label_)
#     else:
#         return('No named entities found.')

In [None]:
# import pandas as pd
# df_test = pd.read_excel("C:/Users/nchong/test.xlsx",index_col=0)
# df_test = df_manipulation(df_test,how="any",keep="first",cols_tokeep=["title","description","comments"],cols_todrop=None,impute_value="",subset=None)


In [None]:
# df_test["title_ner"]=  [show_ents(text)for text in df_test["title"]]
# df_test["description_ner"]=  [show_ents(text) for text in df_test["description"]]
# df_test["comments_ner"]=  [show_ents(text) for text in df_test["comments"]]
# df_test

### Feature extraction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def feature_extraction(column,ngram_range,ascending,fe_type):
    """
    Feature extraction methods - Bag of words or TF-IDF
    
    params:
    column [series]: column to select
    ngram_range [tuple(min_n, max_n)]: The lower and upper boundary of the range of n-values for different n-grams to be extracted
                                       - ngram_range of (1, 1) means only unigrams, 
                                       - ngram_range of (1, 2) means unigrams and bigrams, 
                                       - ngram_range of (2, 2) means only bigram
    ascending [True/False/None]: - None (words arranged in alphabetical order)
                                 - True(words arranged in ascending order of sum), 
                                 - False(words arranged in descending order of sum)                               
    fe_type[string]: Feature extraction type: Choose "bagofwords" or "tfidf" method
    """
    
    if fe_type == "bagofwords":
        vec_type = CountVectorizer(ngram_range=ngram_range, analyzer='word')
        vectorized = vec_type.fit_transform(column)
        df = pd.DataFrame(vectorized.toarray(), columns=vec_type.get_feature_names())
        df.loc['sum'] = df.sum(axis=0).astype(int)

    if fe_type == "tfidf":
        vec_type = TfidfVectorizer(ngram_range=ngram_range, analyzer='word')
        vectorized = vec_type.fit_transform(column)
        df = pd.DataFrame(vectorized.toarray(), columns=vec_type.get_feature_names())
        df.loc['sum'] = df.sum(axis=0)
    
    if ascending != None:
            
        df = df.sort_values(by ='sum', axis = 1,ascending=ascending)
    
    
    return df

In [None]:
column = df.iloc[:3,0]
ngram_range = (1,1)
ascending = None
fe_type = "bagofwords"
feature_extraction(column,ngram_range,ascending,fe_type)

In [None]:
column = df.iloc[:3,0]
ngram_range = (1,1)
ascending = True
fe_type = "tfidf"
feature_extraction(column,ngram_range,ascending,fe_type)

In [None]:
# 1) Fill NAN with empty string
# def columns_req(text_list,df):
#     """
#     Filters df to only include the string columns provided by user
#     Fills NAN with empty string
    
#     params:
#     text_list[list]: list of columns names  
#     df [dataframe]: input dataframe 
#     """
#     df= df[text_list] 
#     df = df.fillna('')

#     return df

In [None]:
# Take only 'title','description','comments'
# text_list = ['title','description','comments']
# df = columns_req(text_list,df)
# df

In [None]:
# list of words to remove
# remove_tax = ["gio","fields","test"]
# #list of words to maintain
# include_tax = ["test suite execution","cloning defects"]

# text = "gio fields test suite execution and test cycle"
# for w in remove_tax: 
#     if all(w not in phrase for phrase in include_tax): #word in include_tax but not in text -> no tse/cd
#         print(w,"yes")
#     else:
#         print(w,"no")

In [None]:
# #list of words to remove
# remove_tax = ["gio","fields","test"]
# #list of words to maintain
# include_tax = ["test suite execution","test provide method"]

# import re
# def custom_tax(text,remove_tax,include_tax):
#     for w in remove_tax: #"gio","fields","test"
#         for phrase in include_tax: #"test suite execution","provide method"
         
#             if w not in phrase: #"gio","fields
#                 pattern = r'\b'+w+r'\b'
#                 text = re.sub(pattern,' ', text)
#             else: #"test"
#                 if phrase not in text:
#                     pattern = r'\b'+w+r'\b'
#                     text = re.sub(pattern,' ', text)                                             
            
#     return text
