In [None]:
import pickle
from io import StringIO

import numpy as np
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC


## Data pre-processing

In [None]:
# Load the raw e-mail data.
# The file name has been changed because the original is production data.
# pd.read_excel opens and closes the workbook itself, so no ExcelFile handle
# is left open and `df` is only ever bound to a DataFrame.
df = pd.read_excel(
    'emailclassification_data.xlsx',
    sheet_name='sheet_name',
    index_col=None,
    na_values=['NA'],
)

# Keep only the two columns used downstream, then drop ALL duplicate rows and
# every row with a missing GUID or note.
df = df[['TemplateGUID', 'Notes']].drop_duplicates().dropna()

# Load the sheet that maps each template GUID to its template name.
# pd.read_excel closes the workbook after reading, so no ExcelFile handle is
# left open and `df2` is only ever bound to a DataFrame.
df2 = pd.read_excel('ID-categoryname_mapping.xlsx', sheet_name='Sheet1')
df2 = df2[['Template name', 'Template GUID']]

# Map every TemplateGUID in df to its template name from df2.
# The lookup table is built once (one pass over df2) instead of rescanning
# df2 for every row of df. setdefault keeps the FIRST name seen for a GUID,
# so a GUID that appears several times in df2 resolves to its first row.
guid_to_name = {}
for guid, template_name in zip(df2['Template GUID'], df2['Template name']):
    guid_to_name.setdefault(guid, template_name)

# GUIDs that are absent from the mapping sheet get an explicit placeholder.
name_list = [
    guid_to_name.get(guid, "No template name found")
    for guid in df['TemplateGUID']
]

# assign() returns a new frame, so the column is never written into a slice.
df = df.assign(TemplateName=name_list)
# Explicit copy: later cells add columns to df_final, which would otherwise
# write into a slice of df and can trigger SettingWithCopyWarning.
df_final = df[['TemplateName', 'Notes', 'TemplateGUID']].copy()
df_final.head()


## Load model

In [None]:
# Testing phase: rebuild the vectorizer around the saved vocabulary, then load
# the label mapping and the trained classifier from disk.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load these pickles from a trusted source.
with open(r"vocab_colab_enhance.pkl", 'rb') as vocab_file:
    tf1 = pickle.load(vocab_file)

tfidf = TfidfVectorizer(sublinear_tf=True,
                        min_df=1, norm='l2',
                        encoding='latin-1',
                        ngram_range=(1, 2),
                        stop_words='english',
                        vocabulary=tf1)

# Load the label mapping: later cells match column 1 against the predicted
# class id and read the class name from column 0.
# NOTE(review): `map` shadows the built-in map(); the name is kept because
# the following cells read it.
map = pd.read_csv(r'mapping.csv', header=None)

# Load the trained model from disk; the context manager closes the file
# handle as soon as unpickling finishes.
filename = r'finalized_model_colab_enhance.sav'
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Print the class name (mapping column 0) stored for each class id 0..19
# (mapping column 1). .item() requires exactly one matching row per id.
for class_id in range(20):
    rows_for_class = map[map[1] == class_id]
    print(rows_for_class[0].item())

## Run prediction

In [None]:
# Predict a class name for every note and store it in df_final.
predicted_value = []

# Class id (mapping column 1) -> class name (mapping column 0), built once
# instead of re-filtering the mapping frame for every note.
# verify_integrity raises if the same class id appears twice in the mapping.
label_by_id = map.set_index(1, verify_integrity=True)[0].to_dict()

for note in df_final.Notes:
    # NOTE(review): fit_transform re-fits the vectorizer on this single note,
    # so IDF weights are re-learned per note - confirm this matches how the
    # model's training features were built. Kept as-is to preserve results.
    x = tfidf.fit_transform([note])
    predicted_id = loaded_model.predict(x)[0]
    predicted_value.append(label_by_id[predicted_id])

df_final["Predicted Class"] = predicted_value

In [None]:
# Preview the first five rows, now including the "Predicted Class" column.
df_final.head(n=5)

In [None]:
# Count how many notes fall into each (true template, predicted class) pair.
pair_groups = df_final.groupby(["TemplateName", "Predicted Class"])
pair_groups.size()

In [None]:
# Overall accuracy: share of notes whose predicted class equals the mapped
# template name. The comparison is vectorised, and no built-in (`sum`) is
# shadowed by a counter variable.
n_total = len(df_final)
n_correct = int((df_final['TemplateName'] == df_final['Predicted Class']).sum())

# An empty frame would otherwise raise ZeroDivisionError; report NaN instead.
accuracy = n_correct / n_total if n_total else float('nan')
perc = accuracy * 100.0
"Prediction accuracy = {:.2f} %".format(perc)
    

## Using score value: does this improve accuracy? 
  
### The confidence score for a sample is the signed distance of that sample to the hyperplane.
### Example  
<!-- <img> <src="https://i.stack.imgur.com/ONxTh.png" width ="500" height=500 > -->
![ONxTh%5B1%5D.png](attachment:ONxTh%5B1%5D.png)

## Production Simulation based on score

In [None]:
# Production simulation: a prediction is accepted only when the model's
# probability estimate for the predicted class exceeds `score_bench`;
# otherwise the note is reported as "not classified".
predicted_value = []
score_value = []
score_bench = 0.1

# Class id (mapping column 1) -> class name (mapping column 0), built once
# instead of re-filtering the mapping frame for every note.
# verify_integrity raises if the same class id appears twice in the mapping.
label_by_id = map.set_index(1, verify_integrity=True)[0].to_dict()

# Position of each class label inside the probability row. Indexing the row
# by the label value itself is only correct when the labels are 0..k-1.
# NOTE(review): assumes the probability columns follow loaded_model.classes_.
column_of_class = {
    label: position for position, label in enumerate(loaded_model.classes_)
}

for note in df_final.Notes:
    # NOTE(review): fit_transform re-fits the vectorizer on this single note,
    # so IDF weights are re-learned per note - confirm this matches how the
    # model's training features were built. Kept as-is to preserve results.
    x = tfidf.fit_transform([note])
    predicted_id = loaded_model.predict(x)[0]
    # NOTE(review): _predict_proba_lr is a private scikit-learn helper and may
    # change between versions; kept so scores stay comparable with score_bench.
    probabilities = loaded_model._predict_proba_lr(x)
    score = float(probabilities[0, column_of_class[predicted_id]])
    if score > score_bench:
        predicted_value.append(label_by_id[predicted_id])
    else:
        predicted_value.append("not classified")
    score_value.append(score)

df_final["Predicted Class"] = predicted_value
df_final["Score"] = score_value
df_final_filter_score = df_final[df_final["Predicted Class"] != "not classified"]

# Accuracy is measured on the classified notes only.
n_classified = len(df_final_filter_score)
n_correct = int(
    (
        df_final_filter_score['TemplateName']
        == df_final_filter_score['Predicted Class']
    ).sum()
)
# With a high threshold nothing may be classified; report NaN instead of
# raising ZeroDivisionError.
accuracy = n_correct / n_classified if n_classified else float('nan')
perc = accuracy * 100.0
print("Prediction accuracy = {:.2f} %".format(perc))
print("classified record: ", n_classified)
print("unclassified record: ", len(df_final) - n_classified)
print("total record: ", len(df_final))
 

### The experiment results show that accuracy is greatly affected by the classes in the data set that the model was not trained on.