In [None]:
import pandas as pd

# Load the raw export. The workbook is opened in a `with` block so the file
# handle is released as soon as the sheet has been parsed.
with pd.ExcelFile('data.xlsx') as workbook:
  df = workbook.parse('sheetname', skiprows=3, index_col=None, na_values=['NA'])

# Only the template GUID and the free-text note are used.
df = df[['TemplateGUID', 'Notes']]
print(len(df))

# Drop exact duplicate rows, then every row missing a GUID or a note.
df = df.drop_duplicates().dropna()
print(len(df))
print(df.head())
# info() writes its report itself and returns None, so it is not wrapped in print().
df.info()

In [None]:
# Positional access: the original row labels survive the drop_duplicates/dropna
# step, so label 0 is not guaranteed to exist any more.
df.Notes.iloc[0]

In [None]:
# Load the GUID -> template-name lookup table.
# The workbook is opened in a `with` block so the file handle is closed after parsing.
with pd.ExcelFile('Template - Summary mapping.xlsx') as mapping_workbook:
  df2 = mapping_workbook.parse('Sheet1')
df2 = df2[['Template name','Template GUID']]
df2.head()

In [None]:
# Attach a template name to every note through its GUID.
# A dictionary lookup replaces a full scan of df2 for each row of df.
NO_TEMPLATE_NAME = "No template name found"

guid_to_name = {}
for guid, template_name in zip(df2['Template GUID'], df2['Template name']):
  # setdefault keeps the first name seen for a GUID, i.e. first match wins.
  guid_to_name.setdefault(guid, template_name)

name_list = [guid_to_name.get(guid, NO_TEMPLATE_NAME) for guid in df.TemplateGUID]
print(len(name_list))
df['TemplateName'] = name_list
df_final = df[['TemplateName','Notes','TemplateGUID']]
df_final.head()

In [None]:
# Number of notes per template name, most frequent first.
df.TemplateName.value_counts()

## Keeping only templates that have more than 500 notes

In [None]:
# Keep only notes whose template occurs more than MIN_NOTES_PER_TEMPLATE times.
MIN_NOTES_PER_TEMPLATE = 500

template_counts = df['TemplateName'].value_counts()
# .copy() makes df1 an independent frame, so adding columns to it later
# neither warns (SettingWithCopyWarning) nor touches df.
df1 = df[df['TemplateName'].map(template_counts) > MIN_NOTES_PER_TEMPLATE].copy()
df1['TemplateName'].value_counts()

## Assigning a numeric label to each unique template name

In [None]:
from io import StringIO

# factorize() numbers the template names 0..k-1 in order of first appearance.
# assign() returns a new frame instead of writing into a possible slice of df,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
df1 = df1.assign(category_id=df1['TemplateName'].factorize()[0])

# One row per (TemplateName, category_id) pair, ordered by id.
category_id_df = df1[['TemplateName', 'category_id']].drop_duplicates().sort_values('category_id')
# Lookup tables in both directions: name -> id and id -> name.
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'TemplateName']].values)

In [None]:

# Preview the first five rows of the filtered, labelled frame.
df1.head(n=5)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many notes each template has.
fig, ax = plt.subplots(figsize=(8, 6))
notes_per_template = df1.groupby('TemplateName')['Notes'].count()
notes_per_template.plot.bar(ylim=0, ax=ax)
plt.show()

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Vectoriser settings: log-scaled term frequency, L2-normalised rows,
# unigrams and bigrams, English stop words removed, terms seen in fewer than
# 5 notes ignored, vocabulary capped at 2000 entries.
tfidf_options = dict(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
    max_features=2000,
)
tfidf = TfidfVectorizer(**tfidf_options)

# Fit on the notes and materialise the sparse result as a dense array.
note_term_matrix = tfidf.fit_transform(df1['Notes'])
features = note_term_matrix.toarray()
labels = df1['category_id']
features.shape

In [None]:
# pickle is imported here so the cell also works on a fresh kernel.
import pickle

# Persist the fitted term -> column-index vocabulary.
test = tfidf.vocabulary_
# The with-block guarantees the file is flushed and closed.
with open("vocab.pkl", "wb") as vocab_file:
  pickle.dump(test, vocab_file)

In [None]:
# Dense TF-IDF matrix built from the notes; left as a bare expression so the
# notebook renders it as the cell output.
features

In [None]:
from sklearn.feature_selection import chi2
import numpy as np

# For each template, print the N unigrams and N bigrams with the highest
# chi-squared score against a one-vs-rest indicator for that template.
N = 2

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
get_names = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
# The vocabulary is the same for every template, so it is fetched once.
all_feature_names = np.asarray(get_names())

for TemplateName, category_id in sorted(category_to_id.items()):
  # chi2 returns (scores, p-values); only the scores are used.
  chi2_scores = chi2(features, labels == category_id)[0]
  # argsort is ascending, so the most correlated terms sit at the end.
  indices = np.argsort(chi2_scores)
  feature_names = all_feature_names[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("# '{}':".format(TemplateName))
  print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
  print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Split the raw notes and their template names with a fixed seed.
split = train_test_split(df1['Notes'], df1['TemplateName'], random_state=0)
X_train, X_test, y_train, y_test = split

# Token counts first, then TF-IDF weighting on top of the counts.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# fit() returns the estimator itself, so clf is the trained classifier.
clf = MultinomialNB()
clf = clf.fit(X_train_tfidf, y_train)

## Testing

In [None]:
# NOTE: `input` shadows the builtin; the name is kept because a later cell reads it.
# Positional access: label 0 is not guaranteed to survive the filtering of df1.
input = [df1.Notes.iloc[0]]
print(input)
# The classifier was trained on TF-IDF weighted counts, so the sample must go
# through the same two steps (counts -> TF-IDF) before prediction.
input_counts = count_vect.transform(input)
print(clf.predict(tfidf_transformer.transform(input_counts)))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_val_score

# Candidate classifiers, all evaluated on the same TF-IDF features.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0,max_iter=1000,multi_class='auto',solver='lbfgs'),
]
# Number of cross-validation folds.
CV = 5

# One (model_name, fold_idx, accuracy) record per model and fold.
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
  entries.extend(
      (model_name, fold_idx, accuracy) for fold_idx, accuracy in enumerate(accuracies)
  )

# Built once from the collected records; no empty placeholder frame is needed.
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
import seaborn as sns

# Box plot of per-fold accuracy per model, with the individual folds overlaid.
shared_axes = dict(x='model_name', y='accuracy', data=cv_df)
accuracy_ax = sns.boxplot(**shared_axes)
sns.stripplot(size=8, jitter=True, edgecolor="gray", linewidth=2,
              ax=accuracy_ax, **shared_axes)
plt.show()

In [None]:
# Mean cross-validated accuracy of each model.
cv_df.groupby('model_name')['accuracy'].mean()

In [None]:
from sklearn.model_selection import train_test_split

model = LinearSVC()

# Split features, labels and the df1 row labels together so that every test
# prediction can be traced back to its original row.
split_parts = train_test_split(features, labels, df1.index,
                               test_size=0.33, random_state=0)
X_train, X_test, y_train, y_test, indices_train, indices_test = split_parts

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# category_id_df is sorted by category_id, so ids and names line up by position.
category_ids = category_id_df.category_id.values
category_names = category_id_df.TemplateName.values

# Passing labels= fixes the row/column order to the category ids, so row i and
# column i always mean category i even if a class is missing from the test split.
conf_mat = confusion_matrix(y_test, y_pred, labels=category_ids)

fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=category_names, yticklabels=category_names, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
from IPython.display import display

# For every (actual, predicted) pair of different categories confused at least
# 6 times, print the pair and show the affected notes.
for predicted in category_id_df.category_id:
  for actual in category_id_df.category_id:
    if predicted == actual:
      continue
    n_confused = conf_mat[actual, predicted]
    if n_confused < 6:
      continue
    print("'{}' predicted as '{}' : {} examples.".format(id_to_category[actual], id_to_category[predicted], n_confused))
    # Row labels of the test samples that fall into this confusion cell.
    confused_rows = indices_test[(y_test == actual) & (y_pred == predicted)]
    display(df.loc[confused_rows][['TemplateName', 'Notes']])
    print('')

In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 on the held-out split.
template_names = df1['TemplateName'].unique()
report = metrics.classification_report(y_test, y_pred, target_names=template_names)
print(report)

In [None]:
# Refit the model on the complete feature matrix and label set.
model.fit(X=features, y=labels)

In [None]:
from sklearn.feature_selection import chi2

# For each template, list the N unigrams and N bigrams with the largest model
# coefficients for that template's class.
N = 3

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
get_names = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
# The vocabulary does not change between templates, so it is fetched once.
all_feature_names = np.asarray(get_names())

for TemplateName, category_id in sorted(category_to_id.items()):
  # argsort is ascending; reverse so the largest coefficients come first.
  indices = np.argsort(model.coef_[category_id])
  ranked_names = all_feature_names[indices][::-1]
  unigrams = [v for v in ranked_names if len(v.split(' ')) == 1][:N]
  bigrams = [v for v in ranked_names if len(v.split(' ')) == 2][:N]
  print("# '{}':".format(TemplateName))
  print("  . Top unigrams:\n       . {}".format('\n       . '.join(unigrams)))
  print("  . Top bigrams:\n       . {}".format('\n       . '.join(bigrams)))

In [None]:
# Classify one sample note with the model.
# Positional access: label 0 is not guaranteed to survive the filtering of df1.
texts = [df1.Notes.iloc[0]]
print(texts)
# Transform the same list that is printed and zipped below, rather than a
# variable left over from an earlier cell.
text_features = tfidf.transform(texts)
predictions = model.predict(text_features)
for text, predicted in zip(texts, predictions):
  print('"{}"'.format(text))
  print("  - Predicted as: '{}'".format(id_to_category[predicted]))
  print("")

In [None]:
!pip install joblib

In [None]:
from sklearn import metrics

# Precision / recall / F1 per template for the held-out predictions.
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=df1.TemplateName.unique(),
    )
)

In [None]:
# Save the model to disk with pickle.
import pickle

filename = 'finalized_model.sav'
# The with-block guarantees the file is flushed and closed.
with open(filename, 'wb') as model_file:
  pickle.dump(model, model_file)

In [None]:
# `dump` must be imported explicitly; nothing earlier in the notebook brings it into scope.
from joblib import dump

# Persist the model in joblib format.
dump(model, "linearsvc.joblib")

In [None]:
from joblib import load

# Read the joblib file back into memory under the name `pipeline`.
pipeline = load(filename="linearsvc.joblib")

In [None]:
# A hand-written sample ticket, split line by line for readability
# (adjacent string literals are concatenated into one string).
input = [
    'Account ID:  RKHS’\n'
    'USER Name : TAN \n'
    'Hi  seems my User ID been locked\n'
    'Please assist and reset password.\n'
    ' Thanks & Regards \n'
    'Jasmine Tan'
]
# Vectorise it with the already fitted TF-IDF vectoriser.
text_features = tfidf.transform(input)
print(text_features.shape)

In [None]:
# Round-trip the model through an in-memory pickle and predict with the
# restored copy.
s = pickle.dumps(model)
linear_svc= pickle.loads(s)
linear_svc.predict(text_features)



In [None]:
# Predict with the joblib-loaded model.
predictions = pipeline.predict(text_features)
# text_features was built from `input`, so that is the list each prediction
# belongs to; pairing with any other list would mislabel the output.
for text, predicted in zip(input, predictions):
  print('"{}"'.format(text))
  print("  - Predicted as: '{}'".format(id_to_category[predicted]))
  print("")

In [None]:
# Export the id -> template-name mapping as a one-row CSV (one column per id).
# Named category_map so the builtin map() is not shadowed.
category_map = pd.DataFrame(id_to_category, index=[0])
category_map.to_csv("mapping.csv", header=True, index=False)

In [None]:
# Save the model to disk; the with-block closes the file once it is written.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
  pickle.dump(model, model_file)

# some time later...

# Load the model back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(filename, 'rb') as model_file:
  loaded_model = pickle.load(model_file)
result = loaded_model.predict(text_features)

In [None]:
# Show the predicted category id of the first sample.
first_prediction = result[0]
print(first_prediction)

In [None]:
# `result` was predicted from text_features, which was built from `input`,
# so each prediction is paired with its text from that list.
for text, predicted in zip(input, result):
  print('"{}"'.format(text))
  print("  - Predicted as: '{}'".format(id_to_category[predicted]))
  print("")

In [None]:
# Classify the fifth note of df1 with the reloaded model.
# Positional access: df1 keeps the original row labels after filtering, so
# label 4 is not guaranteed to exist.
input = [df1.Notes.iloc[4]]
print(input)
predicted = loaded_model.predict(tfidf.transform(input))
id_to_category[predicted[0]]

In [None]:
# Display the notes column of the filtered frame.
df1['Notes']