In [None]:
import os
import pickle
import re

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
# Load the labelled training data set.
# Missing cells in every column (labels included) are replaced with the
# placeholder string 'blank_Value' so later string operations do not hit NaN.
df_obf = pd.read_excel("Data/New_data_set.xlsx").fillna('blank_Value')

In [None]:
# Class balance: bar chart of how many rows carry each Category label
plt.figure(figsize=(10, 4))
df_obf['Category'].value_counts().plot.bar()

In [None]:
# Make the NLTK stop-word corpus available to stopwords.words() below.
nltk.download('stopwords')

# Raw string literals: the backslashes in \[ \] \| must reach the regex engine
# unchanged. In a normal string they are invalid Python escape sequences, which
# still compile to the same pattern but emit a warning on recent Python versions.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  # punctuation turned into a space
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')          # every other disallowed character, deleted
STOPWORDS = set(stopwords.words('english'))            # set for O(1) membership tests

def clean_text(text):
    """Normalize one free-text value for bag-of-words vectorization.

    The text is lowercased, every match of ``REPLACE_BY_SPACE_RE`` is replaced
    by a space, every match of ``BAD_SYMBOLS_RE`` is deleted, tokens found in
    ``STOPWORDS`` are dropped, and the remaining tokens are joined with single
    spaces.

    Parameters
    ----------
    text : object
        Value to clean. Strings are used as-is; ``None`` is treated as an
        empty string; any other value (e.g. a numeric spreadsheet cell) is
        converted with ``str()`` instead of raising ``AttributeError``.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    if text is None:
        return ''
    if not isinstance(text, str):
        # Spreadsheet columns can mix strings with numbers; coerce so .lower() works.
        text = str(text)

    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # separators become whitespace
    text = BAD_SYMBOLS_RE.sub('', text)        # remaining disallowed symbols are removed
    # split() with no argument also collapses runs of whitespace
    return ' '.join(word for word in text.split() if word not in STOPWORDS)
    
# df_obf['PN'] = df_obf['PN'].apply(clean_text)   # PN column cleaning is currently disabled
# Lowercase the labels so the same category written in different cases is one class
df_obf['Category'] = df_obf['Category'].str.lower()
# Normalize every remark with clean_text (element-wise)
df_obf['Remark'] = df_obf['Remark'].map(clean_text)

In [None]:
# Features are the cleaned remarks; targets are the lowercased category labels
X = df_obf['Remark']
y = df_obf['Category']

# Hold out 25% of the rows for evaluation. A fixed random_state makes the split,
# and therefore every metric and exported model below, reproducible across
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Class names used as target_names in classification_report.
# unique() returns labels in order of first appearance, but scikit-learn lists
# classes in sorted label order; sorting here keeps each name on the right row.
my_mod = sorted(df_obf['Category'].unique().tolist())
my_mod

In [None]:
def model_export(name, model):
    """Pickle ``model`` to ``models/<name>.pkl`` for future use.

    Parameters
    ----------
    name : str
        File stem of the pickle; the file is written to ``models/<name>.pkl``
        relative to the current working directory.
    model : object
        Any picklable object (here: a fitted scikit-learn pipeline).

    Notes
    -----
    The target directory is created when missing, so the export also works on
    a fresh checkout that has no ``models/`` folder yet.
    """
    model_pkl_file = "models/" + name + ".pkl"

    # open(..., 'wb') does not create parent directories; do it explicitly.
    os.makedirs(os.path.dirname(model_pkl_file), exist_ok=True)

    with open(model_pkl_file, 'wb') as file:
        pickle.dump(model, file)

In [None]:
# Model 1: Multinomial Naive Bayes on TF-IDF weighted token counts
nb = Pipeline([
    ('vect', CountVectorizer()),     # text -> token count matrix
    ('tfidf', TfidfTransformer()),   # counts -> TF-IDF weights
    ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

# Export the fitted pipeline (vectorizer + classifier) as models/mnb.pkl
model_export('mnb', nb)

# scikit-learn metrics take (y_true, y_pred) in that order
print('accuracy %s' % accuracy_score(y_test, y_pred))
# labels= pins which class each report row refers to, so target_names cannot be
# attached to the wrong class, and the call does not fail when some class is
# absent from this particular test split.
print(classification_report(y_test, y_pred, labels=my_mod, target_names=my_mod))

In [None]:
# Model 2: linear SVM (hinge loss) trained with stochastic gradient descent
sgd = Pipeline([
    ('vect', CountVectorizer()),     # text -> token count matrix
    ('tfidf', TfidfTransformer()),   # counts -> TF-IDF weights
    # tol=None disables the early-stopping tolerance, so exactly max_iter=5 epochs are run
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          random_state=42, max_iter=5, tol=None)),
])
sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)

# Export the fitted pipeline as models/svm.pkl
model_export('svm', sgd)

# scikit-learn metrics take (y_true, y_pred) in that order
print('accuracy %s' % accuracy_score(y_test, y_pred))
# labels= pins which class each report row refers to, so target_names cannot be
# attached to the wrong class, and the call does not fail when some class is
# absent from this particular test split.
print(classification_report(y_test, y_pred, labels=my_mod, target_names=my_mod))

In [None]:
# Model 3: Logistic Regression on TF-IDF weighted token counts
logreg = Pipeline([
    ('vect', CountVectorizer()),     # text -> token count matrix
    ('tfidf', TfidfTransformer()),   # counts -> TF-IDF weights
    # C is the inverse regularization strength; 1e5 means very weak regularization
    ('clf', LogisticRegression(n_jobs=1, C=1e5, max_iter=1000)),
])
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

# Export the fitted pipeline as models/logiReg.pkl
model_export('logiReg', logreg)

# scikit-learn metrics take (y_true, y_pred) in that order
print('accuracy %s' % accuracy_score(y_test, y_pred))
# labels= pins which class each report row refers to, so target_names cannot be
# attached to the wrong class, and the call does not fail when some class is
# absent from this particular test split.
print(classification_report(y_test, y_pred, labels=my_mod, target_names=my_mod))

In [None]:
### Analyze each model's output (accuracy, precision, recall, f1-score, and support) to evaluate the models and choose the best one for your project