### Import required modules

In [1]:
import pandas as pd
import pickle
import random
from sklearn.svm import SVC 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

### Read training data

In [2]:
# Load the preprocessed training set; the first CSV column becomes the row index.
TRAIN_DATA_PATH = 'processed_data/train_data.csv'
train_data_df = pd.read_csv(TRAIN_DATA_PATH, index_col=0)

In [3]:
# Preview the first five rows (head()'s default count) to sanity-check the load.
train_data_df.head(n=5)

Unnamed: 0,Inv_Id,Vendor_Code,GL_Code,Inv_Amt,Item_Description,Product_Category,GL_Code_0,GL_Code_1,GL_Code_2,GL_Code_3,...,Item_Description_65,Item_Description_66,Item_Description_67,Item_Description_68,Item_Description_69,Item_Description_70,Item_Description_71,Item_Description_72,Item_Description_73,Item_Description_74
0,15001,VENDOR-1676,5,83.24,artworking typesetting production jun champion...,CLASS-1963,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,1
1,15002,VENDOR-1883,0,51.18,auto leasing corporate services corning inc ny...,CLASS-1250,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,15004,VENDOR-1999,2,79.02,store management lease rent deltona corp real ...,CLASS-1274,0.0,0.0,1.0,0.0,...,0,0,1,0,0,0,0,0,0,0
3,15005,VENDOR-1771,7,48.5,store construction general requirements coloni...,CLASS-1522,0.0,0.0,0.0,0.0,...,1,0,1,0,0,0,0,0,1,0
4,15006,VENDOR-1331,0,63.35,jul aydin corp contingent labor temp labor con...,CLASS-1376,1.0,0.0,0.0,0.0,...,0,0,0,2,0,0,0,0,0,0


### Selecting required data

In [4]:
# Column holding the target class (values such as "CLASS-1963" in the preview above).
LABEL_COLUMN = 'Product_Category'

# Positional bounds of the feature slice, kept from the original selection.
# NOTE(review): presumably this spans the encoded GL_Code_* ... Item_Description_*
# columns -- confirm against train_data_df.columns if the preprocessing changes.
FEATURE_START, FEATURE_STOP = 6, 90

X = train_data_df.iloc[:, FEATURE_START:FEATURE_STOP]

# Select the label by name rather than by position so a shifted column layout
# cannot silently swap in a different column as the target.
Y = train_data_df[LABEL_COLUMN]

# Guard against target leakage: the label must never be part of the features.
assert LABEL_COLUMN not in X.columns, "label column leaked into the feature matrix"

### Split training data into training and validation sets in an 80:20 ratio

In [5]:
# Hold out a fixed fraction of the rows for validation; the fixed seed keeps
# the split repeatable across runs.
VALIDATION_FRACTION = 0.20
SPLIT_SEED = 0

train_x, val_x, train_y, val_y = train_test_split(
    X,
    Y,
    test_size=VALIDATION_FRACTION,
    random_state=SPLIT_SEED,
)

### Train different classifiers

In [6]:
# Support-vector classifier with library-default hyperparameters.
# fit() returns the estimator itself, so construction and training are chained.
classifierSVC = SVC().fit(train_x, train_y)

# Trailing expression so the cell still displays the fitted estimator's repr.
classifierSVC



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [7]:
# Multinomial naive Bayes with library-default hyperparameters.
# fit() returns the estimator itself, so construction and training are chained.
classifierMNB = MultinomialNB().fit(train_x, train_y)

# Trailing expression so the cell still displays the fitted estimator's repr.
classifierMNB

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [8]:
# Predict labels for the held-out validation rows with each trained model,
# SVC first and MNB second, matching the unpacking order on the left.
SVC_pred_val_y, MNB_pred_val_y = (
    fitted_model.predict(val_x)
    for fitted_model in (classifierSVC, classifierMNB)
)

### Print accuracy score of different classifiers

In [9]:
# Fraction of validation rows whose predicted label matches the true label.
accuracySVC = accuracy_score(val_y, SVC_pred_val_y)
accuracyMNB = accuracy_score(val_y, MNB_pred_val_y)

# Report each score as a percentage, one line per classifier.
for _clf_name, _clf_accuracy in (("SVC", accuracySVC), ("MNB", accuracyMNB)):
    print("{} Classifier accuracy is {}%".format(_clf_name, str(_clf_accuracy * 100)))

SVC Classifier accuracy is 96.76840215439856%
MNB Classifier accuracy is 96.22980251346499%


In [11]:
# Serialize the trained SVC to disk.
# The context manager closes the file even if pickle.dump raises, which the
# manual open()/close() pair did not guarantee.
# Note: open() raises FileNotFoundError if the "pickled_classifiers" directory
# does not already exist.
with open("pickled_classifiers/svc_classifier.pickle", "wb") as save_svc_classifier:
    pickle.dump(classifierSVC, save_svc_classifier)

In [12]:
# Serialize the trained multinomial naive Bayes model to disk.
# The context manager closes the file even if pickle.dump raises, which the
# manual open()/close() pair did not guarantee.
# Note: open() raises FileNotFoundError if the "pickled_classifiers" directory
# does not already exist.
with open("pickled_classifiers/mnb_classifier.pickle", "wb") as save_mnb_classifier:
    pickle.dump(classifierMNB, save_mnb_classifier)