# Imports

In [1]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from gensim.models import Word2Vec
from gensim.models import FastText
from nltk import word_tokenize
from preprocessing import PreProcessor #this is our own preprocessor
from sys import stdout
import warnings 
import numpy as np
import tqdm
import scipy
import time
import pandas as pd
import matplotlib.pyplot as plt
warnings.simplefilter('ignore')
preprocess = PreProcessor().preprocess

# Vectorization

In [2]:
tfidfvect = TfidfVectorizer(ngram_range=(1,1))
tfidfngramsvecttwograms = TfidfVectorizer(ngram_range=(1,2))
tfidfngramsvectthreegrams = TfidfVectorizer(ngram_range=(1,3))

def tfidf(corpus, ngrams):
    if ngrams == 1:
        return tfidfvect.fit_transform(corpus)
    elif ngrams == 2:
        return tfidfngramsvecttwograms.fit_transform(corpus)
    else: return tfidfngramsvectthreegrams.fit_transform(corpus)

def word2vec(corpus, size):
    tokenized = [word_tokenize(row) for row in corpus]
    model = Word2Vec(tokenized, size=size, workers=8)
    vectors = []
    for i, row in enumerate(tokenized):
        sentence_vectors = [model.wv[word] for word in row if word in model.wv]
        if len(sentence_vectors) == 0:
            vectors.append([0] * size)
        else:
            sentence_vector = np.average(sentence_vectors, axis=0)
            vectors.append(sentence_vector)
    return vectors, model

def fasttext(corpus, size):
    tokenized = [word_tokenize(row) for row in corpus]
    model = FastText(tokenized, size=size, workers=2)
    vectors = []
    for i, row in enumerate(tokenized):
        sentence_vectors = [model.wv[word] for word in row]
        if len(sentence_vectors) == 0:
            vectors.append([0] * size)
        else:
            sentence_vector = np.average(sentence_vectors, axis=0)
            vectors.append(sentence_vector)
    return vectors, model

In [3]:
def Vectorize(data):
    vectors = []
    
    print("Preprocessing for Linear SVC")
    MinimalStripping = data.apply(lambda d: preprocess(d , lower_case=False, strip=True, punctuation=False, numbers=False, unicode=False, cut_off=False, stop_words=False, stemming=False, lemmatizing=False, min_word_length=-1, max_word_length=-1, tokenize=False))
    print("Done preprocessing, Vectorizing for Linear SVC")
    vectors.append(tfidf(MinimalStripping, 2))
    
    print("Preprocessing for Random Forest")
    DefaultStripping = data.apply(lambda d: preprocess(d, lower_case=True, strip=True, punctuation=True, numbers=True, unicode=True, cut_off=True, stop_words=True, stemming=False, lemmatizing=True, min_word_length=1, max_word_length=-1, tokenize=False))
    print("Done preprocessing, Vectorizing for Random Forest")
    vectors.append(tfidf(DefaultStripping, 1))
    
    print("Preprocessing for Linear Discriminant")
    ExtremeStripping = data.apply(lambda d: preprocess(d, lower_case=True, strip=True, punctuation=True, numbers=True, unicode=True, cut_off=True, stop_words=True, stemming=False, lemmatizing=True, min_word_length=1, max_word_length=-1, tokenize=False))
    print("Done preprocessing, Vectorizing for Linear Discriminant")
    fasttextvectors, fasttextmodel = fasttext(ExtremeStripping, 64)
    vectors.append(fasttextvectors)
    
    print("Preprocessing for Decision Tree")
    DefaultStrippingStemming = data.apply(lambda d: preprocess(d, lower_case=True, strip=True, punctuation=True, numbers=True, unicode=True, cut_off=True, stop_words=False, stemming=True, lemmatizing=False, min_word_length=1, max_word_length=-1, tokenize=False))
    print("Done preprocessing, Vectorizing for Decision Tree")
    w2vvectors, w2vmodel = word2vec(DefaultStrippingStemming, 256)
    vectors.append(w2vvectors)
    
    print("No preprocessing needed for Logistic Regression, Vectorizing for Logistic Regression")
    vectors.append(tfidf(MinimalStripping, 3))
    
    print("No preprocessing needed for Multinomial Naive Bayes, Vectorizing for Multinomial Naive Bayes")
    w2vvectors2, w2vmodel2 = word2vec(DefaultStrippingStemming, 256)
    vectors.append(w2vvectors2)
    
    return vectors, w2vmodel, w2vmodel2, fasttextmodel

# Vectorization for new predictions

In [4]:
def PreprocessMinimalTwoGrams(sentence, model):
    return tfidfngramsvecttwograms.transform([preprocess(sentence, lower_case=False, strip=True, punctuation=False, numbers=False, unicode=False, cut_off=False, stop_words=False, stemming=False, lemmatizing=False, min_word_length=-1, max_word_length=-1, tokenize=False)])

def PreprocessMinimalThreeGrams(sentence, model):
    return tfidfngramsvectthreegrams.transform([preprocess(sentence, lower_case=False, strip=True, punctuation=False, numbers=False, unicode=False, cut_off=False, stop_words=False, stemming=False, lemmatizing=False, min_word_length=-1, max_word_length=-1, tokenize=False)])

def PreprocessDefault(sentence, model):
    return tfidfvect.transform([preprocess(sentence, lower_case=True, strip=True, punctuation=True, numbers=True, unicode=True, cut_off=True, stop_words=True, stemming=False, lemmatizing=True, min_word_length=1, max_word_length=-1, tokenize=False)])

def PreprocessExtremeStemming(sentence, model):
    PExtremeStemming = preprocess(str(sentence), lower_case=True, strip=True, punctuation=True, numbers=True, unicode=True, cut_off=True, stop_words=True, stemming=False, lemmatizing=True, min_word_length=1, max_word_length=-1, tokenize=False)
    return np.average([model.wv[word] for word in word_tokenize(PExtremeStemming) if word in model.wv], axis=0)

def PreprocessDefaultStemming(sentence, model):
    PDefaultStemming = preprocess(str(sentence), lower_case=True, strip=True, punctuation=True, numbers=True, unicode=True, cut_off=True, stop_words=False, stemming=True, lemmatizing=False, min_word_length=1, max_word_length=-1, tokenize=True)
    vectors = [model.wv[word] for word in PDefaultStemming if word in model.wv]
    if len(vectors) == 0:
        return np.random.uniform(low=-1, high=1, size=(256,))
    else:
        return np.average(vectors, axis=0)

# Model training

In [5]:
def CalcAccuracyPerCategory(model, X, Y):
    unique = Y.unique()
    result = pd.DataFrame(unique, columns = ['Category']) 
    result['true'] = 0
    result['false'] = 0
    result['accuracy'] = 0
    
    y_pred = model.predict(X)
    
    for index in range(len(y_pred)):
        
        i = result.index[result['Category'] == Y.values[index]].tolist()[0]
        if Y.values[index] == y_pred[index]: 
             result.at[i,'true'] = result.at[i,'true']+1
        else: 
             result.at[i,'false'] = result.at[i,'false']+1
                
    
    for i, row in result.iterrows():
        total = row['true'] + row['false']
        result.at[i,'accuracy'] = row['true']/total * 100
    return result
        
def TrainModels(dataframe, vectors, w2vmodel, w2vmodel2, fasttextmodel):
    models = []    
    
    trainX, testX, trainY, testY = train_test_split(vectors[0], dataframe['Category'], test_size=0.2, random_state=0)
    trainX, validateX, trainY, validateY = train_test_split(trainX, trainY, test_size=0.2, random_state=0) 
    modelSVC = LinearSVC().fit(trainX, trainY)
    accuracySVC = CalcAccuracyPerCategory(modelSVC, validateX, validateY)
    models.append([modelSVC, accuracySVC, "Linear SVC", PreprocessMinimalTwoGrams, None])
    print("Done training model Linear SVC")
    
    
    trainX, testX, trainY, testY = train_test_split(vectors[1], dataframe['Category'], test_size=0.2, random_state=0)
    trainX, validateX, trainY, validateY = train_test_split(trainX, trainY, test_size=0.2, random_state=0)
    modelRF = RandomForestClassifier().fit(trainX, trainY)
    accuracyRF = CalcAccuracyPerCategory(modelRF, validateX, validateY)
    models.append([modelRF, accuracyRF, "Random Forest", PreprocessDefault, None])
    print("Done training model Random Forest")
    
    
    trainX, testX, trainY, testY = train_test_split(vectors[2], dataframe['Category'], test_size=0.2, random_state=0)
    trainX, validateX, trainY, validateY = train_test_split(trainX, trainY, test_size=0.2, random_state=0)
    modelLD = LinearDiscriminantAnalysis().fit(trainX, trainY)
    accuracyLD = CalcAccuracyPerCategory(modelLD, validateX, validateY)
    models.append([modelLD, accuracyLD, "Linear Discriminant", PreprocessExtremeStemming, fasttextmodel])
    print("Done training model Linear Discriminant")

    
    trainX, testX, trainY, testY = train_test_split(vectors[3], dataframe['Category'], test_size=0.2, random_state=0)
    trainX, validateX, trainY, validateY = train_test_split(trainX, trainY, test_size=0.2, random_state=0)      
    modelDT = DecisionTreeClassifier().fit(trainX, trainY)
    accuracyDT = CalcAccuracyPerCategory(modelDT, validateX, validateY)
    models.append([modelDT, accuracyDT, "Decision Tree", PreprocessDefaultStemming, w2vmodel])
    print("Done training model Decision Tree")
    
    trainX, testX, trainY, testY = train_test_split(vectors[4], dataframe['Category'], test_size=0.2, random_state=0)
    trainX, validateX, trainY, validateY = train_test_split(trainX, trainY, test_size=0.2, random_state=0)
    lrc = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
    lrc.fit(trainX, trainY)
    accuracyLR = CalcAccuracyPerCategory(lrc, validateX, validateY)
    models.append([lrc, accuracyLR, "Logistic Regression", PreprocessMinimalThreeGrams, None])
    print("Done training model Logistic Regression")
    
    trainX, testX, trainY, testY = train_test_split(vectors[5], dataframe['Category'], test_size=0.2, random_state=0)
    trainX, validateX, trainY, validateY = train_test_split(trainX, trainY, test_size=0.2, random_state=0)
    gnb = GaussianNB()
    gnb.fit(trainX, trainY)
    accuracyNB = CalcAccuracyPerCategory(gnb, validateX, validateY)
    models.append([gnb, accuracyNB, "Gaussian Naive Bayes", PreprocessDefaultStemming, w2vmodel])
    print("Done training model Gaussian Naive Bayes")
    
    time.sleep(1)
    return models

# Prediction

In [6]:
def percentage(x, total):
    return x/total * 100

def Predict(models, sentence):
    data = []
    
    for model in models:
        preprocessedsentencevector = model[3](sentence, model[4])
        try:
            prediction = list(model[0].predict(preprocessedsentencevector))[0]
        except:
            prediction = list(model[0].predict([preprocessedsentencevector]))[0]
        index = model[1].index[model[1]['Category'] == prediction].tolist()[0]
        accuracy = model[1].at[index,'accuracy']
        data.append([prediction, accuracy])
        
    result = pd.DataFrame(data, columns = [ 'Category', 'Accuracy']) 
    result = result.groupby(['Category'], as_index=False).sum()
    totalaccuracy = result['Accuracy'].sum()
    
    result['Accuracy'] = result['Accuracy'].apply(lambda d: percentage(d, totalaccuracy))
    return result.sort_values(by=['Accuracy'], ascending=False).values[0][0]

def PredictMultiple(models, sentences):
    print("Testing multiple models")
    y_pred = []
    time.sleep(1)
    failedrecords = []
    for sentence in tqdm.tqdm(sentences):
        try:
            y_pred.append(Predict(models, sentence))
        except:
            y_pred.append("NaN")
            for model in models:
                failedrecords.append([sentence, model[3](sentence, model[4])])
    return y_pred, failedrecords

# Where the magic happens

In [7]:

df = pd.read_csv('data/Mapped.csv')
df.columns = ["Vendor", "Category", "Item", "Item Description", "Price", "Origin", 'Destination', "Rating", "Remarks", "randomshit"]
df['Category'] = df["Category"].apply(lambda d: d.split('/', 1)[0])
vectors, w2vmodel, w2vmodel2, fasttextmodel = Vectorize(df["Item"] + " " + df['Item Description'])
models = TrainModels(df, vectors, w2vmodel, w2vmodel2, fasttextmodel)


trainX, testX, trainY, testY = train_test_split(df["Item"] + " " + df['Item Description'], df['Category'], test_size=0.2, random_state=0)
trainX, validateX, trainY, validateY = train_test_split(trainX, trainY, test_size=0.2, random_state=0)
y_pred, failed = PredictMultiple(models, testX)

Preprocessing for Linear SVC
Done preprocessing, Vectorizing for Linear SVC
Preprocessing for Random Forest
Done preprocessing, Vectorizing for Random Forest
Preprocessing for Linear Discriminant
Done preprocessing, Vectorizing for Linear Discriminant
Preprocessing for Decision Tree
Done preprocessing, Vectorizing for Decision Tree
No preprocessing needed for Logistic Regression, Vectorizing for Logistic Regression
No preprocessing needed for Multinomial Naive Bayes, Vectorizing for Multinomial Naive Bayes
Done training model Linear SVC
Done training model Random Forest
Done training model Linear Discriminant
Done training model Decision Tree
Done training model Logistic Regression
Done training model Gaussian Naive Bayes
Testing multiple models


100%|██████████| 22115/22115 [23:21<00:00, 15.78it/s]


In [14]:
precision_recall_fscore_support(testY.values, y_pred, average='micro')

(0.9494912955007914, 0.9494912955007914, 0.9494912955007914, None)