In [1]:
import numpy as np
import pandas as pd

import nltk
import re
import gensim

import os

import time

import gc

In [2]:
df = pd.read_csv('dataset.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Questions,Category0,Category1,Category2
0,0,How did serfdom develop in and then leave Russ...,DESCRIPTION,DESC,manner
1,1,What films featured the character Popeye Doyle ?,ENTITY,ENTY,cremat
2,2,How can I find a list of celebrities ' real na...,DESCRIPTION,DESC,manner
3,3,What fowl grabs the spotlight after the Chines...,ENTITY,ENTY,animal
4,4,What is the full form of .com ?,ABBREVIATION,ABBR,exp


In [4]:
#check unique values

df['Category0'].value_counts()

ENTITY          1250
HUMAN           1223
DESCRIPTION     1162
NUMERIC          896
LOCATION         835
ABBREVIATION      86
Name: Category0, dtype: int64

In [5]:
df['Category1'].value_counts()

ENTY    1250
HUM     1223
DESC    1162
NUM      896
LOC      835
ABBR      86
Name: Category1, dtype: int64

In [6]:
df['Category2'].value_counts()

ind          962
other        733
def          421
count        363
desc         321
manner       276
date         218
cremat       207
reason       191
gr           189
country      155
city         129
animal       112
food         103
dismed       103
termeq        93
period        75
money         71
exp           70
state         66
sport         62
event         56
product       42
substance     41
color         40
techmeth      38
dist          34
perc          27
veh           27
word          26
title         25
mount         21
body          16
abb           16
lang          16
plant         13
volsize       13
symbol        11
weight        11
instru        10
code           9
letter         9
speed          9
temp           8
ord            6
currency       4
religion       4
Name: Category2, dtype: int64

In [8]:
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)
df.reset_index(inplace=True, drop=True)

In [9]:
df.shape

(5452, 5)

In [12]:
#map category labels
df['cat_label'] = pd.factorize(df['Category1'])[0]
df['topic_label'] = pd.factorize(df['Category2'])[0]

In [13]:
df.head()

Unnamed: 0.1,Unnamed: 0,Questions,Category0,Category1,Category2,cat_label,topic_label
0,0,How did serfdom develop in and then leave Russ...,DESCRIPTION,DESC,manner,0,0
1,1,What films featured the character Popeye Doyle ?,ENTITY,ENTY,cremat,1,1
2,2,How can I find a list of celebrities ' real na...,DESCRIPTION,DESC,manner,0,0
3,3,What fowl grabs the spotlight after the Chines...,ENTITY,ENTY,animal,1,2
4,4,What is the full form of .com ?,ABBREVIATION,ABBR,exp,2,3


In [53]:
#lets form feature vectors - using TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score    
from sklearn.metrics import confusion_matrix as cm

In [33]:
from sklearn.model_selection import train_test_split

corpus = df['Questions'].apply(lambda x : str(x).lower()).tolist()

y1 = df['cat_label']   # task 1
y2 = df['topic_label'] # task 2

In [43]:
vectorizer = TfidfVectorizer(use_idf=True)

X = vectorizer.fit_transform(corpus)

In [49]:
x_tr, x_te, y_tr, y_te = train_test_split(X, y1, test_size = 0.25, random_state = 42)

In [44]:
task_df = pd.DataFrame(X[0].T.todense(),index=vectorizer.get_feature_names(), columns=["TF-IDF"])

In [52]:
#import classifiers

from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [85]:
def evaluate(clf, x_tr = x_tr, x_te = x_te, y_tr = y_tr, y_test = y_te):
    
    import seaborn as sns
    
    print('Classifier : {}'.format(clf))
    clf.fit(x_tr, y_tr)
    y_pred = clf.predict(x_te)
    
    acc = accuracy_score(y_pred, y_test)
    prc = precision_score(y_pred, y_test, average = None)
    rec = recall_score(y_pred, y_test, average = None)
    con = cm(y_test, y_pred)
    
    print('Metrics : \n')
    print('Accuracy : {:.2f}%\nPrecision : {}\nRecall : {}'.format(acc*100,prc*100,rec*100))
    print('Confusion Matrix : \n{}'.format(con))
    
    print('*'*100)
    print('\n\n')
    
    return clf

In [86]:
svm = SVC(decision_function_shape='ovo') #one vs one
lin_svm = LinearSVC()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(n_estimators = 150)

# Task 1 - Predict Category

In [87]:
evaluate(svm)
evaluate(lin_svm)
evaluate(dt)
evaluate(rf)


Classifier : SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
Metrics : 

Accuracy : 80.85%
Precision : [77.36486486 90.4109589  59.25925926 75.         86.72566372 77.61904762]
Recall : [ 84.81481481  59.86394558 100.          92.85714286  94.68599034
  92.09039548]
Confusion Matrix : 
[[229  48   0   3   8   8]
 [ 10 264   0  11   2   5]
 [  9   2  16   0   0   0]
 [  7  70   0 234   1   0]
 [ 11  17   0   1 196   1]
 [  4  40   0   3   0 163]]
****************************************************************************************************



Classifier : LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
M

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [89]:
#linear SVC works better. So lets save this model for processing

svm_model = evaluate(lin_svm)

Classifier : LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
Metrics : 

Accuracy : 83.49%
Precision : [80.40540541 78.76712329 66.66666667 83.01282051 91.15044248 89.04761905]
Recall : [ 85.          72.78481013 100.          85.47854785  89.95633188
  86.17511521]
Confusion Matrix : 
[[238  29   0   9  10  10]
 [ 16 230   0  25   7  14]
 [  6   1  18   2   0   0]
 [  8  39   0 259   2   4]
 [ 10   4   0   4 206   2]
 [  2  13   0   4   4 187]]
****************************************************************************************************





In [92]:
# try tuning parameters

from sklearn.model_selection import GridSearchCV

params = {'C': [0.1, 1, 10, 100, 1000]}

grid = GridSearchCV(LinearSVC(), params, refit=True, verbose=3)

grid.fit(x_tr, y_tr)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.793, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.800, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.767, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.796, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.796, total=   0.0s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.835, total=   0.0s
[CV] C=1 .............................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] ................................. C=1, score=0.826, total=   0.0s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.806, total=   0.0s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.807, total=   0.0s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.825, total=   0.0s
[CV] C=10 ............................................................
[CV] ................................ C=10, score=0.825, total=   0.2s
[CV] C=10 ............................................................
[CV] ................................ C=10, score=0.815, total=   0.2s
[CV] C=10 ............................................................
[CV] ................................ C=10, score=0.801, total=   0.2s
[CV] C=10 ............................................................
[CV] .



[CV] ............................... C=100, score=0.808, total=   0.7s
[CV] C=100 ...........................................................
[CV] ............................... C=100, score=0.796, total=   0.6s
[CV] C=100 ...........................................................
[CV] ............................... C=100, score=0.787, total=   0.6s
[CV] C=100 ...........................................................
[CV] ............................... C=100, score=0.787, total=   0.6s
[CV] C=100 ...........................................................
[CV] ............................... C=100, score=0.799, total=   0.7s
[CV] C=1000 ..........................................................
[CV] .............................. C=1000, score=0.804, total=   0.6s
[CV] C=1000 ..........................................................
[CV] .............................. C=1000, score=0.795, total=   0.6s
[CV] C=1000 ..........................................................
[CV] .

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    7.6s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100, 1000]}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring=None, verbose=3)

In [98]:
y_pred = grid.predict(x_te)
print('Accuracy : {:.2f}%'.format(100*accuracy_score(y_pred, y_te)))
print('Precision : \n{}'.format(100*precision_score(y_pred, y_te, average = None)))


Accuracy : 83.49%
Precision : 
[80.40540541 78.76712329 66.66666667 83.01282051 91.15044248 89.04761905]


In [81]:
def visualize_cm_binary(con, Labels):
    
    import seaborn as sns
    
#     Labels = ['True Negative', 'False Positive', 'False Negative', 'True Positive']

    Labels = np.asarray(Labels).reshape(2,2)
    sns.heatmap(con/np.sum(con), cmap='Blues')

# Task 2 - Predict Topic

In [118]:
x_tr, x_te, y_tr_2, y_te_2 = train_test_split(X, y2, test_size = 0.25, random_state = 42)

In [123]:
# build models for this

topic_svm = SVC(decision_function_shape='ovo')
topic_linsvm = LinearSVC()
topic_dt = DecisionTreeClassifier()
topic_rf = RandomForestClassifier(n_estimators=150)

evaluate(topic_svm, y_tr = y_tr_2, y_test = y_te_2) #labels are different now, same feature set though
evaluate(topic_linsvm, y_tr = y_tr_2, y_test = y_te_2) 
evaluate(topic_dt, y_tr = y_tr_2, y_test = y_te_2)
evaluate(topic_rf, y_tr = y_tr_2, y_test = y_te_2)


Classifier : SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Metrics : 

Accuracy : 66.54%
Precision : [ 93.58974359  13.46153846  26.08695652  63.63636364  95.
  25.92592593  25.          77.31958763  86.95652174  61.70212766
  16.66666667  61.11111111  32.95454545  95.12195122  86.44067797
   0.           0.          21.42857143  86.84210526  50.
  18.18181818  70.73170732   0.          61.53846154  42.85714286
  57.14285714  11.11111111  68.75         0.          16.66666667
   0.           0.           0.           0.          40.
  50.          14.28571429   0.           0.           0.
  33.33333333   0.           0.         100.           0.
  33.33333333]
Recall : [ 83.90804598 100.          85.71428571  93.33333333  59.84251969
  93.33333333 100.          70.09345794  96.77419355 100.
  66.66666667  91.66666667  90.625       98.73417722  39.84375
   0.           0.          85.71428571  89.18918919  83.33333333
  80.          93.5483871    0.          94.11764706 100.
 100.          50.          78.57142857   0.         100.
   0.      

  _warn_prf(average, modifier, msg_start, len(result))


Metrics : 

Accuracy : 66.91%
Precision : [ 94.87179487  46.15384615  47.82608696  59.09090909  80.83333333
  29.62962963  50.          74.22680412  84.05797101  72.34042553
  25.          83.33333333  54.54545455  95.12195122  62.14689266
   0.           0.          25.          92.10526316  90.
  27.27272727  75.6097561    0.          61.53846154  28.57142857
  71.42857143  22.22222222 100.          25.          41.66666667
   0.           0.          60.          66.66666667   0.
 100.          28.57142857  33.33333333  25.           0.
  77.77777778   0.           0.         100.           0.
   0.        ]
Recall : [ 94.87179487  54.54545455  47.82608696  76.47058824  67.59581882
  48.48484848  50.          72.          87.87878788  64.1509434
  23.07692308  75.          60.          90.69767442  54.72636816
   0.           0.          53.84615385  83.33333333 100.
  26.08695652  79.48717949   0.          80.         100.
  45.45454545  28.57142857  88.88888889  33.33333333  50.
 

  _warn_prf(average, modifier, msg_start, len(result))


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [126]:
# LinearSVM has the best accuracy

topic_params = {'C': [0.1, 1, 10, 100, 1000]}

topic_grid = GridSearchCV(LinearSVC(), topic_params, refit = True, verbose = 3)

topic_grid.fit(x_tr , y_tr_2)
y_pred = topic_grid.predict(x_te)

print('Accuracy : {:.2f}%'.format(100*accuracy_score(y_pred, y_te_2)))
print('Precision :\n{}'.format(100*precision_score(y_pred, y_te_2, average = None)))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.692, total=   0.1s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] ............................... C=0.1, score=0.685, total=   0.1s
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.674, total=   0.1s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV] ............................... C=0.1, score=0.678, total=   0.1s
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.695, total=   0.1s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.752, total=   0.2s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.749, total=   0.2s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.763, total=   0.2s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.760, total=   0.2s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.770, total=   0.2s
[CV] C=10 ............................................................
[CV] .



[CV] ............................... C=100, score=0.736, total=   1.2s
[CV] C=100 ...........................................................
[CV] ............................... C=100, score=0.726, total=   1.2s
[CV] C=100 ...........................................................
[CV] ............................... C=100, score=0.731, total=   1.1s
[CV] C=100 ...........................................................
[CV] ............................... C=100, score=0.736, total=   1.2s
[CV] C=100 ...........................................................
[CV] ............................... C=100, score=0.765, total=   1.2s
[CV] C=1000 ..........................................................
[CV] .............................. C=1000, score=0.729, total=   1.1s
[CV] C=1000 ..........................................................
[CV] .............................. C=1000, score=0.726, total=   1.0s
[CV] C=1000 ..........................................................
[CV] .

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:   14.5s finished
  _warn_prf(average, modifier, msg_start, len(result))


# Save Relevant Files for a simple Prediction API

In [102]:
#save the category model

import pickle

filename = 'Question_Classification_LinearSVM_model.pkl'
pickle.dump(grid, open(filename, 'wb'))

In [127]:
#save the topic model

topic_filename = 'Question_Classification_LinearSVM_topic_model.pkl'
pickle.dump(topic_grid, open(topic_filename, 'wb'))

In [103]:
#save vectorizer

vector_filename = 'Question_Classification_vectorizer.pkl'
pickle.dump(vectorizer, open(vector_filename, 'wb'))

In [104]:
# load category model and check

tuned_model = pickle.load(open(filename, 'rb'))
result = tuned_model.score(x_te, y_te)*100

print('Accuracy : {:.2f}%'.format(result)) #accuracy is preserved from the model

Accuracy : 83.49%


In [128]:
# load topic model and check

tuned_topic_model = pickle.load(open(topic_filename, 'rb'))
result = tuned_topic_model.score(x_te, y_te_2)*100

print('Accuracy : {:.2f}%'.format(result)) #accuracy is preserved from the model

Accuracy : 77.70%


In [110]:
# load vectorizer and check

vectorizer = pickle.load(open(vector_filename, 'rb'))
test_sent = vectorizer.transform(['This is nice'])

In [111]:
test_sent.shape #shape is preserved from the corpus.

(1, 8412)

In [114]:
#map category to label

cat_label_dict, topic_label_dict = {},{}

for val in df['Category1'].unique():
    cat_label_dict[val] = df[df['Category1']==val]['cat_label'].unique()[0]
    
for val in df['Category2'].unique():
    topic_label_dict[val] = df[df['Category2']==val]['topic_label'].unique()[0]

In [115]:
cat_label_dict, topic_label_dict

({'ABBR': 2, 'DESC': 0, 'ENTY': 1, 'HUM': 3, 'LOC': 5, 'NUM': 4},
 {'abb': 34,
  'animal': 2,
  'body': 22,
  'city': 21,
  'code': 39,
  'color': 19,
  'count': 13,
  'country': 18,
  'cremat': 1,
  'currency': 46,
  'date': 8,
  'def': 7,
  'desc': 12,
  'dismed': 23,
  'dist': 40,
  'event': 10,
  'exp': 3,
  'food': 17,
  'gr': 5,
  'ind': 4,
  'instru': 33,
  'lang': 37,
  'letter': 15,
  'manner': 0,
  'money': 25,
  'mount': 24,
  'ord': 43,
  'other': 14,
  'perc': 38,
  'period': 27,
  'plant': 30,
  'product': 26,
  'reason': 9,
  'religion': 16,
  'speed': 35,
  'sport': 29,
  'state': 11,
  'substance': 28,
  'symbol': 42,
  'techmeth': 31,
  'temp': 41,
  'termeq': 20,
  'title': 6,
  'veh': 44,
  'volsize': 32,
  'weight': 45,
  'word': 36})

In [117]:
#save class names to disk as well

with open('category_labels.txt', 'w') as f:
    f.write(str(cat_label_dict))

with open('topic_labels.txt', 'w') as f:
    f.write(str(topic_label_dict))