In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pickle
import os
from sklearn.metrics import confusion_matrix

In [2]:
qn_df = pd.read_csv('Question_Classification_Dataset.csv')
qn_df = qn_df.iloc[:,1:]
qn_df.head()

Unnamed: 0,Questions,Category0,Category1,Category2
0,How did serfdom develop in and then leave Russ...,DESCRIPTION,DESC,manner
1,What films featured the character Popeye Doyle ?,ENTITY,ENTY,cremat
2,How can I find a list of celebrities ' real na...,DESCRIPTION,DESC,manner
3,What fowl grabs the spotlight after the Chines...,ENTITY,ENTY,animal
4,What is the full form of .com ?,ABBREVIATION,ABBR,exp


In [3]:
qn_df1 = qn_df[['Questions', 'Category0']]
qn_df1.head()

Unnamed: 0,Questions,Category0
0,How did serfdom develop in and then leave Russ...,DESCRIPTION
1,What films featured the character Popeye Doyle ?,ENTITY
2,How can I find a list of celebrities ' real na...,DESCRIPTION
3,What fowl grabs the spotlight after the Chines...,ENTITY
4,What is the full form of .com ?,ABBREVIATION


In [4]:
qn_df1['Category0'].value_counts()

ENTITY          1250
HUMAN           1223
DESCRIPTION     1162
NUMERIC          896
LOCATION         835
ABBREVIATION      86
Name: Category0, dtype: int64

In [28]:
qn_df1.isnull().sum()



Questions           0
Category0           0
Category Vectors    0
dtype: int64

In [5]:
qn_df1['Category Vectors'] = pd.factorize(qn_df1['Category0'])[0]
qn_df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Questions,Category0,Category Vectors
0,How did serfdom develop in and then leave Russ...,DESCRIPTION,0
1,What films featured the character Popeye Doyle ?,ENTITY,1
2,How can I find a list of celebrities ' real na...,DESCRIPTION,0
3,What fowl grabs the spotlight after the Chines...,ENTITY,1
4,What is the full form of .com ?,ABBREVIATION,2


In [6]:
vect = TfidfVectorizer(ngram_range = (1,2)).fit(qn_df1['Questions'])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(qn_df1['Questions'], qn_df1['Category Vectors'], test_size=0.2, random_state=0)


In [9]:
train_vector = vect.transform(X_train)

In [10]:
train_vector

<4361x32705 sparse matrix of type '<class 'numpy.float64'>'
	with 69039 stored elements in Compressed Sparse Row format>

In [11]:
test_vector = vect.transform(X_test)

In [12]:
test_vector

<1091x32705 sparse matrix of type '<class 'numpy.float64'>'
	with 17090 stored elements in Compressed Sparse Row format>

In [13]:
model1 = SVC(kernel='linear', probability = True)

In [14]:
model1.fit(train_vector, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [15]:
pred1 = model1.predict(test_vector)
pred1

array([4, 1, 5, ..., 3, 0, 1], dtype=int64)

In [32]:
confusion_matrix(y_test
                 , pred1)

array([[198,  23,   1,   4,   0,   1],
       [ 11, 221,   0,  15,   0,   5],
       [  8,   1,  14,   1,   0,   0],
       [  5,  29,   0, 199,   1,   1],
       [  7,  14,   0,   0, 157,   0],
       [  7,  24,   0,   5,   0, 139]], dtype=int64)

In [20]:
print (confusion_matrix(y_test, pred1))

[[198  23   1   4   0   1]
 [ 11 221   0  15   0   5]
 [  8   1  14   1   0   0]
 [  5  29   0 199   1   1]
 [  7  14   0   0 157   0]
 [  7  24   0   5   0 139]]


In [21]:
from sklearn.metrics import classification_report

In [22]:
print (classification_report(y_test, pred1)) 

              precision    recall  f1-score   support

           0       0.84      0.87      0.86       227
           1       0.71      0.88      0.78       252
           2       0.93      0.58      0.72        24
           3       0.89      0.85      0.87       235
           4       0.99      0.88      0.93       178
           5       0.95      0.79      0.87       175

    accuracy                           0.85      1091
   macro avg       0.89      0.81      0.84      1091
weighted avg       0.86      0.85      0.85      1091

