In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Question_Classification_Dataset.csv")

In [39]:
df.shape

(5452, 5)

In [5]:
print(df['Category2'].value_counts())

ind          962
other        733
def          421
count        363
desc         321
manner       276
date         218
cremat       207
reason       191
gr           189
country      155
city         129
animal       112
food         103
dismed       103
termeq        93
period        75
money         71
exp           70
state         66
sport         62
event         56
product       42
substance     41
color         40
techmeth      38
dist          34
perc          27
veh           27
word          26
title         25
mount         21
body          16
abb           16
lang          16
plant         13
volsize       13
weight        11
symbol        11
instru        10
code           9
letter         9
speed          9
temp           8
ord            6
currency       4
religion       4
Name: Category2, dtype: int64


In [7]:
from nltk.tokenize import word_tokenize

In [9]:
df['Questions'] = [word_tokenize(sentence) for sentence in df.Questions]

In [13]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [14]:
def remove_stop_words(tokenized_list):
    text = [word for word in tokenized_list if word not in stop_words]
    return text

In [15]:
df['Questions'] = df['Questions'].apply(lambda x: remove_stop_words(x))

In [21]:
def remove_non_alpha(tokenized_list):
    text = [word for word in tokenized_list if word.isalpha()]
    return text

In [22]:
df['Questions'] = df['Questions'].apply(lambda x: remove_non_alpha(x))

In [23]:
from nltk.stem import WordNetLemmatizer

In [24]:
lemmatizer = WordNetLemmatizer()

In [25]:
from nltk import pos_tag

In [26]:
from nltk.corpus import wordnet

In [27]:
from collections import defaultdict

In [28]:
tag_map = defaultdict(lambda:wordnet.NOUN)

In [29]:
tag_map['J'] = wordnet.ADJ

In [30]:
tag_map['V'] = wordnet.VERB

In [31]:
tag_map['R'] = wordnet.ADV

In [32]:
def lemmatize(tokenized_list):
    new_list = []
    for (word, tag) in pos_tag(tokenized_list):
        text = lemmatizer.lemmatize(word, tag_map[tag[0]])
        new_list.append(text)
    return new_list

In [33]:
df['Questions'] = df['Questions'].apply(lambda x:str(lemmatize(x)))

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
vectorizer = TfidfVectorizer() 

In [37]:
vectorizer.fit(df['Questions'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [38]:
from sklearn.model_selection import train_test_split

In [40]:
feature_train, feature_test,label_train,label_test= train_test_split(df['Questions'], df['Category2'], test_size=0.2, random_state=1)

In [41]:
from sklearn.preprocessing import LabelEncoder

In [42]:
encoder = LabelEncoder()

In [43]:
encoder.fit_transform(label_train)
encoder.fit_transform(label_test)

array([11, 22, 28, ..., 18,  5,  6])

In [44]:
feature_train_vect = vectorizer.transform(feature_train)
feature_test_vect = vectorizer.transform(feature_test)

In [45]:
print(feature_test_vect[0])

  (0, 6829)	0.26381341984430656
  (0, 1467)	0.6441426266646197
  (0, 1172)	0.7179712779935254


In [46]:
from sklearn.naive_bayes import MultinomialNB

In [56]:
model = MultinomialNB(alpha=0.1, fit_prior=False)

In [57]:
model.fit(feature_train_vect, label_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=False)

In [143]:
model.score(feature_test_vect, label_test) 

0.5948670944087993

In [63]:
from sklearn.ensemble import RandomForestClassifier

In [145]:
model2 = RandomForestClassifier(n_estimators=600, random_state=4)

In [146]:
model2.fit(feature_train_vect, label_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=600,
                       n_jobs=None, oob_score=False, random_state=4, verbose=0,
                       warm_start=False)

In [147]:
model2.score(feature_test_vect, label_test) 

0.7424381301558204

In [78]:
y_pred=model2.predict(feature_test_vect)
print(y_pred)

['ind' 'manner' 'ind' ... 'ind' 'count' 'country']


In [87]:
from sklearn.neural_network import MLPClassifier

In [99]:
model3 = MLPClassifier(alpha=0.3)

In [100]:
model3.fit(feature_train_vect, label_train)

MLPClassifier(activation='relu', alpha=0.3, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [102]:
model3.score(feature_test_vect, label_test)

0.7259395050412466

In [94]:
from sklearn.ensemble import GradientBoostingClassifier

In [95]:
model4 = GradientBoostingClassifier()

In [96]:
model4.fit(feature_train_vect, label_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [98]:
model4.score(feature_test_vect, label_test)

0.7131072410632447

In [106]:
from sklearn.tree import DecisionTreeClassifier

In [136]:
model5 = DecisionTreeClassifier()


In [137]:
model5.fit(feature_train_vect, label_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [138]:
model5.score(feature_test_vect, label_test)

0.692025664527956

In [139]:
from sklearn.svm import SVC

In [140]:
model6 = SVC()

In [141]:
model6.fit(feature_train_vect, label_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [142]:
model5.score(feature_test_vect, label_test)

0.692025664527956

In [149]:
y_pred = model2.predict(feature_test_vect)

In [150]:
results = pd.DataFrame({'Actual':label_test, 'Predicted':y_pred})

In [151]:
print(results)

        Actual Predicted
2457      desc       ind
763     manner    manner
2993    period       ind
3909     state     state
724       city      city
2303       ind       ind
645        ind       ind
4670    manner    manner
4487       ind       ind
1125    reason    reason
1753    dismed    dismed
4048      dist      dist
5023     other       ind
4838       ind       ind
792        ind       ind
5344       ind       ind
4820       ind       ind
3900       ind    cremat
200        ind       ind
1528       def       def
958      count     count
179       city      city
5448  currency     other
662      count     count
3162   country   country
850        ind       ind
4390     count     count
2946        gr        gr
1403     other       ind
2363    cremat      city
...        ...       ...
2636    manner    manner
5030     money       ind
4152       ord       ord
3078       ind     other
1865     count     count
2233       ind       ind
3564       def       def
4566      date      date
