In [8]:
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk import pos_tag
from nltk.tag.crf import CRFTagger
from nltk.corpus import brown
import spacy
from sklearn.feature_extraction.text import CountVectorizer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
%pip install python-crfsuite

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('brown')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Data Loading

In [10]:
col_names = ['questions', 'a', 'b']
data_df = pd.read_csv("https://raw.githubusercontent.com/VIthulan/travel-text-classification/master/data/5000TravelQuestionsDataset.csv", error_bad_lines=False,header=None, names=col_names, encoding='latin-1')


In [11]:
data_df['questions']

0       What are the special things we (husband and me...
1       What are the companies which organize shark fe...
2       Is it safe for female traveller to go alone to...
3       What are the best places around Cape Town for ...
4       What are the best places to stay for a family ...
                              ...                        
4995    What is the best area to be based for sightsee...
4996    What are the good value traditional bars and r...
4997       What are the hotels near Alicante bus station?
4998       Where to stay in La Gomera to mountain biking?
4999    Is it possible to take a train trip from Santi...
Name: questions, Length: 5000, dtype: object

# Pre Processing

## Stop word removal

In [12]:
stop_words = set(stopwords.words('english')) 

def remove_stopwords(text):
  word_tokens = nltk.word_tokenize(text) 
  filtered_sentence = [w for w in word_tokens if not w in stop_words] 
  return " ".join(filtered_sentence)

In [13]:
    # Remove all the special characters
data_df['processed_questions'] = data_df['questions'].str.replace(r'\W', ' ')
    # remove all single characters
data_df['processed_questions'] = data_df['processed_questions'].str.replace(r'\s+[a-zA-Z]\s+', ' ')
    # Remove single characters from the start
data_df['processed_questions'] = data_df['questions'].str.replace(r'\^[a-zA-Z]\s+', ' ')
    # Substituting multiple spaces with single space
data_df['processed_questions'] = data_df['questions'].str.replace(r'\s+', ' ')
    # Removing prefixed 'b'
data_df['processed_questions'] = data_df['questions'].str.replace(r'^b\s+', '')
    # Remove leading, trailing spaces
data_df['processed_questions'] = data_df['questions'].str.strip()
# Stop word removal
data_df['processed_questions'] = data_df.processed_questions.apply(remove_stopwords)


In [14]:
data_df['processed_questions']

0       What special things ( husband ) 5 day stay Cap...
1       What companies organize shark feeding events s...
2           Is safe female traveller go alone Cape Town ?
3              What best places around Cape Town safari ?
4       What best places stay family stay away nightli...
                              ...                        
4995             What best area based sightseeing Palma ?
4996    What good value traditional bars restaurants B...
4997              What hotels near Alicante bus station ?
4998               Where stay La Gomera mountain biking ?
4999        Is possible take train trip Santiago Madrid ?
Name: processed_questions, Length: 5000, dtype: object

## Lemmatizing

In [15]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
  lem = [lemmatizer.lemmatize(w, pos="v") for w in nltk.word_tokenize(text)]
  return " ".join(lem)

In [16]:
data_df["question_lemmatized"] = data_df.processed_questions.apply(lemmatize_text)

In [17]:
data_df["question_lemmatized"]

0       What special things ( husband ) 5 day stay Cap...
1       What company organize shark feed events scuba ...
2           Is safe female traveller go alone Cape Town ?
3               What best place around Cape Town safari ?
4       What best place stay family stay away nightlife ?
                              ...                        
4995                 What best area base sightsee Palma ?
4996    What good value traditional bar restaurants Ba...
4997              What hotels near Alicante bus station ?
4998                 Where stay La Gomera mountain bike ?
4999        Is possible take train trip Santiago Madrid ?
Name: question_lemmatized, Length: 5000, dtype: object

## POS Tag

In [18]:
def pos_tagger(text):
    pos_tagged = [ r[1] for r in pos_tag(nltk.word_tokenize(text))] 
    return ' '.join(pos_tagged)

In [19]:
data_df["question_pos_t"] = data_df.processed_questions.apply(pos_tagger)

In [20]:
# brown_tags = brown.tagged_sents(categories='hobbies')
#  crf = CRFTagger()
# crf.train(brown_tags,'model.crf.tagger')

In [21]:
data_df["question_pos_t"]

0       WP JJ NNS ( NN ) CD NN VB NNP NNP .
1            WP NNS VBP JJ VBG NNS JJ NNS .
2             VBZ JJ NN NN VBP RB NNP NNP .
3                WP JJS NNS IN NNP NNP NN .
4              WP JJS NNS VBP NN VB RB RB .
                       ...                 
4995                WP JJS NN VBN VBG NNP .
4996              WP JJ NN JJ NNS NNS NNP .
4997                  WP VBD IN NNP NN NN .
4998                 WRB NN NNP NNP NN NN .
4999              VBZ JJ NN NN NN NNP NNP .
Name: question_pos_t, Length: 5000, dtype: object

## Stop word removal

In [22]:
data_df["question_stop_w_removed"] = data_df.processed_questions.apply(remove_stopwords)

In [23]:
data_df["question_stop_w_removed"]

0       What special things ( husband ) 5 day stay Cap...
1       What companies organize shark feeding events s...
2           Is safe female traveller go alone Cape Town ?
3              What best places around Cape Town safari ?
4       What best places stay family stay away nightli...
                              ...                        
4995             What best area based sightseeing Palma ?
4996    What good value traditional bars restaurants B...
4997              What hotels near Alicante bus station ?
4998               Where stay La Gomera mountain biking ?
4999        Is possible take train trip Santiago Madrid ?
Name: question_stop_w_removed, Length: 5000, dtype: object

## Headword extraction

In [24]:
nlp = spacy.load("en_core_web_sm")
def head_word_tokenizer(text):
    head_words = []
    for token in nlp(text):
        if token.dep_ == "nsubj" or token.dep_ == "nsubjpass":
            head_words.append(token.text)
            head_words.append(token.head.text)
    unique_hw = list(set(head_words))
    return unique_hw

In [25]:
# head_words_vectorizer = CountVectorizer(tokenizer = head_word_tokenizer,max_features=100,stop_words=stopwords.words('english'))
# head_words_vector = head_words_vectorizer.fit_transform(data_df.question_lemmatized.values).toarray()

In [26]:
data_df["question_headwords"] = data_df.question_stop_w_removed.apply(head_word_tokenizer)

In [27]:
data_df["question_headwords"]

0               [stay, things]
1        [companies, organize]
2              [go, traveller]
3                           []
4       [family, stay, places]
                 ...          
4995                        []
4996       [restaurants, bars]
4997                        []
4998            [biking, stay]
4999                        []
Name: question_headwords, Length: 5000, dtype: object

In [28]:
data_df["question_headwords"]

0               [stay, things]
1        [companies, organize]
2              [go, traveller]
3                           []
4       [family, stay, places]
                 ...          
4995                        []
4996       [restaurants, bars]
4997                        []
4998            [biking, stay]
4999                        []
Name: question_headwords, Length: 5000, dtype: object

## Headword Synonyms

In [29]:
from nltk.corpus import wordnet
max_syns = 2
def wordnet_synonyms(keywords):
  synonyms = []
  for keyword in keywords:
    count = 0
    for synset in wordnet.synsets(keyword):
      if count <= max_syns:
        for lemma in synset.lemmas():
          if count <= max_syns:
            synonyms.append(lemma.name())
          else: 
            break
        count = count + 1
      else:
        break

  return synonyms

In [30]:
wordnet_synonyms(["mother", "father"])

['mother',
 'female_parent',
 'mother',
 'mother',
 'father',
 'male_parent',
 'begetter',
 'forefather',
 'father',
 'sire',
 'Father',
 'Padre']

In [31]:
data_df["question_hw_syn"] = data_df.question_headwords.apply(wordnet_synonyms)

In [32]:
data_df["question_hw_syn"]

0       [stay, arrest, check, halt, hitch, stay, stop,...
1       [company, company, company, companionship, fel...
2       [go, spell, tour, turn, Adam, ecstasy, XTC, go...
3                                                      []
4       [family, household, house, home, menage, famil...
                              ...                        
4995                                                   []
4996    [restaurant, eating_house, eating_place, eater...
4997                                                   []
4998    [bicycle, cycle, bike, pedal, wheel, stay, arr...
4999                                                   []
Name: question_hw_syn, Length: 5000, dtype: object

## Bag of Words
This will be added to the training model directly using countVector

# Vectorizer Methods

## TF IDF Vectorizer

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_vectorize(text):
  tfidfconverter = TfidfVectorizer(max_features=1500, min_df=1, max_df=0.7, stop_words=stopwords.words('english'))
  X = tfidfconverter.fit_transform(text).toarray()
  return X


## Count Vectorizer

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
def count_vectorize(text):
  vectorizer = CountVectorizer(max_features=1500, min_df=1, max_df=0.7, stop_words=stopwords.words('english'))
  X = vectorizer.fit_transform(text).toarray()
  return X


## MultiLabel Binarizer

In [35]:
from sklearn.preprocessing import MultiLabelBinarizer

def multilabel_bin(text):
  mlb = MultiLabelBinarizer()
  X = mlb.fit_transform(text)
  return X

## Label Enconder

In [36]:
from sklearn.preprocessing import LabelEncoder
def label_encoder(text):
  le = LabelEncoder()
  y_encoded = le.fit_transform(text)

## Accuracy Calculator

In [62]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


def print_report(y_test, y_pred):
    # Classification Report
    print(classification_report(y_test,y_pred))

    acc = accuracy_score(y_test, y_pred)*100
    print('Accuracy score: %.3f' % acc)

    print('F1 Score: %.3f' % f1_score(y_test, y_pred, average='weighted'))
    
    cm = confusion_matrix(y_test, y_pred)
    print("Confustion matrix: \n{}".format(cm))

# Feature Encoding

In [37]:
tfidf_vectorize(data_df['question_lemmatized']).shape

(5000, 1500)

In [38]:
count_vectorize(data_df["question_pos_t"]).shape

(5000, 25)

In [39]:
multilabel_bin(data_df["question_headwords"]).shape

(5000, 1664)

In [40]:
multilabel_bin(data_df["question_hw_syn"]).shape

(5000, 4217)

In [41]:
# data_df["lem_vec"] = data_df.question_lemmatized.apply(tfidf_vectorize)
# data_df["pos_vec"] = data_df.question_pos_t.apply(count_vectorize)
# data_df["head_vec"] = data_df.question_headwords.apply(multilabel_bin)
# data_df["head_vec_syn"] = data_df.question_hw_syn.apply(multilabel_bin)

In [42]:
data_df.head()

Unnamed: 0,questions,a,b,processed_questions,question_lemmatized,question_pos_t,question_stop_w_removed,question_headwords,question_hw_syn
0,What are the special things we (husband and me...,TTD,TTDSIG,What special things ( husband ) 5 day stay Cap...,What special things ( husband ) 5 day stay Cap...,WP JJ NNS ( NN ) CD NN VB NNP NNP .,What special things ( husband ) 5 day stay Cap...,"[stay, things]","[stay, arrest, check, halt, hitch, stay, stop,..."
1,What are the companies which organize shark fe...,TTD,TTDOTH,What companies organize shark feeding events s...,What company organize shark feed events scuba ...,WP NNS VBP JJ VBG NNS JJ NNS .,What companies organize shark feeding events s...,"[companies, organize]","[company, company, company, companionship, fel..."
2,Is it safe for female traveller to go alone to...,TGU,TGUHEA,Is safe female traveller go alone Cape Town ?,Is safe female traveller go alone Cape Town ?,VBZ JJ NN NN VBP RB NNP NNP .,Is safe female traveller go alone Cape Town ?,"[go, traveller]","[go, spell, tour, turn, Adam, ecstasy, XTC, go..."
3,What are the best places around Cape Town for ...,TTD,TTDSIG,What best places around Cape Town safari ?,What best place around Cape Town safari ?,WP JJS NNS IN NNP NNP NN .,What best places around Cape Town safari ?,[],[]
4,What are the best places to stay for a family ...,ACM,ACMOTH,What best places stay family stay away nightli...,What best place stay family stay away nightlife ?,WP JJS NNS VBP NN VB RB RB .,What best places stay family stay away nightli...,"[family, stay, places]","[family, household, house, home, menage, famil..."


In [49]:
le = LabelEncoder()

x_encoded = np.concatenate((tfidf_vectorize(data_df['question_lemmatized']), count_vectorize(data_df["question_pos_t"]), multilabel_bin(data_df["question_headwords"]), multilabel_bin(data_df["question_hw_syn"])), axis=1)
y_encoded = le.fit_transform(data_df['a'])

In [50]:
x_encoded.shape

(5000, 7406)

In [51]:
y_encoded.shape

(5000,)

In [46]:
from scipy.sparse import csr_matrix
# create dense matrix
# convert to sparse matrix (CSR method)
S = csr_matrix(x_encoded)
print(S.shape)
print(S)
# reconstruct dense matrix
B = S.todense()
print(B.shape)
print(B)

(5000, 7406)
  (0, 221)	0.49658101113101494
  (0, 356)	0.33531294280382
  (0, 1255)	0.4808824383659787
  (0, 1273)	0.29338267835993315
  (0, 1338)	0.4016610704129838
  (0, 1366)	0.4028890651241106
  (0, 1501)	1.0
  (0, 1504)	1.0
  (0, 1509)	1.0
  (0, 1516)	1.0
  (0, 1523)	1.0
  (0, 2991)	1.0
  (0, 3052)	1.0
  (0, 3908)	1.0
  (0, 4338)	1.0
  (0, 5285)	1.0
  (0, 5356)	1.0
  (0, 6837)	1.0
  (0, 6858)	1.0
  (0, 6860)	1.0
  (0, 7025)	1.0
  (0, 7026)	1.0
  (1, 313)	0.4311613009762721
  (1, 454)	0.6455332180327034
  (1, 1178)	0.6303862284003282
  :	:
  (4998, 888)	0.6180078928060596
  (4998, 1273)	0.3223335822690538
  (4998, 1524)	1.0
  (4998, 2130)	1.0
  (4998, 2991)	1.0
  (4998, 3908)	1.0
  (4998, 4074)	1.0
  (4998, 4077)	1.0
  (4998, 4338)	1.0
  (4998, 4607)	1.0
  (4998, 5285)	1.0
  (4998, 5356)	1.0
  (4998, 6105)	1.0
  (4998, 6837)	1.0
  (4998, 6858)	1.0
  (4998, 6860)	1.0
  (4998, 7342)	1.0
  (4999, 802)	0.5047014417228678
  (4999, 1034)	0.3841878014180018
  (4999, 1165)	0.47182301907921

# Hyper paramter tuning

In [52]:
from sklearn.model_selection import GridSearchCV 
from sklearn import svm

param_grid = {'C': [1,2,3,4,5,6,7,8,9,10],  
              'gamma': [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45], 
              'kernel': ['rbf', 'linear'],
              'class_weight': ["balanced"]}


In [53]:
# grid = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose = 3, cv=3) 

# grid.fit(tfidf_vectorize(data_df['question_lemmatized'])[:1000], y_encoded[:1000]) 

Fitting 3 folds for each of 160 candidates, totalling 480 fits
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, score=0.431, total=   2.6s
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.6s remaining:    0.0s


[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, score=0.294, total=   2.7s
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.4s remaining:    0.0s


[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, score=0.357, total=   2.7s
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=linear ............
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=linear, score=0.686, total=   2.3s
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=linear ............
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=linear, score=0.652, total=   2.3s
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=linear ............
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=linear, score=0.703, total=   2.3s
[CV] C=1, class_weight=balanced, gamma=0.15, kernel=rbf ..............
[CV]  C=1, class_weight=balanced, gamma=0.15, kernel=rbf, score=0.527, total=   2.6s
[CV] C=1, class_weight=balanced, gamma=0.15, kernel=rbf ..............
[CV]  C=1, class_weight=balanced, gamma=0.15, kernel=rbf, score=0.459, total=   2.7s
[CV] C=1, class_weight=balanced, gamma=0.15, kernel=rbf ..............
[CV]  C=1, class_weight=balanced, gamma=0.15, kernel=rbf, s

[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed: 19.3min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'class_weight': ['balanced'],
                         'gamma': [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45],
                         'kernel': ['rbf', 'linear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [54]:
grid.best_estimator_

SVC(C=5, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.25, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

# Model training

**Question 1:** Following model uses following as features


1.   Lemmotized bag of words
2.   POS tags
3.   Headwords
4.   Headword synoynms
5.   Named entity



In [44]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for train_index, test_index in cv.split(x_encoded):
    fold += 1
    X_train, X_test = x_encoded[train_index], x_encoded[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    print("Beginning fold: ", fold)
    SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(X_train,y_train)
    predictions_SVM1 = SVM.predict(X_test)
    acc = accuracy_score(predictions_SVM1, y_test)*100
    accuracies.append(acc)
    print("K-Fold: {} - {} - {:.2f}".format(fold, "Accuracy: ",acc))
    
print("Mean {:.2f} Std {:.2f}".format(np.mean(accuracies), np.std(accuracies)))

Beginning fold:  1
K-Fold: 1 - Accuracy:  - 79.00
Beginning fold:  2
K-Fold: 2 - Accuracy:  - 78.60
Beginning fold:  3
K-Fold: 3 - Accuracy:  - 75.60
Beginning fold:  4
K-Fold: 4 - Accuracy:  - 79.60
Beginning fold:  5
K-Fold: 5 - Accuracy:  - 77.60
Beginning fold:  6
K-Fold: 6 - Accuracy:  - 78.80
Beginning fold:  7
K-Fold: 7 - Accuracy:  - 77.20
Beginning fold:  8
K-Fold: 8 - Accuracy:  - 77.20
Beginning fold:  9
K-Fold: 9 - Accuracy:  - 77.40
Beginning fold:  10
K-Fold: 10 - Accuracy:  - 77.00
Mean 77.80 Std 1.13


## Lemmatized word only as feature

In [52]:
x_bow_feature = tfidf_vectorize(data_df['question_lemmatized'])

In [53]:
x_bow_feature.shape

(5000, 1500)

In [54]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for train_index, test_index in cv.split(x_bow_feature):
    fold += 1
    X_train, X_test = x_bow_feature[train_index], x_bow_feature[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    print("Beginning fold: ", fold)
    SVM = SVC(C=5, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.25, kernel='rbf')
    SVM.fit(X_train,y_train)
    y_predictions = SVM.predict(X_test)
    acc = accuracy_score(y_predictions, y_test)*100
    accuracies.append(acc)
    
    print("K-Fold: {} - {} - {:.2f}".format(fold, "Accuracy: ",acc))
    
print("Mean {:.2f} Std {:.2f}".format(np.mean(accuracies), np.std(accuracies)))

Beginning fold:  1
K-Fold: 1 - Accuracy:  - 81.80
Beginning fold:  2
K-Fold: 2 - Accuracy:  - 82.80
Beginning fold:  3
K-Fold: 3 - Accuracy:  - 79.00
Beginning fold:  4
K-Fold: 4 - Accuracy:  - 81.40
Beginning fold:  5
K-Fold: 5 - Accuracy:  - 80.80
Beginning fold:  6
K-Fold: 6 - Accuracy:  - 80.40
Beginning fold:  7
K-Fold: 7 - Accuracy:  - 82.80
Beginning fold:  8
K-Fold: 8 - Accuracy:  - 82.20
Beginning fold:  9
K-Fold: 9 - Accuracy:  - 81.80
Beginning fold:  10
K-Fold: 10 - Accuracy:  - 82.00
Mean 81.50 Std 1.11


In [63]:
print_report(y_test, y_predictions)

              precision    recall  f1-score   support

           1       0.86      0.82      0.84        76
           2       0.76      0.70      0.73        23
           3       0.93      0.83      0.88        60
           4       0.85      0.72      0.78       112
           6       0.83      0.93      0.87        98
           7       0.71      0.82      0.76       114
           9       1.00      0.94      0.97        17

    accuracy                           0.82       500
   macro avg       0.85      0.82      0.83       500
weighted avg       0.83      0.82      0.82       500

Accuracy score: 82.000
F1 Score: 0.820
Confustion matrix: 
[[62  0  1  2  1 10  0]
 [ 0 16  0  0  3  4  0]
 [ 3  0 50  4  1  2  0]
 [ 2  1  3 81  7 18  0]
 [ 1  0  0  2 91  4  0]
 [ 4  4  0  6  6 94  0]
 [ 0  0  0  0  1  0 16]]


## BoW + Headword Syn

In [48]:
x_encoded_bow_hw_syn = np.concatenate((tfidf_vectorize(data_df['question_lemmatized']), multilabel_bin(data_df["question_hw_syn"])), axis=1)
x_encoded_bow_hw_syn.shape

(5000, 5717)

In [49]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for train_index, test_index in cv.split(x_encoded_bow_hw_syn):
    fold += 1
    X_train, X_test = x_encoded_bow_hw_syn[train_index], x_encoded_bow_hw_syn[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    print(X_train.shape)

    SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(X_train,y_train)
    predictions_SVM1 = SVM.predict(X_test)
    acc = accuracy_score(predictions_SVM1, y_test)*100
    accuracies.append(acc)
    
    print("K-Fold: {} - {} - {:.2f}".format(fold, "Accuracy: ",acc))
    
print("Mean {:.2f} Std {:.2f}".format(np.mean(accuracies), np.std(accuracies)))

(4500, 5717)
K-Fold: 1 - Accuracy:  - 78.20
(4500, 5717)
K-Fold: 2 - Accuracy:  - 79.80
(4500, 5717)
K-Fold: 3 - Accuracy:  - 75.60
(4500, 5717)
K-Fold: 4 - Accuracy:  - 77.40
(4500, 5717)
K-Fold: 5 - Accuracy:  - 77.20
(4500, 5717)
K-Fold: 6 - Accuracy:  - 79.20
(4500, 5717)
K-Fold: 7 - Accuracy:  - 76.20
(4500, 5717)
K-Fold: 8 - Accuracy:  - 77.20
(4500, 5717)
K-Fold: 9 - Accuracy:  - 76.00
(4500, 5717)
K-Fold: 10 - Accuracy:  - 76.60
Mean 77.34 Std 1.30


# Summary



Lemmotized bag of words, POS tags, Headwords, Headword synoynms, Named entity provided the f1 score of 77.8% but I was able to get the maximum F1 score of 82% using Lemmotized bag of words as the only feature. Following are the metrics from that model:
```
                precision    recall  f1-score   support

           1       0.86      0.82      0.84        76
           2       0.76      0.70      0.73        23
           3       0.93      0.83      0.88        60
           4       0.85      0.72      0.78       112
           6       0.83      0.93      0.87        98
           7       0.71      0.82      0.76       114
           9       1.00      0.94      0.97        17

    accuracy                           0.82       500
   macro avg       0.85      0.82      0.83       500
weighted avg       0.83      0.82      0.82       500
```

*   Accuracy score: 82.0
*   F1 Score: 82.0

Confustion matrix
```
[[62  0  1  2  1 10  0]
 [ 0 16  0  0  3  4  0]
 [ 3  0 50  4  1  2  0]
 [ 2  1  3 81  7 18  0]
 [ 1  0  0  2 91  4  0]
 [ 4  4  0  6  6 94  0]
 [ 0  0  0  0  1  0 16]]
 ```
