## Text Classification using SVM - Linear

### Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

### Setting a random seed so that everyone can reproduce the same

In [2]:
np.random.seed(100)

### Read the corpus. This dataset has been picked from Amazon review dataset

In [3]:
df = pd.read_csv("corpus.csv.txt", encoding='latin-1')

In [4]:
df.head()

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tr...,__label__2
1,The best soundtrack ever to anything.: I'm re...,__label__2
2,Amazing!: This soundtrack is my favorite musi...,__label__2
3,Excellent Soundtrack: I truly like this sound...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After H...",__label__2


### Data-preprocessing
- Remove redundant values
- Convert the data into something which is consistent
- Tokenization
- Performing lemmatization so that we know the context
- Encode Labels so that we are hitting the correct values

In [5]:
# remove all empty values
df['text'].dropna(inplace=True)

In [6]:
# update values to all lowercase
df['text'] = df['text'].map(lambda x: x.lower())

In [7]:
df.head()

Unnamed: 0,text,label
0,stuning even for the non-gamer: this sound tr...,__label__2
1,the best soundtrack ever to anything.: i'm re...,__label__2
2,amazing!: this soundtrack is my favorite musi...,__label__2
3,excellent soundtrack: i truly like this sound...,__label__2
4,"remember, pull your jaw off the floor after h...",__label__2


In [8]:
# Perform tokenization to break the sentence into a set of words for better understanding
df['text'] = df['text'].map(word_tokenize)

In [9]:
df.head()

Unnamed: 0,text,label
0,"[stuning, even, for, the, non-gamer, :, this, ...",__label__2
1,"[the, best, soundtrack, ever, to, anything, .,...",__label__2
2,"[amazing, !, :, this, soundtrack, is, my, favo...",__label__2
3,"[excellent, soundtrack, :, i, truly, like, thi...",__label__2
4,"[remember, ,, pull, your, jaw, off, the, floor...",__label__2


In [10]:
# Remove stop words and perform lemmatization and stemming

In [11]:
# WordNetLemmatizer requires Pos tags to understand the word. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

tag_map

defaultdict(<function __main__.<lambda>()>, {'J': 'a', 'V': 'v', 'R': 'r'})

In [12]:
# Convert stopwords to a set, for faster computation
set_of_stopwords = set(stopwords.words('english'))

In [34]:
# Need to call for-loop as the word_lemmatized needs to be reinitialized for each element
for index, val in enumerate(df['text']):
    # Initializing WordNetLemmatizer()
    word_lemmatized = WordNetLemmatizer()
    # call pos_tag on list of words and check if it exists in the stopwords; if not then lemmatize it.
    # No need to stem it as we need the context here.
    df.loc[index, 'final_text'] = str([word_lemmatized.lemmatize(word, tag_map[tag[0]])\
                                       for word, tag in pos_tag(val)\
                                       if word not in set_of_stopwords and word.isalpha()])

In [35]:
# Encode Labels
Encoder = LabelEncoder()
df['label'] = Encoder.fit_transform(df['label'])

In [36]:
df.head()

Unnamed: 0,text,label,final_text
0,"[stuning, even, for, the, non-gamer, :, this, ...",1,"['stun', 'even', 'sound', 'track', 'beautiful'..."
1,"[the, best, soundtrack, ever, to, anything, .,...",1,"['best', 'soundtrack', 'ever', 'anything', 're..."
2,"[amazing, !, :, this, soundtrack, is, my, favo...",1,"['amaze', 'soundtrack', 'favorite', 'music', '..."
3,"[excellent, soundtrack, :, i, truly, like, thi...",1,"['excellent', 'soundtrack', 'truly', 'like', '..."
4,"[remember, ,, pull, your, jaw, off, the, floor...",1,"['remember', 'pull', 'jaw', 'floor', 'hear', '..."


In [37]:
df.label.value_counts()

0    5097
1    4903
Name: label, dtype: int64

### Splitting the data (Preparing for SVM)

In [39]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(df['final_text'], df['label'], test_size=0.3)

In [40]:
X_test

6121    ['flawless', 'work', 'like', 'charm', 'england...
2201    ['great', 'place', 'start', 'anyone', 'want', ...
9737    ['good', 'dvd', 'subtitle', 'brazilian', 'trai...
2983    ['horrible', 'film', 'horrible', 'film', 'avoi...
1063    ['great', 'swing', 'reviewer', 'put', 'robilla...
                              ...                        
3438    ['man', 'perspective', 'soon', 'begin', 'proce...
2195    ['best', 'book', 'well', 'first', 'think', 'ba...
6313    ['worthy', 'several', 'academy', 'nomination',...
4677    ['king', 'rat', 'book', 'ok', 'dissapointed', ...
9072    ['great', 'voice', 'matt', 'monro', 'great', '...
Name: final_text, Length: 3000, dtype: object

### Perform vectorization TF-IDF

In [41]:
# Word vectorization : Going to use TF-IDF (term frequencyâ€“inverse document frequency)
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['final_text'])
Train_X_Tfidf = Tfidf_vect.transform(X_train)
Test_X_Tfidf = Tfidf_vect.transform(X_test)

In [46]:
Train_X_Tfidf.shape, Train_X_Tfidf[0], y_train, 

((7000, 5000), <1x5000 sparse matrix of type '<class 'numpy.float64'>'
 	with 9 stored elements in Compressed Sparse Row format>, 5487    1
 884     0
 7646    0
 6687    0
 9894    1
        ..
 6035    0
 8549    1
 885     0
 7551    1
 4718    1
 Name: label, Length: 7000, dtype: int64)

### Perform predictions through SVM

In [55]:
# Training the model - Using SVM
SVM = svm.SVC(C=0.4, kernel='linear', degree=4, gamma='auto')
SVM.fit(Train_X_Tfidf, y_train)

SVC(C=0.4, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=4, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [56]:
# Perform predictions
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ", accuracy_score(predictions_SVM, y_test)*100)

SVM Accuracy Score ->  85.16666666666667


### things not to do

In [60]:
# Testing with rbf (an incorrect place to use this)
SVM = svm.SVC(C=0.4, kernel='rbf', degree=1, gamma='auto')
SVM.fit(Train_X_Tfidf, y_train)
predictions_SVM = SVM.predict(Test_X_Tfidf)
print("SVM Accuracy Score -> ", accuracy_score(predictions_SVM, y_test)*100)

SVM Accuracy Score ->  49.666666666666664
