Support Vector Classifier

In [1]:
import numpy as np
import pandas as pd
import json
import re
import nltk
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vidhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the training and test splits (paths are relative to the working directory)
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("W22_P2_train.csv", "W22_P2_test.csv")
)

In [3]:
#function for preprocessing the list of description
# Cleaning patterns are compiled once at definition time instead of on every call.
_WHITESPACE_RE = re.compile(r'\s+')
_BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
_PUNCTUATION_RE = re.compile("[" + re.escape(string.punctuation) + "]")
_REMOVE_DIGITS = str.maketrans('', '', string.digits)
# Lazily filled with the stop-word set and the stemmer so they are built only once.
_NLP_RESOURCES = {}


def _nlp_resources():
    """Return the cached (stop_words, stemmer) pair, creating it on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['stemmer'] = PorterStemmer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['stemmer']


def preprocess(txt):
    """Normalise one description into a space-separated string of stemmed tokens.

    Steps, in order: lower-case, turn every whitespace run into a single
    space, drop characters outside ``[0-9a-z #+_]``, drop digits, drop ASCII
    punctuation, tokenize with ``word_tokenize``, discard English stop words
    and Porter-stem what remains.

    Parameters
    ----------
    txt : str
        Raw description text. Any non-string value (e.g. ``None`` or the
        ``NaN`` pandas uses for a missing cell) is treated as empty text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces; ``""`` when
        nothing survives.

    Notes
    -----
    Relies on the NLTK stop-word corpus and on whatever tokenizer data
    ``word_tokenize`` needs being installed locally.
    """
    if not isinstance(txt, str):
        return ""
    stop_words, ps = _nlp_resources()
    # Newlines/tabs are not in the allowed character class below; convert them
    # to spaces first so the words on either side are not fused together.
    txt = _WHITESPACE_RE.sub(' ', txt.lower())
    txt = _BAD_SYMBOLS_RE.sub('', txt)
    txt = txt.translate(_REMOVE_DIGITS)
    txt = _PUNCTUATION_RE.sub("", txt)
    return " ".join(ps.stem(w) for w in word_tokenize(txt) if w not in stop_words)

In [4]:
#Create a list of preprocessed descriptions from train data
# Iterate over the column's values directly: this avoids a label lookup per row
# and does not depend on train_data having a default 0..n-1 index.
preprocessed_description = [preprocess(text) for text in train_data["description"]]

In [5]:
#Create a list of preprocessed descriptions from test data
# Iterate over the column's values directly: this avoids a label lookup per row
# and does not depend on test_data having a default 0..n-1 index.
preprocessed_test_description = [preprocess(text) for text in test_data["description"]]

In [6]:
#Vectorize train data
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the cleaned training descriptions and encode them;
# `tf` is kept so the test descriptions can be encoded with the same fitted state.
tf = TfidfVectorizer()
x = tf.fit_transform(preprocessed_description)

In [7]:
#Encode labels
# Turn each genre label into an integer class id; `le` is kept so predictions
# can be decoded back to genre labels later.
le = LabelEncoder()
y = le.fit_transform(train_data['genre'])

In [8]:
# Base estimator: a support vector classifier with the library's default settings
classifier = SVC()

In [9]:
# Wrap the base estimator in a one-vs-rest multiclass strategy
model = OneVsRestClassifier(estimator=classifier)

In [10]:
from numpy import mean, std
from sklearn.model_selection import cross_val_score

# Wider grid kept for reference (presumably tried earlier -- TODO confirm):
# parameters = {'estimator__C':[1, 10, 100, 1000]}
parameters = {'estimator__C': [10]}
gsmodel = GridSearchCV(estimator=model, param_grid=parameters, cv=3)

# Score the whole grid search (3-fold inside) with cross_val_score's default
# outer splitting; error_score='raise' surfaces any fit failure immediately.
n_scores = cross_val_score(
    gsmodel,
    x,
    y,
    scoring='accuracy',
    n_jobs=-1,
    error_score='raise',
)

# evaluate model for classification
accuracy_mean = mean(n_scores)
accuracy_std = std(n_scores)
print('Accuracy: %.3f (%.3f)' % (accuracy_mean, accuracy_std))

Accuracy: 0.642 (0.009)


In [11]:
# Run the grid search on the full training set; the fitted search object is
# what gets used for prediction below.
best_model = gsmodel.fit(x, y)

In [12]:
# Show the hyper-parameter combination selected by the grid search
print(best_model.best_params_)

{'estimator__C': 10}


In [13]:
#Vectorize test data
# transform only (no re-fit): reuse the vectorizer state fitted on the training data
x_test = tf.transform(preprocessed_test_description)

In [14]:
# Predict encoded class ids for the test descriptions
pred = best_model.predict(x_test)

In [15]:
# Decode class ids back to genre labels.
# NOTE: this overwrites `pred`, so run this cell exactly once after the
# prediction cell; re-running it would feed decoded labels back into the decoder.
pred = le.inverse_transform(pred)

In [16]:
# Materialise the test ids and the decoded predictions as plain Python lists
cid, genre = list(test_data['id']), list(pred)

In [17]:
# Build the submission frame as a NEW DataFrame. Plain assignment
# (`pred_file = test_data`) only aliases the same object, so adding the column
# there would silently add a 'genre' column to test_data as well.
pred_file = test_data.assign(genre=genre)

In [18]:
# Write only the id and predicted genre columns, without the row index
pred_file.to_csv("SVCwithOnevsRest_CV.csv", columns=['id', 'genre'], index=False)