Stochastic Gradient Descent Classifier

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vidhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the training and test CSV files (paths are relative to the working directory).
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("W22_P2_train.csv", "W22_P2_test.csv")
)

In [3]:
from nltk.stem import PorterStemmer
#function for preprocessing the list of description
# Patterns and tables used by preprocess(); built once instead of on every call.
_BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
_PUNCTUATION_RE = re.compile("[" + re.escape(string.punctuation) + "]")
_WHITESPACE_RE = re.compile(r'\s+')
_REMOVE_DIGITS = str.maketrans('', '', string.digits)
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, building it only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess(txt):
    """Normalise one description into a space-joined string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. turn every run of whitespace (newlines, tabs, ...) into one space so
         that words on adjacent lines are not glued together by step 3;
      3. delete every character outside ``[0-9a-z #+_]``, then all digits,
         then all ASCII punctuation -- only letters and spaces remain.
         Punctuation is deleted, not replaced, so "well-known" becomes
         "wellknown";
      4. tokenise with ``word_tokenize``, drop English stop words and
         Porter-stem what is left.

    Parameters
    ----------
    txt : str or None
        Raw description. ``None`` and float NaN (missing values) yield ``""``;
        any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The surviving stems joined by single spaces (possibly empty).

    Notes
    -----
    NOTE(review): ``word_tokenize`` relies on NLTK tokenizer data that the
    visible part of this notebook does not download explicitly (only
    ``stopwords`` is) -- confirm it is installed on a fresh machine.
    """
    # Missing values would otherwise crash on .lower(); NaN is the only float
    # that is not equal to itself.
    if txt is None or (isinstance(txt, float) and txt != txt):
        return ""

    txt = str(txt).lower()
    txt = _WHITESPACE_RE.sub(' ', txt)
    txt = _BAD_SYMBOLS_RE.sub('', txt)
    txt = txt.translate(_REMOVE_DIGITS)
    txt = _PUNCTUATION_RE.sub("", txt)

    stop_words = _english_stop_words()
    ps = PorterStemmer()
    return " ".join(ps.stem(w) for w in word_tokenize(txt) if w not in stop_words)

In [4]:
#Create a list of preprocessed descriptions from train data
# Iterate over the column's values directly: the previous `Series[i]` lookup
# inside range(len(...)) is label-based and only works while the index is
# exactly 0..n-1 (e.g. it breaks after rows are filtered out).
preprocessed_description = [preprocess(text) for text in train_data["description"]]

In [5]:
#Create a list of preprocessed descriptions from test data
# Iterate over the column's values directly: the previous `Series[i]` lookup
# inside range(len(...)) is label-based and only works while the index is
# exactly 0..n-1 (e.g. it breaks after rows are filtered out).
preprocessed_test_description = [preprocess(text) for text in test_data["description"]]

In [6]:
#Vectorize train data
from sklearn.feature_extraction.text import TfidfVectorizer

# Uni-, bi- and tri-grams that occur in at least 10 documents, capped at
# 20000 features. The fitted vectorizer `tf` is reused later for the test set.
tf = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=10,
    max_features=20000,
)
x = tf.fit_transform(preprocessed_description)

In [7]:
#Encode labels
# Fit the encoder on the genre column; `le` is kept so that predictions can be
# mapped back to genre names afterwards.
le = LabelEncoder()
y = le.fit_transform(train_data['genre'])

In [8]:
import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

# scikit-learn 1.1 renamed the logistic loss from "log" to "log_loss" and the
# old name was removed in 1.3, so pick the spelling the installed version
# understands. Unparseable version strings are treated as modern.
_version_match = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
_uses_old_name = (
    _version_match is not None
    and tuple(map(int, _version_match.groups())) < (1, 1)
)
logistic_loss = "log" if _uses_old_name else "log_loss"

# Hyper-parameter grid; the "estimator__" prefix routes each parameter to the
# SGDClassifier wrapped inside OneVsRestClassifier.
params = {
    "estimator__loss" : ["hinge", logistic_loss, "modified_huber"],
    "estimator__alpha" : [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1]
}

# SGD shuffles the training data between epochs; fixing the seed makes the
# reported scores and the selected parameters reproducible across runs.
RANDOM_STATE = 42

clf = SGDClassifier(max_iter=2000, random_state=RANDOM_STATE)
onevsrest = OneVsRestClassifier(clf)
model = GridSearchCV(onevsrest, param_grid=params,verbose=10,cv=3)
# Nested cross-validation: cross_val_score runs the whole 3-fold grid search
# inside each of its own folds. error_score='raise' surfaces fit failures
# instead of silently recording NaN scores.
n_scores = cross_val_score(model, x, y, scoring='accuracy', n_jobs=-1, error_score='raise')

In [9]:
# evaluate model for classification
from numpy import mean, std

# Summarise the per-fold accuracies as "mean (standard deviation)".
accuracy_mean = mean(n_scores)
accuracy_std = std(n_scores)
print('Accuracy: %.3f (%.3f)' % (accuracy_mean, accuracy_std))

Accuracy: 0.650 (0.011)


In [10]:
# Run the grid search on the full training matrix and display the winning
# parameter combination (fit() returns the fitted search object itself).
model.fit(x, y).best_params_

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV 1/3; 1/18] START estimator__alpha=1e-06, estimator__loss=hinge..............
[CV 1/3; 1/18] END estimator__alpha=1e-06, estimator__loss=hinge; total time=   0.0s
[CV 2/3; 1/18] START estimator__alpha=1e-06, estimator__loss=hinge..............
[CV 2/3; 1/18] END estimator__alpha=1e-06, estimator__loss=hinge; total time=   0.0s
[CV 3/3; 1/18] START estimator__alpha=1e-06, estimator__loss=hinge..............
[CV 3/3; 1/18] END estimator__alpha=1e-06, estimator__loss=hinge; total time=   0.0s
[CV 1/3; 2/18] START estimator__alpha=1e-06, estimator__loss=log................
[CV 1/3; 2/18] END estimator__alpha=1e-06, estimator__loss=log; total time=   0.0s
[CV 2/3; 2/18] START estimator__alpha=1e-06, estimator__loss=log................
[CV 2/3; 2/18] END estimator__alpha=1e-06, estimator__loss=log; total time=   0.0s
[CV 3/3; 2/18] START estimator__alpha=1e-06, estimator__loss=log................
[CV 3/3; 2/18] END estimator__al

[CV 2/3; 17/18] END estimator__alpha=0.1, estimator__loss=log; total time=   0.0s
[CV 3/3; 17/18] START estimator__alpha=0.1, estimator__loss=log.................
[CV 3/3; 17/18] END estimator__alpha=0.1, estimator__loss=log; total time=   0.0s
[CV 1/3; 18/18] START estimator__alpha=0.1, estimator__loss=modified_huber......
[CV 1/3; 18/18] END estimator__alpha=0.1, estimator__loss=modified_huber; total time=   0.0s
[CV 2/3; 18/18] START estimator__alpha=0.1, estimator__loss=modified_huber......
[CV 2/3; 18/18] END estimator__alpha=0.1, estimator__loss=modified_huber; total time=   0.0s
[CV 3/3; 18/18] START estimator__alpha=0.1, estimator__loss=modified_huber......
[CV 3/3; 18/18] END estimator__alpha=0.1, estimator__loss=modified_huber; total time=   0.0s


{'estimator__alpha': 0.0001, 'estimator__loss': 'log'}

In [11]:
#Vectorize test data
# Only transform() is called here: the vocabulary and IDF weights come from the
# vectorizer `tf` that was fitted on the training descriptions.
x_test=tf.transform(preprocessed_test_description)

In [12]:
# Predict encoded class labels for the test matrix with the fitted grid search.
pred=model.predict(x_test)

In [13]:
# Map the encoded predictions back to genre names using the encoder fitted on
# the training labels.
# NOTE(review): this overwrites `pred`, so the cell is not idempotent --
# re-running it without re-running the prediction cell passes already-decoded
# labels to inverse_transform, which presumably fails.
pred=le.inverse_transform(pred)

In [14]:
# Plain Python lists of the test ids and of the decoded genre predictions.
# NOTE(review): `cid` is not referenced by the cells shown after this one.
cid=list(test_data['id'])
genre=list(pred)

In [15]:
# Build the submission frame as a new DataFrame. The previous
# `pred_file = test_data` only created an alias, so adding the column also
# modified `test_data` in place; assign() returns a copy with the extra column.
pred_file = test_data.assign(genre=genre)

In [16]:
# Keep only the id and predicted genre columns and write them without the
# DataFrame index.
submission = pred_file[['id', 'genre']]
submission.to_csv("SGD_with_one_vs.csv", index=False)