**imports**

In [66]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
import pickle

**Loading the training data along with the labels**

In [67]:
def _read_lines(path):
    """Return the lines of the text file at `path`, one string per line.

    The trailing newline of each line is removed. Blank lines are kept as
    empty strings so that line i of one file still corresponds to line i of
    the other.
    """
    with open(path) as handle:
        return [line[:-1] if line.endswith('\n') else line for line in handle]


# One sentence per line in corpus.txt, one emotion label per line in labels.txt.
corpus = _read_lines("corpus.txt")
emotions = _read_lines("labels.txt")

# Every sentence needs exactly one label; fail here with a clear message
# instead of letting a later cell trip over mismatched lengths.
if len(corpus) != len(emotions):
    raise ValueError(
        "corpus.txt has %d lines but labels.txt has %d lines"
        % (len(corpus), len(emotions))
    )

print("Number of sentences in the corpus = ",len(corpus))
print("Number of emotion labels = ",len(emotions))

Number of sentences in the corpus =  2400
Number of emotion labels =  2400


**Label Based Encoding for the emotions**

In [68]:
# Turn the emotion strings into integer class ids. The fitted encoder is kept
# in `label_encoder` so the ids can be mapped back to their labels later.
label_encoder = LabelEncoder()
label_encoder.fit(emotions)
y = label_encoder.transform(emotions)

**Train test split**

In [69]:
# Hold out 20% of the sentences for testing; the fixed random_state keeps the
# split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    corpus,
    y,
    test_size=0.2,
    random_state=4,
)

# Sizes, in order: train inputs, train labels, test inputs, test labels.
for split_part in (x_train, y_train, x_test, y_test):
    print(len(split_part))

1920
1920
480
480


**Using a TF-IDF vectorizer to vectorize the samples**

In [70]:
# The vocabulary and IDF weights are learned from the training sentences only;
# the test sentences are then projected onto that same vocabulary.
# NOTE: this cell rebinds x_train / x_test from raw text to TF-IDF matrices,
# so re-run the train/test split cell before running this one again.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

# (n_samples, n_vocabulary_terms) for the train and test matrices.
for tfidf_matrix in (x_train, x_test):
    print(tfidf_matrix.shape)

(1920, 4756)
(480, 4756)


**Using Grid Search CV for optimal Hyper-parameters**

In [71]:
# Candidate settings: six regularization strengths crossed with three kernels,
# each scored by 5-fold cross-validated accuracy on the training data.
param_grid = dict(
    C=[0.6, 0.8, 1.0, 1.4, 1.8, 2.2],
    kernel=['linear', 'rbf', 'sigmoid'],
)
svm_model = SVC()
grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # use every available core
)
grid_search.fit(x_train, y_train)

# Pull the winning combination out for the next cell.
best_params = grid_search.best_params_
optimalC, optimal_kernel = best_params["C"], best_params["kernel"]
print("Optimal value of the regularization constant = ", optimalC)
print("Optimal value of the kernel = ", optimal_kernel)

Optimal value of the regularization constant =  1.8
Optimal value of the kernel =  sigmoid


**Fitting the SVM Model**

In [72]:
# Train a standalone SVC with the hyper-parameters chosen by the grid search.
# max_iter=-1 leaves the solver without an iteration cap.
svm_classifier = SVC(
    C=optimalC,
    kernel=optimal_kernel,
    max_iter=-1,
)
svm_classifier.fit(x_train, y_train)

**Getting the predictions of the model**

In [73]:
# Predict on both splits so training and test accuracy can be compared.
y_train_pred, y_test_pred = (
    svm_classifier.predict(features) for features in (x_train, x_test)
)

**Printing the accuracy scores**

In [74]:
# accuracy_score expects (y_true, y_pred). Accuracy itself does not depend on
# the order, but passing ground truth first in both calls keeps this cell
# correct if the metric is later swapped for one where the order matters.
print("Training data metrics : ")
print(accuracy_score(y_train,y_train_pred))

print("Testing data accuracy")
print(accuracy_score(y_test,y_test_pred))

Training data metrics : 
0.9744791666666667
Testing data accuracy
0.7520833333333333


**Performing dimensionality reduction**

In [75]:
# PCA needs dense input; toarray() gives a plain ndarray directly, without the
# intermediate np.matrix that todense() produces.
x_train_dense = x_train.toarray()
x_test_dense = x_test.toarray()

# PCA can keep at most min(n_samples, n_features) components. Deriving the
# count from the training matrix (instead of hard-coding 1920) keeps this cell
# working when the corpus size or the split ratio changes.
# NOTE(review): at this maximum no components are dropped; choose a smaller
# n_components if an actual reduction in dimensionality is intended.
pca = PCA(n_components=min(x_train_dense.shape))

# Fit the projection on the training data only, then apply it to the test data.
x_train_2 = pca.fit_transform(x_train_dense)
x_test_2 = pca.transform(x_test_dense)
print(x_train_2.shape)
print(x_test_2.shape)

(1920, 1920)
(480, 1920)


**Grid Search**

In [83]:
# Same search as for the TF-IDF features, now on the PCA-transformed data:
# six C values crossed with three kernels, 5-fold cross-validated accuracy.
# NOTE(review): the recorded run picked C = 2.2, the largest value in the
# grid; consider extending the range upwards.
param_grid = dict(
    C=[0.6, 0.8, 1.0, 1.4, 1.8, 2.2],
    kernel=['linear', 'rbf', 'sigmoid'],
)
svm_model = SVC()
grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # use every available core
)
grid_search.fit(x_train_2, y_train)

# Pull the winning combination out for the next cell.
best_params_2 = grid_search.best_params_
optimalC2, optimal_kernel2 = best_params_2["C"], best_params_2["kernel"]
print("Optimal value of the regularization constant = ", optimalC2)
print("Optimal value of the kernel = ", optimal_kernel2)

Optimal value of the regularization constant =  2.2
Optimal value of the kernel =  sigmoid


**Training the SVM model on the reduced dimensionality data**

In [84]:
# Train a standalone SVC on the PCA features with the hyper-parameters chosen
# by the second grid search. max_iter=-1 leaves the solver uncapped.
svm_classifier_2 = SVC(
    C=optimalC2,
    kernel=optimal_kernel2,
    max_iter=-1,
)
svm_classifier_2.fit(x_train_2, y_train)

**Getting the predictions of the model**

In [85]:
# Predict on both PCA-transformed splits for the accuracy comparison below.
y_train_pred_2, y_test_pred_2 = (
    svm_classifier_2.predict(features) for features in (x_train_2, x_test_2)
)

**Printing the accuracy scores of the model**

In [86]:
# accuracy_score expects (y_true, y_pred). Accuracy itself does not depend on
# the order, but passing ground truth first in both calls keeps this cell
# correct if the metric is later swapped for one where the order matters.
print("Training data metrics : ")
print(accuracy_score(y_train,y_train_pred_2))

print("Testing data accuracy")
print(accuracy_score(y_test,y_test_pred_2))

Training data metrics : 
0.9833333333333333
Testing data accuracy
0.7625


**Saving the model**

**Saving the original model**

In [81]:
# Write the TF-IDF SVC to disk. The with-block closes the file as soon as the
# dump finishes instead of leaving the handle open until garbage collection.
# NOTE(review): only the classifier is stored; predicting on new text also
# needs the fitted `vectorizer` and `label_encoder` - confirm they are
# persisted somewhere.
with open('model.pk1','wb') as model_file:
    pickle.dump(svm_classifier,model_file)

**Saving the model with PCA applied**

In [82]:
# Write the PCA-feature SVC to disk. The with-block closes the file as soon as
# the dump finishes instead of leaving the handle open until garbage collection.
# NOTE(review): only the classifier is stored; predicting on new text also
# needs the fitted `vectorizer`, `pca` and `label_encoder` - confirm they are
# persisted somewhere.
with open('model_PCA.pk1','wb') as model_file:
    pickle.dump(svm_classifier_2,model_file)

Regularization Strength:

Check Between : 1 to 2.5

Kernel: choose one of linear, rbf, or sigmoid

Best performance so far: C = 2.2 with kernel = sigmoid