**imports**

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import pickle

**Loading the training data along with the labels**

In [14]:
def _read_lines(path):
    """Return every line of the text file at ``path`` without its trailing newline."""
    with open(path) as text_file:
        # Iterating a text file yields lines that end in at most one '\n'.
        return [line.rstrip('\n') for line in text_file]


# Forward slashes work on Windows as well as POSIX systems. The previous
# "data\corpus.txt" / "data\labels.txt" literals depended on the unrecognised
# escape sequences "\c" and "\l" and could only resolve on Windows.
corpus = _read_lines("data/corpus.txt")
emotions = _read_lines("data/labels.txt")

print("Number of sentences in the corpus = ",len(corpus))
print("Number of emotion labels = ",len(emotions))

# Sentence i is labelled by emotion i, so the two files must have the same length.
if len(corpus) != len(emotions):
    raise ValueError(
        "corpus and labels files have different numbers of lines: "
        f"{len(corpus)} != {len(emotions)}"
    )

Number of sentences in the corpus =  2400
Number of emotion labels =  2400


**Label Based Encoding for the emotions**

In [15]:
# Learn the mapping from emotion names to integer class ids, then encode
# every label with it. The fitted encoder is kept so the mapping can be reused.
label_encoder = LabelEncoder().fit(emotions)
y = label_encoder.transform(emotions)

**Train test split**

In [16]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    corpus,
    y,
    test_size=0.2,
    random_state=4,
)
# Sanity check: one size per line, in the order x_train, y_train, x_test, y_test.
print(len(x_train), len(y_train), len(x_test), len(y_test), sep='\n')

1920
1920
480
480


**Hyper-Parameter Tuning for tf-idf vectorizer**

1. Creating the pipeline using the SVM model for judging the performance of the tf-idf vectorizer

In [17]:
# Two-step pipeline: tf-idf features ('tfidf') feeding an SVM classifier
# ('model_svm'). Both steps start with their default hyper-parameters.
tf_idf_pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('model_svm', SVC()),
    ]
)

2. Defining the parameter grid for a grid search over the vectorizer and SVM hyper-parameters

In [18]:
# Search space for `tf_idf_pipeline`. Parameters of a pipeline step must be
# addressed as '<step name>__<parameter>', so the vectorizer settings carry
# the 'tfidf__' prefix and the classifier settings the 'model_svm__' prefix.
param_grid={
    'tfidf__max_df':[0.25,0.5],
    # An empty candidate list is rejected by grid search; 1 is the
    # vectorizer's default, 2 and 5 drop increasingly rare terms.
    'tfidf__min_df':[1,2,5],
    'tfidf__ngram_range':[(1,1),(1,2),(2,2)],
    'model_svm__C':[0.6,0.8,1.0,1.4,1.8,2.2],
    'model_svm__kernel':['linear','rbf','sigmoid'],
}

**Using tf-idf vectorizer for vectorizing the samples**

In [19]:
# Unigram tf-idf features with English stop words removed.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1,1),
)
# NOTE: x_train / x_test are rebound from raw sentences to feature matrices
# here, so re-running this cell requires re-running the train/test split first.
# The vectorizer is fitted on the training sentences only and then reused,
# unchanged, to transform the test sentences.
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)
print(x_train.shape, x_test.shape, sep='\n')

(1920, 4495)
(480, 4495)


**Using Grid Search CV for optimal Hyper-parameters**

In [20]:
# Candidate regularization constants and kernels for the SVM.
param_grid = dict(
    C=[0.4,0.6,0.8,1.0,1.4,1.8,2.2,2],
    kernel=['linear','rbf','sigmoid'],
)
svm_model = SVC()

# Exhaustive search over the grid, scored by accuracy with cv=10;
# n_jobs=-1 lets scikit-learn run the fits in parallel.
grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search.fit(x_train,y_train)

# Keep the winning settings so the final model can be built from them.
optimalC, optimal_kernel = (
    grid_search.best_params_[name] for name in ("C", "kernel")
)
print("Optimal value of the regularization constant = ",optimalC)
print("Optimal value of the kernel = ",optimal_kernel)

Optimal value of the regularization constant =  1.4
Optimal value of the kernel =  linear


**Fitting the SVM Model**

In [21]:
# Build the final classifier from the hyper-parameters selected by the grid
# search and train it on the vectorized training set.
svm_classifier = SVC(
    kernel=optimal_kernel,
    C=optimalC,
    max_iter=-1,
)
svm_classifier.fit(x_train, y_train)

**Getting the predictions of the model**

In [22]:
# Predict the encoded emotion for the training features first, then the test features.
y_train_pred, y_test_pred = (
    svm_classifier.predict(features) for features in (x_train, x_test)
)

**Printing the accuracy scores**

In [23]:
# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, so the
# printed values are unaffected, but both calls now use the documented order,
# which matters if another metric is ever swapped in.
print("Training data metrics : ")
print(accuracy_score(y_train,y_train_pred))

print("Testing data accuracy")
print(accuracy_score(y_test,y_test_pred))

Training data metrics : 
0.9942708333333333
Testing data accuracy
0.8208333333333333


**Saving the original model**

In [24]:
# Persist the classifier together with the fitted label encoder and vectorizer,
# as one tuple, so all three can be restored from a single file.
# The `with` block closes the file even if pickling fails, so the handle is
# not leaked and the data is flushed to disk.
# NOTE(review): the extension is 'pk1' (digit one), possibly a typo for 'pkl';
# the name is kept unchanged because other code may load this exact file.
with open('model.pk1','wb') as model_file:
    pickle.dump((svm_classifier,label_encoder,vectorizer),model_file)