**imports**

In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import pickle

**Loading the training data along with the labels**

In [2]:
def _read_lines(path):
    """Return every line of the text file at `path` without its trailing newline.

    Blank lines are kept (as empty strings) so that line positions in the
    corpus file and the labels file stay aligned.
    """
    with open(path) as handle:
        return [line.rstrip('\n') for line in handle]


# Forward slashes are accepted by open() on Windows as well as on POSIX systems,
# and avoid the invalid escape sequences ("\c", "\l") of the backslash form.
corpus = _read_lines("data/corpus.txt")
emotions = _read_lines("data/labels.txt")

print("Number of sentences in the corpus = ",len(corpus))
print("Number of emotion labels = ",len(emotions))

# Sentence i is labelled by emotion i, so the two files must have the same
# number of lines; fail here rather than later during the train/test split.
assert len(corpus) == len(emotions), "corpus and labels files have different line counts"

Number of sentences in the corpus =  2400
Number of emotion labels =  2400


**Label Based Encoding for the emotions**

In [3]:
# Learn the set of distinct emotion strings, then replace every label with its
# integer class id. The fitted encoder is kept in `label_encoder` for later use.
label_encoder = LabelEncoder()
label_encoder.fit(emotions)
y = label_encoder.transform(emotions)

**Train test split**

In [4]:
# Hold out 20% of the samples for testing; random_state pins the shuffle so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    corpus,
    y,
    test_size=0.2,
    random_state=4,
)
# Partition sizes, one per line: train inputs, train labels, test inputs, test labels.
print(len(x_train), len(y_train), len(x_test), len(y_test), sep="\n")

1920
1920
480
480


**Hyper-parameter tuning for the TF-IDF vectorizer and the SVM model**

1. Creating the pipeline containing the TF-IDF vectorizer and the SVM model, along with the parameter grid to search over

In [5]:
# Two-step pipeline: raw text -> TF-IDF features -> support vector classifier.
tf_idf_pipeline = Pipeline(steps=[
    ("vectorizer", TfidfVectorizer()),
    ("model_svm", SVC()),
])

# Candidate values for the search. Each key is "<pipeline step name>__<parameter>",
# so the prefixes must match the step names used in the pipeline above.
param_grid = dict(
    vectorizer__stop_words=[None, "english"],
    vectorizer__ngram_range=[(1, 1), (1, 2), (2, 2)],
    model_svm__C=[0.6, 0.8, 1.0, 1.4, 1.8, 2.2],
    model_svm__kernel=["linear", "rbf", "sigmoid"],
)

2. Applying Grid Search on the combination

In [6]:
# Exhaustively evaluate every combination in param_grid with cv=10 on the
# training split only; n_jobs=-1 runs the fits in parallel.
grid_search = GridSearchCV(
    estimator=tf_idf_pipeline,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

# Pull the winning setting of each tuned parameter out of best_params_.
optimalC, optimal_kernel, optimal_ngram, optimal_stop_words = (
    grid_search.best_params_[key]
    for key in (
        "model_svm__C",
        "model_svm__kernel",
        "vectorizer__ngram_range",
        "vectorizer__stop_words",
    )
)

print("Optimal value of the regularization constant = ", optimalC)
print("Optimal value of the kernel = ", optimal_kernel)
print("Optimal value of ngram_range = ", optimal_ngram)
print("Optimal value of stop_words = ", optimal_stop_words)

Optimal value of the regularization constant =  1.4
Optimal value of the kernel =  sigmoid
Optimal value of ngram_range =  (1, 2)
Optimal value of stop_words =  english


**Using tf-idf vectorizer for vectorizing the samples**

In [7]:
# Rebuild the vectorizer with the settings selected by the grid search.
vectorizer = TfidfVectorizer(
    stop_words=optimal_stop_words,
    ngram_range=optimal_ngram,
)

# NOTE: x_train / x_test are rebound here from raw text to TF-IDF matrices, so
# this cell is not idempotent -- re-run the train/test split cell before
# re-running this one.
# The vectorizer is fitted on the training text only; the test text is
# transformed with that already-fitted vectorizer.
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

# (n_samples, n_features) of each matrix, one per line.
print(x_train.shape, x_test.shape, sep="\n")

(1920, 16800)
(480, 16800)


**Fitting the SVM Model**

In [8]:
# Final classifier, configured with the C and kernel chosen by the grid search
# and trained on the vectorized training split.
svm_classifier = SVC(
    kernel=optimal_kernel,
    C=optimalC,
    max_iter=-1,
)
svm_classifier.fit(x_train, y_train)

**Getting the predictions of the model**

In [9]:
# Predict class ids for both splits with the fitted classifier: the training
# split first, then the held-out test split.
y_train_pred, y_test_pred = (
    svm_classifier.predict(features) for features in (x_train, x_test)
)

**Printing the accuracy scores**

In [10]:
# accuracy_score's signature is (y_true, y_pred); both calls below pass the
# true labels first so the training and testing evaluations are consistent.
print("Training data metrics : ")
print(accuracy_score(y_train,y_train_pred))

print("Testing data accuracy")
print(accuracy_score(y_test,y_test_pred))

Training data metrics : 
0.9942708333333333
Testing data accuracy
0.825


**Saving the original model**

In [11]:
# Persist the classifier together with the fitted label encoder and vectorizer
# as a single tuple, in that order. The `with` block guarantees the file is
# flushed and closed even if pickling raises.
with open('model.pk1','wb') as model_file:
    pickle.dump((svm_classifier,label_encoder,vectorizer),model_file)