In [26]:
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
import pickle
import torch
import torch.nn as nn

import warnings
# Installs a blanket "ignore" filter: no category is given, so every warning
# raised after this point in the session is suppressed.
# NOTE(review): consider narrowing this to specific warning categories so
# genuine problems reported by the libraries used here stay visible.
warnings.filterwarnings("ignore")

In [2]:
# Feature matrix, read as-is from the CSV export.
x_train = pd.read_csv('../data/x_train.csv')

# Labels: only the 'Genre' column of the label file is kept, as a NumPy array.
y_train = pd.read_csv('../data/y_train.csv')['Genre'].values

print(x_train.shape)

(54214, 384)


In [3]:
# Preview the first rows of the feature matrix (head() defaults to 5 rows).
x_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.100113,0.047543,-0.017568,0.022169,-0.008113,0.066285,0.044294,-0.001988,0.023515,-0.096185,...,0.01754,-0.043811,-0.008418,0.009575,0.004424,0.008141,0.030389,0.016892,0.053477,0.006697
1,-0.030218,0.053955,-0.115293,-0.023189,-0.023016,-0.004647,0.005791,-0.061413,0.050253,-0.013987,...,0.017131,-0.097131,0.016678,0.09024,-0.049587,0.036313,0.082832,-0.045983,0.112446,-0.038247
2,0.055413,0.067351,0.057936,0.040602,0.043434,-0.006236,0.001159,-0.051813,0.067461,-0.027024,...,0.019969,-0.039349,0.003698,0.012504,-0.134172,0.041827,-0.004911,0.070233,-0.081329,-0.061537
3,-0.051497,-0.011621,0.006114,-0.038689,-0.042185,0.065028,0.019505,0.038383,0.037409,-0.109517,...,0.027292,-0.039455,-0.035343,0.039774,-0.003396,0.129379,-0.002248,0.027673,0.096751,-0.03226
4,-0.044385,-0.024631,-0.00233,0.082546,0.054457,-0.025413,0.047433,-0.054462,-0.003557,-0.085276,...,0.063805,0.055537,-0.010916,0.055499,-0.081727,0.052276,0.066995,-0.018595,-0.091746,-0.009528


## Multinomial Naive Bayes

In [24]:
# Rescale every feature column to [0, 1]. The raw features contain negative
# values, which MultinomialNB does not accept.
scaler = MinMaxScaler()
x_t = scaler.fit_transform(x_train)

# Fit the classifier on the rescaled features (fit() returns the estimator).
NB = MultinomialNB(alpha=0.9).fit(x_t, y_train)

print("Training Accuracy: ", NB.score(x_t, y_train))

Training Accuracy:  0.5203637436824436


In [25]:
# Persist the fitted Naive Bayes model. The context manager guarantees the
# file is flushed and closed, even if pickling raises part-way through.
# NB was trained on the MinMax-scaled features (`x_t`), so anything that loads
# this pickle must first apply the same fitted `scaler` to its inputs.
with open('models/NB.pkl', 'wb') as model_file:
    pickle.dump(NB, model_file)

In [13]:
# Per-class precision / recall / F1 of the Naive Bayes model, evaluated on the
# same (scaled) data it was trained on.
print(
    classification_report(
        y_true=y_train,
        y_pred=NB.predict(x_t),
    )
)

              precision    recall  f1-score   support

      comedy       0.69      0.00      0.00      7447
 documentary       0.63      0.72      0.67     13096
       drama       0.52      0.64      0.57     13613
       other       0.45      0.67      0.54     14985
       short       0.00      0.00      0.00      5073

    accuracy                           0.52     54214
   macro avg       0.46      0.41      0.36     54214
weighted avg       0.50      0.52      0.46     54214



## SVM

### Sigmoid

In [5]:
# Sigmoid-kernel SVM trained on the unscaled features. `degree` is left at its
# default of 3; it only matters for the 'poly' kernel.
svm_sig = SVC(kernel='sigmoid', gamma='auto', C=1.0).fit(x_train, y_train)

print("Training Accuracy: ", svm_sig.score(x_train, y_train))

Training Accuracy:  0.5411148411849338


In [7]:
# Persist the fitted sigmoid-kernel SVM. The context manager guarantees the
# file is flushed and closed, even if pickling raises part-way through.
with open('models/SVM_sig.pkl', 'wb') as model_file:
    pickle.dump(svm_sig, model_file)

In [8]:
# Per-class precision / recall / F1 of the sigmoid-kernel SVM on the training
# data.
print(
    classification_report(
        y_true=y_train,
        y_pred=svm_sig.predict(x_train),
    )
)

              precision    recall  f1-score   support

      comedy       1.00      0.00      0.00      7447
 documentary       0.62      0.79      0.70     13096
       drama       0.51      0.70      0.59     13613
       other       0.50      0.62      0.56     14985
       short       0.00      0.00      0.00      5073

    accuracy                           0.54     54214
   macro avg       0.53      0.42      0.37     54214
weighted avg       0.55      0.54      0.47     54214



### RBF

In [12]:
# RBF-kernel SVM trained on the unscaled features. `degree` is left at its
# default of 3; it only matters for the 'poly' kernel.
svm_rbf = SVC(kernel='rbf', gamma='auto', C=1.0).fit(x_train, y_train)

print("Training Accuracy: ", svm_rbf.score(x_train, y_train))

Training Accuracy:  0.5550595787066072


In [14]:
# Persist the fitted RBF-kernel SVM. The context manager guarantees the file
# is flushed and closed, even if pickling raises part-way through.
with open('models/SVM_rbf.pkl', 'wb') as model_file:
    pickle.dump(svm_rbf, model_file)

In [13]:
# Per-class precision / recall / F1 of the RBF-kernel SVM on the training data.
print(
    classification_report(
        y_true=y_train,
        y_pred=svm_rbf.predict(x_train),
    )
)

              precision    recall  f1-score   support

      comedy       0.74      0.04      0.07      7447
 documentary       0.63      0.81      0.71     13096
       drama       0.51      0.72      0.60     13613
       other       0.53      0.63      0.57     14985
       short       0.00      0.00      0.00      5073

    accuracy                           0.56     54214
   macro avg       0.48      0.44      0.39     54214
weighted avg       0.53      0.56      0.49     54214



## Logistic Regression

In [18]:
# Logistic regression with the library's default settings, trained on the
# unscaled features (fit() returns the estimator).
lr = LogisticRegression().fit(x_train, y_train)

print("Training Accuracy: ", lr.score(x_train, y_train))

Training Accuracy:  0.6318478621758218


In [20]:
# Persist the fitted logistic-regression model. The context manager guarantees
# the file is flushed and closed, even if pickling raises part-way through.
with open('models/LR.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)

In [21]:
# Per-class precision / recall / F1 of the logistic-regression model on the
# training data.
print(
    classification_report(
        y_true=y_train,
        y_pred=lr.predict(x_train),
    )
)

              precision    recall  f1-score   support

      comedy       0.60      0.50      0.54      7447
 documentary       0.71      0.79      0.75     13096
       drama       0.60      0.68      0.63     13613
       other       0.62      0.64      0.63     14985
       short       0.55      0.28      0.37      5073

    accuracy                           0.63     54214
   macro avg       0.61      0.58      0.59     54214
weighted avg       0.63      0.63      0.62     54214

