In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
moviesData = pd.read_csv('CleanedDatasetForML.csv')

In [3]:
moviesData.head(10)

Unnamed: 0.1,Unnamed: 0,Title,Selected_Genre,Clean_Plot
0,6,The Great Train Robbery,action,the film opens with two bandits breaking into ...
1,7,The Suburbanite,comedy,the film is about a family who move to the sub...
2,13,Daniel Boone,biography,boone daughter befriends an indian maiden as ...
3,14,How Brown Saw the Baseball Game,comedy,before heading out to a baseball game at a nea...
4,15,Laughing Gas,comedy,the plot is that of a black woman going to the...
5,16,The Adventures of Dollie,drama,on a beautiful summer day a father and mother ...
6,17,The Black Viper,drama,a thug accosts a girl as she leaves her workpl...
7,18,A Calamitous Elopement,comedy,a young couple decides to elope after being ca...
8,19,The Call of the Wild,action,a white girl (florence lawrence) rejects a pro...
9,20,A Christmas Carol,drama,no prints of the first american film adaptatio...


In [4]:
moviesData['Split_Genre']=moviesData['Selected_Genre'].str.split('|')
moviesData['Split_Genre']= moviesData['Split_Genre'].apply(np.sort).apply(np.unique)

In [5]:
moviesData.head(50)

Unnamed: 0.1,Unnamed: 0,Title,Selected_Genre,Clean_Plot,Split_Genre
0,6,The Great Train Robbery,action,the film opens with two bandits breaking into ...,[action]
1,7,The Suburbanite,comedy,the film is about a family who move to the sub...,[comedy]
2,13,Daniel Boone,biography,boone daughter befriends an indian maiden as ...,[biography]
3,14,How Brown Saw the Baseball Game,comedy,before heading out to a baseball game at a nea...,[comedy]
4,15,Laughing Gas,comedy,the plot is that of a black woman going to the...,[comedy]
5,16,The Adventures of Dollie,drama,on a beautiful summer day a father and mother ...,[drama]
6,17,The Black Viper,drama,a thug accosts a girl as she leaves her workpl...,[drama]
7,18,A Calamitous Elopement,comedy,a young couple decides to elope after being ca...,[comedy]
8,19,The Call of the Wild,action,a white girl (florence lawrence) rejects a pro...,[action]
9,20,A Christmas Carol,drama,no prints of the first american film adaptatio...,[drama]


In [6]:
moviesData = pd.concat([moviesData, moviesData.Split_Genre.apply(lambda x: '-'.join(x)).str.get_dummies(sep='-')], axis=1)
trainData, testData = train_test_split(moviesData, random_state=0, test_size=0.20, shuffle=True)


In [7]:
tfidf = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')

In [8]:
x_train = tfidf.fit_transform(trainData.Clean_Plot) 
x_test  = tfidf.transform(testData.Clean_Plot)

In [9]:
y_train = trainData[trainData.columns[4:]]
y_test = testData[testData.columns[4:]]


In [10]:
y_train.head(50)

Unnamed: 0,Split_Genre,action,animation,biography,black,children,comedy,crime,documentary,drama,fantasy,fiction,history,musical,mystery,romance,science,series,thriller
19824,[drama],0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
16969,[comedy],0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
23041,[action],1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
865,[action],1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
23297,[drama],0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4109,[drama],0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
11468,[drama],0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3199,[action],1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10784,[action],1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19407,[drama],0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [11]:
categories = np.array(y_train.columns[1:])

In [12]:
categories

array(['action', 'animation', 'biography', 'black', 'children', 'comedy',
       'crime', 'documentary', 'drama', 'fantasy', 'fiction', 'history',
       'musical', 'mystery', 'romance', 'science', 'series', 'thriller'],
      dtype=object)

In [13]:
model = MultinomialNB(fit_prior=True, class_prior=None)
naiveBayes = pd.DataFrame(columns=['Genre', 'accuracy'])

In [14]:
i = 0
for genre in categories:
    model.fit(x_train, y_train[genre])
    pred = model.predict(x_test)
    print("GENRE: " + genre)
    print('')
    print('cross validation....')
    scores = cross_val_score(model, x_train, y_train[genre], cv=5)
    print(scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print('')
    print('confusion matrix')
    print(confusion_matrix(y_test[genre], pred))
    print('')
    print(classification_report(y_test[genre], pred))
    naiveBayes.loc[i,'Genre'] = genre
    naiveBayes.loc[i,'accuracy'] = accuracy_score(y_test[genre], pred)
    i=i+1

GENRE: action

cross validation....
[0.78521653 0.79263132 0.79337814 0.78366941 0.79213343]
Accuracy: 0.79 (+/- 0.01)

confusion matrix
[[3822  407]
 [ 292  501]]

              precision    recall  f1-score   support

           0       0.93      0.90      0.92      4229
           1       0.55      0.63      0.59       793

    accuracy                           0.86      5022
   macro avg       0.74      0.77      0.75      5022
weighted avg       0.87      0.86      0.86      5022

GENRE: animation

cross validation....
[0.92956695 0.92730894 0.92631317 0.93203883 0.93029624]
Accuracy: 0.93 (+/- 0.00)

confusion matrix
[[4822   47]
 [  62   91]]

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4869
           1       0.66      0.59      0.63       153

    accuracy                           0.98      5022
   macro avg       0.82      0.79      0.81      5022
weighted avg       0.98      0.98      0.98      5022

GENRE: biogra

In [15]:
naiveBayes.to_csv('NaiveBayesModelAccuracy.csv',sep=',')

In [16]:
naiveBayes

Unnamed: 0,Genre,accuracy
0,action,0.860812
1,animation,0.978295
2,biography,0.976703
3,black,0.968339
4,children,0.971724
5,comedy,0.752489
6,crime,0.92035
7,documentary,0.993429
8,drama,0.705894
9,fantasy,0.985066
