In [2]:
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, roc_auc_score
from sklearn.model_selection import train_test_split

# Read Data

In [3]:
path = '../Final Project/'

In [4]:
dataTraining = pd.read_csv(os.path.join(path, 'data', 'dataTraining.csv'), encoding='UTF-8', index_col=0)
dataTesting = pd.read_csv(os.path.join(path, 'data', 'dataTesting.csv'), encoding='UTF-8', index_col=0)

In [5]:
dataTesting.head()

Unnamed: 0,year,title,plot
1,1999,Message in a Bottle,"who meets by fate , shall be sealed by fate ...."
4,1978,Midnight Express,"the true story of billy hayes , an american c..."
5,1996,Primal Fear,martin vail left the chicago da ' s office to ...
6,1950,Crisis,husband and wife americans dr . eugene and mr...
7,1959,The Tingler,the coroner and scientist dr . warren chapin ...


In [6]:
dataTraining.head()

Unnamed: 0,year,title,plot,genres,rating
3107,2003,Most,most is the story of a single father who takes...,"['Short', 'Drama']",8.0
900,2008,How to Be a Serial Killer,a serial killer decides to teach the secrets o...,"['Comedy', 'Crime', 'Horror']",5.6
6724,1941,A Woman's Face,"in sweden , a female blackmailer with a disfi...","['Drama', 'Film-Noir', 'Thriller']",7.2
4704,1954,Executive Suite,"in a friday afternoon in new york , the presi...",['Drama'],7.4
2582,1990,Narrow Margin,"in los angeles , the editor of a publishing h...","['Action', 'Crime', 'Thriller']",6.6


In [7]:
vect = CountVectorizer(ngram_range=(1, 2), max_features=2000)
X_dtm = vect.fit_transform(dataTraining['plot'])
X_dtm.shape

(7895, 2000)

In [8]:
X_dtm

<7895x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 569377 stored elements in Compressed Sparse Row format>

In [9]:
dataTraining['genres'] = dataTraining['genres'].map(lambda x: eval(x))

le = MultiLabelBinarizer()
y_genres = le.fit_transform(dataTraining['genres'])

In [10]:
y_genres

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ..., 
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0]])

In [11]:
y_genres.shape

(7895, 24)

In [12]:
X_train, X_test, y_train_genres, y_test_genres = train_test_split(X_dtm, y_genres, test_size=0.3, random_state=42)

In [13]:
X_train.shape

(5526, 2000)

In [14]:
X_test.shape

(2369, 2000)

In [15]:
y_train_genres.shape

(5526, 24)

In [16]:
y_test_genres.shape

(2369, 24)

In [17]:
from keras.models import Sequential
from keras.utils import np_utils
from keras.layers import Dense, Dropout, Activation, BatchNormalization, Conv1D, GlobalMaxPooling1D
from keras.optimizers import RMSprop
from keras.callbacks import History
from livelossplot import PlotLossesKeras

Using TensorFlow backend.


In [18]:
model = Sequential()

model.add(Dense(2100, input_shape=(2000,)))
model.add(Activation('relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(24))
model.add(Activation('sigmoid'))

In [19]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 2100)              4202100   
_________________________________________________________________
activation_1 (Activation)    (None, 2100)              0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 2100)              8400      
_________________________________________________________________
dropout_1 (Dropout)          (None, 2100)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 24)                50424     
_________________________________________________________________
activation_2 (Activation)    (None, 24)                0         
Total params: 4,260,924
Trainable params: 4,256,724
Non-trainable params: 4,200
______________________________________________________________

In [20]:
model.compile("rmsprop", "categorical_crossentropy", metrics=["accuracy"])

In [21]:
model.fit(X_dtm,y_genres,epochs=50,verbose=2)

Epoch 1/50
 - 18s - loss: 7.5484 - acc: 0.2288
Epoch 2/50
 - 18s - loss: 6.1661 - acc: 0.3207
Epoch 3/50
 - 18s - loss: 5.8053 - acc: 0.3364
Epoch 4/50
 - 19s - loss: 5.5529 - acc: 0.3411
Epoch 5/50
 - 18s - loss: 5.3635 - acc: 0.3574
Epoch 6/50
 - 18s - loss: 5.2370 - acc: 0.3559
Epoch 7/50
 - 18s - loss: 5.1148 - acc: 0.3750
Epoch 8/50
 - 18s - loss: 5.0284 - acc: 0.3777
Epoch 9/50
 - 17s - loss: 5.0088 - acc: 0.3828
Epoch 10/50
 - 19s - loss: 4.9144 - acc: 0.3904
Epoch 11/50
 - 17s - loss: 4.8136 - acc: 0.4023
Epoch 12/50
 - 17s - loss: 4.7563 - acc: 0.3962
Epoch 13/50
 - 17s - loss: 4.6943 - acc: 0.4039
Epoch 14/50
 - 17s - loss: 4.6423 - acc: 0.4139
Epoch 15/50
 - 17s - loss: 4.5955 - acc: 0.4130
Epoch 16/50
 - 17s - loss: 4.5809 - acc: 0.4181
Epoch 17/50
 - 18s - loss: 4.5285 - acc: 0.4231
Epoch 18/50
 - 18s - loss: 4.5181 - acc: 0.4385
Epoch 19/50
 - 18s - loss: 4.4663 - acc: 0.4374
Epoch 20/50
 - 19s - loss: 4.4702 - acc: 0.4460
Epoch 21/50
 - 18s - loss: 4.4236 - acc: 0.4402
E

<keras.callbacks.History at 0x1a214e84a8>

In [22]:
# make predictions for testing set
y_pred = model.predict_classes(X_train)
y_pred

array([4, 7, 7, ..., 4, 7, 7])

In [23]:
y_train_one_c=y_train_genres.argmax(axis=1)

In [24]:
print('The accuracy of the nnet model is ',(y_pred==y_train_one_c).mean())

The accuracy of the nnet model is  0.49457111835


In [25]:
# make predictions for testing set
y_pred_test = model.predict_classes(X_test)
y_pred_test

array([7, 4, 7, ..., 0, 4, 4])

In [26]:
y_test_one_c=y_test_genres.argmax(axis=1)

In [27]:
print('The accuracy of the nnet model is ',(y_pred_test==y_test_one_c).mean())

The accuracy of the nnet model is  0.497678345293


In [30]:
X_test_dtm = vect.transform(dataTesting['plot'])

cols = ['p_Action', 'p_Adventure', 'p_Animation', 'p_Biography', 'p_Comedy', 'p_Crime', 'p_Documentary', 'p_Drama', 'p_Family',
        'p_Fantasy', 'p_Film-Noir', 'p_History', 'p_Horror', 'p_Music', 'p_Musical', 'p_Mystery', 'p_News', 'p_Romance',
        'p_Sci-Fi', 'p_Short', 'p_Sport', 'p_Thriller', 'p_War', 'p_Western']

y_pred_test_genres = model.predict_proba(X_test_dtm)

pd.DataFrame(y_pred_test_genres, index=dataTesting.index, columns=cols).to_csv('pred_genres_text_CNN.csv', index_label='ID')