#### Name: Wentao Hao
#### Student ID: 1096215
#### Username: haowh

In [1]:
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

In [2]:
# Load every split of the dataset; all tables are tab-separated.
def _load_tsv(path):
    """Read one tab-separated file into a DataFrame."""
    return pd.read_csv(path, sep = '\t')
train_features = _load_tsv('./data/train_features.tsv')
train_labels = _load_tsv('./data/train_labels.tsv')
valid_features = _load_tsv('./data/valid_features.tsv')
valid_labels = _load_tsv('./data/valid_labels.tsv')
test_features = _load_tsv('./data/test_features.tsv')

In [3]:
# Row counts of the train / validation / test feature tables, in that order.
tuple(len(split) for split in (train_features, valid_features, test_features))

(5240, 299, 235)

In [4]:
# Distinct genre labels that occur in the training labels.
train_labels['genres'].unique()

array(['Thriller', 'Romance', 'Crime', 'Comedy', 'Musical', 'Documentary',
       'Drama', 'Adventure', 'War', 'Horror', 'Children', 'Film_Noir',
       'Sci_Fi', 'Mystery', 'Fantasy', 'Action', 'Western', 'Animation'],
      dtype=object)

In [5]:
# One-hot encode the comma-separated `tag` column: 1 if the row contains the
# tag, 0 otherwise. The tag vocabulary is taken from the training split only,
# so all three splits get the same indicator columns in the same order; tags
# that occur only in the validation/test data are ignored.
def add_tag_indicators(features, vocabulary):
    """Return `features` with one 0/1 indicator column appended per tag.

    Parameters
    ----------
    features : pandas.DataFrame
        Table with a `tag` column holding comma-separated tag strings.
    vocabulary : sequence of str
        Tags to encode; one column is produced per entry, in this order.

    Returns
    -------
    pandas.DataFrame
        A new frame: the columns of `features` followed by the indicator
        columns. A column of `features` whose name equals a tag is replaced
        by that tag's indicator. Rows whose `tag` is missing (not a string)
        get 0 in every indicator column instead of raising.
    """
    # Split each row once, rather than once per tag.
    row_tags = [set(value.split(',')) if isinstance(value, str) else set()
                for value in features.tag]
    indicators = pd.DataFrame(
        {tag: [int(tag in present) for present in row_tags] for tag in vocabulary},
        index = features.index,
    )
    # Drop indicator columns left over from an earlier run of this cell so that
    # re-running it never produces duplicate column names.
    stale = [tag for tag in vocabulary if tag in features.columns]
    # One concat instead of hundreds of single-column inserts, which would
    # fragment the frame.
    return pd.concat([features.drop(columns = stale), indicators], axis = 1)
tags = train_features.tag.str.split(',', expand = True).stack().unique()
train_features = add_tag_indicators(train_features, tags)
valid_features = add_tag_indicators(valid_features, tags)
test_features = add_tag_indicators(test_features, tags)

In [6]:
# Extract the model inputs and targets as numpy arrays.
# Feature layout once the leading non-feature columns are skipped:
#   avf1-avf107   107 features
#   ivec1-ivec20   20 features
#   tags          200 features
#   total         327 features
# Columns before this position are not used as features -- presumably
# identifiers/metadata such as movieId and the raw tag string; verify against
# the TSV header.
FIRST_FEATURE_COL = 5
train_X = train_features.iloc[:, FIRST_FEATURE_COL:].to_numpy()
train_y = train_labels.genres.to_numpy()
valid_X = valid_features.iloc[:, FIRST_FEATURE_COL:].to_numpy()
valid_y = valid_labels.genres.to_numpy()
test_X = test_features.iloc[:, FIRST_FEATURE_COL:].to_numpy()
# The classifiers are fit on train_X, so every split must expose the same
# number of feature columns, and features must pair up with labels row by row.
assert valid_X.shape[1] == train_X.shape[1], "validation features do not match training features"
assert test_X.shape[1] == train_X.shape[1], "test features do not match training features"
assert len(train_X) == len(train_y), "training features and labels differ in length"
assert len(valid_X) == len(valid_y), "validation features and labels differ in length"
train_X.shape

(5240, 327)

In [7]:
# Random forest: fit on the training split, report accuracy on the training
# and validation splits, then export the test-set predictions.
# random_state is fixed so that the sampled trees -- and with them the scores
# and the exported predictions -- are identical on every Restart & Run All.
clf1 = RandomForestClassifier(max_features = 100, min_samples_leaf = 20, random_state = 42)
clf1.fit(train_X, train_y)
train_y_hat = clf1.predict(train_X)
print("train accuracy: ", accuracy_score(train_y, train_y_hat))
valid_y_hat = clf1.predict(valid_X)
print("valid accuracy: ", accuracy_score(valid_y, valid_y_hat))
# NOTE: zero_division = 1 reports precision 1.00 for a genre the model never
# predicts; read such rows (precision 1.00, recall 0.00) as "no predictions",
# not as perfect precision.
print(classification_report(valid_y, valid_y_hat, zero_division = 1))
# Predicted genre for every test movie, keyed by movieId.
test_y = clf1.predict(test_X)
test_labels = pd.DataFrame({'movieId': test_features.movieId, 'genres': test_y})
test_labels.to_csv('./random-forest-labels.csv', index = False)

train accuracy:  0.533969465648855
valid accuracy:  0.3311036789297659
              precision    recall  f1-score   support

      Action       1.00      0.00      0.00         6
   Adventure       1.00      0.00      0.00         2
   Animation       1.00      0.00      0.00         3
    Children       1.00      0.00      0.00         3
      Comedy       0.36      0.55      0.44        38
       Crime       1.00      0.00      0.00         5
 Documentary       0.71      0.28      0.40        18
       Drama       0.23      0.51      0.32        43
     Fantasy       1.00      0.22      0.36        18
   Film_Noir       1.00      0.00      0.00         4
      Horror       1.00      0.00      0.00         8
     Musical       0.00      0.00      0.00        10
     Mystery       1.00      0.00      0.00        18
     Romance       0.33      0.27      0.30        51
      Sci_Fi       0.65      0.69      0.67        16
    Thriller       0.25      0.57      0.34        28
         W

In [8]:
# Naive Bayes: fit on the training split, report accuracy on the training and
# validation splits, then export the test-set predictions.
# NOTE(review): BernoulliNB thresholds every input at its default
# binarize=0.0, so any real-valued columns (the avf/ivec features, if they are
# continuous -- verify) are reduced to "greater than zero or not".
clf2 = BernoulliNB()
clf2.fit(train_X, train_y)
train_y_hat = clf2.predict(train_X)
print("train acc: ", accuracy_score(train_y, train_y_hat))
valid_y_hat = clf2.predict(valid_X)
print("valid acc: ", accuracy_score(valid_y, valid_y_hat))
# NOTE: zero_division = 1 reports precision 1.00 for a genre the model never
# predicts; read such rows (precision 1.00, recall 0.00) as "no predictions",
# not as perfect precision.
print(classification_report(valid_y, valid_y_hat, zero_division = 1))
# Predicted genre for every test movie, keyed by movieId.
test_y = clf2.predict(test_X)
test_labels = pd.DataFrame({'movieId': test_features.movieId, 'genres': test_y})
# File name follows the same "<model>-labels.csv" pattern as the other models'
# exports (it was misspelled "lebels").
test_labels.to_csv('./naive-bayes-labels.csv', index = False)

train acc:  0.36393129770992366
valid acc:  0.3311036789297659
              precision    recall  f1-score   support

      Action       0.00      0.00      0.00         6
   Adventure       1.00      0.00      0.00         2
   Animation       0.00      0.00      0.00         3
    Children       0.20      0.33      0.25         3
      Comedy       0.30      0.76      0.43        38
       Crime       0.00      0.00      0.00         5
 Documentary       0.38      0.78      0.51        18
       Drama       0.29      0.26      0.27        43
     Fantasy       0.47      0.50      0.49        18
   Film_Noir       0.12      0.25      0.17         4
      Horror       0.00      0.00      0.00         8
     Musical       1.00      0.10      0.18        10
     Mystery       1.00      0.06      0.11        18
     Romance       0.62      0.16      0.25        51
      Sci_Fi       0.67      0.38      0.48        16
    Thriller       0.28      0.57      0.37        28
         War      

In [9]:
# Neural network (three tanh hidden layers of 80 units): fit on the training
# split, report accuracy on the training and validation splits, then export
# the test-set predictions.
# random_state is fixed so that weight initialisation and batch shuffling --
# and with them the scores and the exported predictions -- are identical on
# every Restart & Run All.
clf4 = MLPClassifier(hidden_layer_sizes=(80, 80, 80), activation ='tanh', solver = 'adam', max_iter = 500, random_state = 42)
clf4.fit(train_X, train_y)
train_y_hat = clf4.predict(train_X)
print("train acc: ", accuracy_score(train_y, train_y_hat))
valid_y_hat = clf4.predict(valid_X)
# A training accuracy far above the validation accuracy (about 0.92 vs 0.31 in
# the recorded run) means the network is overfitting the training split.
print("valid acc: ", accuracy_score(valid_y, valid_y_hat))
# NOTE: zero_division = 1 reports precision 1.00 for a genre the model never
# predicts; read such rows as "no predictions", not as perfect precision.
print(classification_report(valid_y, valid_y_hat, zero_division = 1))
# Predicted genre for every test movie, keyed by movieId.
test_y = clf4.predict(test_X)
test_labels = pd.DataFrame({'movieId': test_features.movieId, 'genres': test_y})
test_labels.to_csv('./neural-network-labels.csv', index = False)

train acc:  0.9230916030534351
valid acc:  0.31438127090301005
              precision    recall  f1-score   support

      Action       0.17      0.17      0.17         6
   Adventure       0.00      0.00      0.00         2
   Animation       0.50      0.33      0.40         3
    Children       0.25      0.67      0.36         3
      Comedy       0.34      0.32      0.33        38
       Crime       0.14      0.60      0.23         5
 Documentary       0.53      0.56      0.54        18
       Drama       0.33      0.33      0.33        43
     Fantasy       0.33      0.17      0.22        18
   Film_Noir       0.25      0.25      0.25         4
      Horror       0.00      0.00      0.00         8
     Musical       0.14      0.20      0.17        10
     Mystery       0.15      0.11      0.13        18
     Romance       0.38      0.37      0.38        51
      Sci_Fi       0.38      0.38      0.38        16
    Thriller       0.44      0.43      0.44        28
         War      