In [1]:
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# read files
train_features = pd.read_csv('./data/train_features.tsv',sep='\t')
train_labels = pd.read_csv('./data/train_labels.tsv',sep='\t')
valid_features = pd.read_csv('./data/valid_features.tsv',sep='\t')
valid_labels = pd.read_csv('./data/valid_labels.tsv',sep='\t')
test_features = pd.read_csv('./data/test_features.tsv',sep='\t')

In [3]:
len(train_features),len(valid_features),len(test_features)

(5240, 299, 298)

In [4]:
train_labels.genres.unique()

array(['Thriller', 'Romance', 'Crime', 'Comedy', 'Musical', 'Documentary',
       'Drama', 'Adventure', 'War', 'Horror', 'Children', 'Film_Noir',
       'Sci_Fi', 'Mystery', 'Fantasy', 'Action', 'Western', 'Animation'],
      dtype=object)

In [5]:
# perform one-hot encoding to all the tags
tags = train_features.tag.str.split(',',expand=True).stack().unique()

In [6]:
# 1 if contains tags, o otherwise
for tag in tags:
    train_features[tag]=train_features.tag.apply(lambda x:1 if tag in x.split(',') else 0)
    valid_features[tag]=valid_features.tag.apply(lambda x:1 if tag in x.split(',') else 0)    
    test_features[tag]=test_features.tag.apply(lambda x:1 if tag in x.split(',') else 0)    

In [7]:
# extract features and transform to numpy array
# avf1-avf107 107
# ivec1-ivec20 20
# tags 200
# total 327 features
train_X = train_features.iloc[:,5:].to_numpy()
train_y = train_labels.genres.to_numpy()
valid_X = valid_features.iloc[:,5:].to_numpy()
valid_y = valid_labels.genres.to_numpy()
test_X = test_features.iloc[:,5:].to_numpy()
train_X.shape

(5240, 327)

In [8]:
# random forest
clf1 = RandomForestClassifier(max_features=100,min_samples_leaf=20)
clf1.fit(train_X,train_y)
train_y_hat = clf1.predict(train_X)
print("train acc: ",accuracy_score(train_y,train_y_hat))
valid_y_hat = clf1.predict(valid_X)
print("valid acc: ",accuracy_score(valid_y,valid_y_hat))
test_y = clf1.predict(test_X)
test_labels = pd.DataFrame()
test_labels['movieId']=test_features.movieId
test_labels['genres']=test_y
test_labels.to_csv('./rf.csv',index=False,sep='\t')

train acc:  0.5318702290076336
valid acc:  0.3377926421404682


In [9]:
#nave bayes
clf2 = BernoulliNB()
clf2.fit(train_X,train_y)
train_y_hat = clf2.predict(train_X)
print("train acc: ",accuracy_score(train_y,train_y_hat))
valid_y_hat = clf2.predict(valid_X)
print("valid acc: ",accuracy_score(valid_y,valid_y_hat))
test_y = clf1.predict(test_X)
test_labels = pd.DataFrame()
test_labels['movieId']=test_features.movieId
test_labels['genres']=test_y
test_labels.to_csv('./nb.csv',index=False,sep='\t')

train acc:  0.36393129770992366
valid acc:  0.3311036789297659


In [10]:
# decision tree
clf3 = DecisionTreeClassifier(min_samples_split=210)
clf3.fit(train_X,train_y)
train_y_hat = clf3.predict(train_X)
print("train acc: ",accuracy_score(train_y,train_y_hat))
valid_y_hat = clf3.predict(valid_X)
print("valid acc: ",accuracy_score(valid_y,valid_y_hat))
test_y = clf1.predict(test_X)
test_labels = pd.DataFrame()
test_labels['movieId']=test_features.movieId
test_labels['genres']=test_y
test_labels.to_csv('./dt.csv',index=False,sep='\t')

train acc:  0.3761450381679389
valid acc:  0.3612040133779264
