In [1]:
import pandas as pd
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
)
# ML models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# metrics
from sklearn.metrics import classification_report, f1_score

from os import cpu_count

In [2]:
def split_feature_label(df, label_col):
    """Separate a frame into its feature columns and its label column.

    Returns a ``(features, labels)`` pair: ``features`` is ``df`` without
    ``label_col`` and ``labels`` is ``df[label_col]``. ``df`` itself is not
    modified.
    """
    features = df.drop(columns=label_col)
    labels = df[label_col]
    return features, labels

In [3]:
def test_lr_on(df_train, df_test, feature_col):
    """Fit an L1-penalised logistic regression and return its test accuracy.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing the feature columns together with the target column.
    feature_col : str
        Name of the *target* (label) column, despite the parameter name: it is
        split off from both frames and every remaining column is a feature.

    Returns
    -------
    float
        Mean accuracy on ``df_test`` as reported by ``LogisticRegression.score``.
    """
    X_train, y_train = split_feature_label(df_train, feature_col)
    X_test, y_test = split_feature_label(df_test, feature_col)
    # penalty='l1' is not supported by 'lbfgs', the default solver since
    # scikit-learn 0.22, so relying on the default raises ValueError there.
    # 'liblinear' was the previous default and supports L1, so naming it
    # explicitly keeps the model unchanged and works across versions.
    lr = LogisticRegression(
        random_state=42, C=0.69, penalty='l1', solver='liblinear'
    )
    lr.fit(X_train, y_train)
    return lr.score(X_test, y_test)

In [4]:
def test_rf_on(df_train, df_test, feature_col):
    """Fit a random forest on ``df_train`` and return its accuracy on ``df_test``.

    ``feature_col`` names the column that is split off as the target (it is
    the label column, despite the parameter name); all remaining columns are
    used as features. The forest uses 100 trees of depth at most 10 with a
    fixed ``random_state`` of 42.
    """
    train_features, train_labels = split_feature_label(df_train, feature_col)
    test_features, test_labels = split_feature_label(df_test, feature_col)
    forest = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
    )
    forest.fit(train_features, train_labels)
    accuracy = forest.score(test_features, test_labels)
    return accuracy

In [14]:
# Train/test CSV pairs for the 50-, 100- and 500-feature variants (per the
# file names), each indexed by the 'id_of_the_film' column. Files are read in
# the order listed and unpacked into the matching names.
(
    df_50_train, df_50_test,
    df_100_train, df_100_test,
    df_500_train, df_500_test,
) = (
    pd.read_csv(csv_name, index_col='id_of_the_film')
    for csv_name in (
        "train_compressed_to_50_features.csv",
        "test_compressed_to_50_features.csv",
        "train_compressed_to_100_features.csv",
        "test_compressed_to_100_features.csv",
        "train_compressed_to_500_features.csv",
        "test_compressed_to_500_features.csv",
    )
)

In [18]:
# Logistic-regression accuracy for each compressed train/test pair, in the
# order 50, 100, 500 features.
for frames in (
    (df_50_train, df_50_test),
    (df_100_train, df_100_test),
    (df_500_train, df_500_test),
):
    print(test_lr_on(*frames, 'genre_of_the_film'))



0.08214285714285714
0.08452380952380953




0.10833333333333334


In [19]:
# Random-forest accuracy for each compressed train/test pair, in the order
# 50, 100, 500 features.
for frames in (
    (df_50_train, df_50_test),
    (df_100_train, df_100_test),
    (df_500_train, df_500_test),
):
    print(test_rf_on(*frames, 'genre_of_the_film'))

0.09404761904761905
0.10833333333333334
0.0880952380952381


In [9]:
# Train/test CSV pairs from the svd/ directory for the 50, 100 and 500
# variants (per the file names). The 'Unnamed: 0' column is used as the index.
# Files are read in the order listed and unpacked into the matching names.
(
    df_svd_50_train, df_svd_50_test,
    df_svd_100_train, df_svd_100_test,
    df_svd_500_train, df_svd_500_test,
) = (
    pd.read_csv(csv_name, index_col='Unnamed: 0')
    for csv_name in (
        "svd/movie_train_bag_of_words_with_svd_50.csv",
        "svd/movie_test_bag_of_words_with_svd_50.csv",
        "svd/movie_train_bag_of_words_with_svd_100.csv",
        "svd/movie_test_bag_of_words_with_svd_100.csv",
        "svd/movie_train_bag_of_words_with_svd_500.csv",
        "svd/movie_test_bag_of_words_with_svd_500.csv",
    )
)

In [10]:
# Column name handed to test_lr_on / test_rf_on as the target column when
# scoring the df_svd_* frames.
label = 'Label'

In [11]:
# Logistic-regression accuracy for each SVD train/test pair, in the order
# 50, 100, 500.
for frames in (
    (df_svd_50_train, df_svd_50_test),
    (df_svd_100_train, df_svd_100_test),
    (df_svd_500_train, df_svd_500_test),
):
    print(test_lr_on(*frames, label))



0.2357142857142857
0.2369047619047619




0.2369047619047619


In [12]:
# Random-forest accuracy for each SVD train/test pair, in the order
# 50, 100, 500.
for frames in (
    (df_svd_50_train, df_svd_50_test),
    (df_svd_100_train, df_svd_100_test),
    (df_svd_500_train, df_svd_500_test),
):
    print(test_rf_on(*frames, label))

0.3547619047619048
0.3607142857142857
0.35833333333333334


In [13]:
# Train/test frames from the movies_for_svd_* CSVs, indexed by
# 'id_of_the_film'; the train file is read first.
df_train, df_test = (
    pd.read_csv(csv_name, index_col='id_of_the_film')
    for csv_name in (
        "movies_for_svd_train.csv",
        "movies_for_svd_test.csv",
    )
)

In [17]:
%%time
print("RF:" + str(test_rf_on(df_train, df_test, 'genre_of_the_film')))
print("LR:" + str(test_lr_on(df_train, df_test, 'genre_of_the_film')))

RF:0.4035714285714286




LR:0.40714285714285714
Wall time: 9.5 s


In [16]:
# Show the first rows of the training frame loaded from movies_for_svd_train.csv.
df_train.head()

Unnamed: 0_level_0,00,000,000th,003,01,02,03,033,04,047,...,綿谷,賢治,青木ヶ原,나그네,달수,애꾸눈,잎싹,초록,초록머리,genre_of_the_film
id_of_the_film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15387,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,western
16801,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,fantasy
28796,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,drama
570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,fantasy
14382,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,horror
