In [39]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import cv2
import os
from nltk import wordpunct_tokenize
import re
import nn
import matplotlib.pyplot as plt
import tensorflow as tf
from keras import backend as K
from tensorflow.keras import layers, models


# Metric

In [93]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [40]:
users = pd.read_csv('./dataset/users.dat', sep='::',
                        engine='python',
                        names=['userid', 'gender', 'age', 'occupation', 'zip']).set_index('userid')
ratings = pd.read_csv('./dataset/ratings.dat', engine='python',
                          sep='::', names=['userid', 'movieid', 'rating', 'timestamp'])
movies_train = pd.read_csv('./dataset/movies_train.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='latin-1', index_col=False).set_index('movieid')
movies_test = pd.read_csv('./dataset/movies_test.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='latin-1', index_col=False).set_index('movieid')                         
movies_train['genre'] = movies_train.genre.str.split('|')
movies_test['genre'] = movies_test.genre.str.split('|')

users.age = users.age.astype('category')
users.gender = users.gender.astype('category')
users.occupation = users.occupation.astype('category')
ratings.movieid = ratings.movieid.astype('category')
ratings.userid = ratings.userid.astype('category')


In [3]:
users

Unnamed: 0_level_0,gender,age,occupation,zip
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,02460
5,M,25,20,55455
...,...,...,...,...
6036,F,25,15,32603
6037,F,45,1,76006
6038,F,56,1,14706
6039,F,45,0,01060


In [4]:
ratings



Unnamed: 0,userid,movieid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [5]:
movies_train


Unnamed: 0_level_0,title,genre
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1
1650,Washington Square (1997),[Drama]
185,"Net, The (1995)","[Sci-Fi, Thriller]"
1377,Batman Returns (1992),"[Action, Adventure, Comedy, Crime]"
3204,"Boys from Brazil, The (1978)",[Thriller]
1901,Dear Jesse (1997),[Documentary]
...,...,...
2539,Analyze This (1999),[Comedy]
3038,"Face in the Crowd, A (1957)",[Drama]
1832,Heaven's Burning (1997),"[Action, Drama]"
657,Yankee Zulu (1994),"[Comedy, Drama]"


In [41]:
df_1 = pd.merge(movies_train,ratings,how='inner',on='movieid')
df_2 = pd.merge(movies_test,ratings,how='inner',on='movieid')

df_1.shape

(817424, 6)

In [72]:
ratings[ratings["movieid"]==794]

Unnamed: 0,userid,movieid,rating,timestamp


In [42]:
data1 = pd.merge(df_1,users,how='inner',on='userid')

In [58]:
data1

Unnamed: 0,movieid,title,genre,userid,rating,timestamp,gender,age,occupation,zip
0,1650,Washington Square (1997),[Drama],5,3,978245314,1,25,20,55455
1,3163,Topsy-Turvy (1999),[Drama],5,5,978244852,1,25,20,55455
2,593,"Silence of the Lambs, The (1991)","[Drama, Thriller]",5,4,978244177,1,25,20,55455
3,3046,Incredibly True Adventure of Two Girls in Love...,"[Comedy, Romance]",5,3,978244962,1,25,20,55455
4,1485,Liar Liar (1997),[Comedy],5,3,978246576,1,25,20,55455
...,...,...,...,...,...,...,...,...,...,...
817419,2841,Stir of Echoes (1999),[Thriller],5967,4,956968849,1,50,16,73069-5429
817420,2908,Boys Don't Cry (1999),[Drama],5967,5,956968771,1,50,16,73069-5429
817421,2628,Star Wars: Episode I - The Phantom Menace (1999),"[Action, Adventure, Fantasy, Sci-Fi]",5967,3,956968880,1,50,16,73069-5429
817422,1499,Anaconda (1997),"[Action, Adventure, Thriller]",5967,4,956968663,1,50,16,73069-5429


# Test

In [56]:
dataMapping={
    "M":1,
    "F":2
}

In [57]:
data2 = pd.merge(df_2,users,how='inner',on='userid')

In [58]:
data2["gender"]=data2["gender"].map(dataMapping)

# Train

In [59]:
from sklearn.neighbors import KNeighborsClassifier,KDTree

In [60]:
data1["gender"]=data1["gender"].map(dataMapping)

In [61]:
x_train=data1.drop_duplicates(subset="userid",ignore_index=True)

In [62]:
x_train=x_train.drop(["movieid","title","timestamp","zip"],axis=1)

In [63]:
list_train=["rating","gender","age"]
x_train[list_train]

Unnamed: 0,rating,gender,age
0,3,1,25
1,1,1,1
2,5,1,56
3,4,1,18
4,4,2,45
...,...,...,...
6035,3,2,35
6036,3,1,18
6037,5,1,18
6038,1,1,25


In [57]:
x_train

Unnamed: 0,genre,userid,rating,gender,age,occupation
0,[Drama],5,3,1,25,20
1,[Drama],19,1,1,1,10
2,[Drama],124,5,1,56,7
3,[Drama],136,4,1,18,2
4,[Drama],187,4,2,45,1
...,...,...,...,...,...,...
6035,[Horror],642,3,2,35,1
6036,[Comedy],5919,3,1,18,0
6037,"[Action, Adventure]",311,5,1,18,4
6038,"[Horror, Thriller]",5828,1,1,25,17


In [64]:
data2

Unnamed: 0,movieid,title,genre,userid,rating,timestamp,gender,age,occupation,zip
0,3397,"Great Muppet Caper, The (1981)","[Children's, Comedy]",18,5,978155207,2,18,3,95825
1,2989,For Your Eyes Only (1981),[Action],18,5,978153344,2,18,3,95825
2,3418,Thelma & Louise (1991),"[Action, Drama]",18,1,978153104,2,18,3,95825
3,1246,Dead Poets Society (1989),[Drama],18,5,978156549,2,18,3,95825
4,485,Last Action Hero (1993),"[Action, Comedy]",18,2,978153854,2,18,3,95825
...,...,...,...,...,...,...,...,...,...,...
182780,1088,Dirty Dancing (1987),"[Musical, Romance]",4803,5,962937234,2,25,16,45750
182781,898,"Philadelphia Story, The (1940)","[Comedy, Romance]",4752,5,963182886,2,18,4,94121
182782,1066,Shall We Dance? (1937),"[Comedy, Musical, Romance]",4752,4,963182908,2,18,4,94121
182783,648,Mission: Impossible (1996),"[Action, Adventure, Mystery]",898,5,975216799,1,25,7,77005


In [65]:
with open('./dataset/genres.txt', 'r') as f:
    genre_all = f.readlines()
    genre_all = [x.replace('\n','') for x in genre_all]
genre2idx = {genre:idx for idx, genre in enumerate(genre_all)}

In [66]:
data2_unique=movies_test.copy()
# data2[data2["movieid"]==data2_unique["movieid"].iloc[:1]]
data2_unique.reset_index(inplace=True)
# data2_each=data2[data2["movieid"]==int(data2_unique["movieid"].iloc[0])]

In [67]:
data1_unique=movies_train.copy()
# data2[data2["movieid"]==data2_unique["movieid"].iloc[:1]]
data1_unique.reset_index(inplace=True)
# data1_each=data1[data1["movieid"]==int(data1_unique["movieid"].iloc[0])]

In [68]:
data1_unique.reset_index(inplace=True)
vectors_genre=[]
for genre in data1_unique.genre.tolist():
    genre_vector = np.zeros(len(genre2idx))
    for g in genre:
        genre_vector[genre2idx[g]] = 1
    vectors_genre.append(genre_vector)
data1_unique['genre_vectors']=vectors_genre


In [69]:
vectors_genre=[]
for genre in data2_unique.genre.tolist():
    genre_vector = np.zeros(len(genre2idx))
    for g in genre:
        genre_vector[genre2idx[g]] = 1
    vectors_genre.append(genre_vector)
data2_unique['genre_vectors']=vectors_genre


# Train

In [70]:
vectors_genre1=[]
vectors_labels=[]
tree = KDTree(x_train[list_train], leaf_size=2) 
for i in range(len(data1_unique)):
    data1_each=data1[data1["movieid"]==int(data1_unique["movieid"].iloc[i])]
    data1_each=data1_each[(data1_each["rating"]==5) | (data1_each["rating"]==4) ]
    if len(data1_each)!=0:##get movies having rating
        dist, ind_train = tree.query(data1_each[list_train], k=5)
        x_train_new1=x_train.iloc[np.ravel(ind_train)]
        x_train_new_unique1=x_train_new1.drop_duplicates(subset="userid",ignore_index=True)
        x_train_final1=data1[data1["userid"].isin(x_train_new_unique1["userid"])] 
        x_train_final_high1=x_train_final1[(x_train_final1["rating"]==5) | (x_train_final1["rating"]==4)  ]
        genre_vector1 = np.zeros(len(genre2idx))
        for genre in x_train_final_high1.genre.tolist():
            for g in genre:
                genre_vector1[genre2idx[g]] += 1
        vectors_genre1.append(genre_vector1.astype(int))
        vectors_labels.append(data1_unique["genre_vectors"].iloc[i].tolist())

In [71]:
vectors_labels=np.array(vectors_labels)

In [72]:
vectors_genre_table=pd.DataFrame(vectors_genre1)

In [73]:
from sklearn.preprocessing import StandardScaler,RobustScaler,MinMaxScaler,MaxAbsScaler,PowerTransformer,QuantileTransformer
scaler=MinMaxScaler()
vectors_train=scaler.fit_transform(vectors_genre_table)
    

In [74]:
vectors_labels=pd.DataFrame(vectors_labels)

In [75]:
vectors_labels

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2814,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2815,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [121]:
def apk(actual, predicted, k=10):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [76]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,accuracy_score
from imblearn.over_sampling import BorderlineSMOTE

In [255]:
model_rating=[]
hype_param_best=[]
for label in range(vectors_labels.shape[1]):
    logreg = LogisticRegression(C=0.5)
    smote=BorderlineSMOTE(random_state=27,k_neighbors=5)
    smote_x_train,smote_y_train=smote.fit_resample(vectors_train,vectors_labels[label])
    print('... Processing {}'.format(label))
    # train the model using X_dtm & y
    logreg.fit(smote_x_train, smote_y_train)
    model_rating.append(logreg)
    # compute the training accuracy
    y_pred_X = logreg.predict_proba(smote_x_train)[:,1]
    y_pred_X1 = logreg.predict(smote_x_train)
    print('Training accuracy(before) is {}'.format(f1_score(smote_y_train, y_pred_X1)))
    hype_param=0
    interval=0.01
    maxScore=0
    for i in range(990):
        interval+=0.001
        y_pred_new=(y_pred_X>=interval).astype(int)
        temp_score=f1_score(smote_y_train,y_pred_new)
        if temp_score>maxScore:
            maxScore=temp_score
            hype_param=interval

    print(f"{label}: maxScore: {maxScore} best T:{hype_param}")

    hype_param_best.append(hype_param)

    print('Training accuracy(after) is {}'.format(f1_score(smote_y_train, (y_pred_X>=hype_param).astype(int))))

... Processing 0
Training accuracy(before) is 0.648598820058997
0: maxScore: 0.6911290322580645 best T:0.3930000000000003
Training accuracy(after) is 0.6911290322580645
... Processing 1
Training accuracy(before) is 0.6441933691424339
1: maxScore: 0.6727688787185355 best T:0.34600000000000025
Training accuracy(after) is 0.6727688787185355
... Processing 2
Training accuracy(before) is 0.7375690607734807
2: maxScore: 0.7494363929146538 best T:0.4280000000000003
Training accuracy(after) is 0.7494363929146538
... Processing 3
Training accuracy(before) is 0.7760057992026095
3: maxScore: 0.7809319764327799 best T:0.4860000000000004
Training accuracy(after) is 0.7809319764327799
... Processing 4
Training accuracy(before) is 0.7399656946826758
4: maxScore: 0.7406137493571061 best T:0.4990000000000004
Training accuracy(after) is 0.7406137493571061
... Processing 5
Training accuracy(before) is 0.5691226243165842
5: maxScore: 0.6724351050679852 best T:0.3800000000000003
Training accuracy(after) is

In [306]:
submission_rating_train=pd.DataFrame()
for i in range(vectors_labels.shape[1]):
    y_pred_X = model_rating[i].predict_proba(vectors_train)[:,1]
    submission_rating_train[i]=y_pred_X
    print('Training accuracy is {}'.format(f1_score(vectors_labels[i], (y_pred_X>=hype_param_best[i]).astype(int))))

Training accuracy is 0.12421383647798745
Training accuracy is 0.2393635420269803
Training accuracy is 0.06997558991049634
Training accuracy is 0.2836990595611285
Training accuracy is 0.2074688796680498
Training accuracy is 0.4838800112139052
Training accuracy is 0.11806375442739081
Training accuracy is 0.16242155777039496
Training accuracy is 0.041212121212121214
Training accuracy is 0.15584415584415584
Training accuracy is 0.27841168416248285
Training accuracy is 0.5964517037454239
Training accuracy is 0.06716417910447761
Training accuracy is 0.1416361416361416
Training accuracy is 0.32265073329712113
Training accuracy is 0.09135399673735727
Training accuracy is 0.12031047865459248
Training accuracy is 0.23460410557184752


In [313]:
sorted_prediction__trainids = np.argsort(-submission_rating_train,axis=1)
top_10_prediction_trainids = sorted_prediction__trainids[:,:5]

In [309]:
def get_column_names_train(row):
    return list(vectors_labels.columns[row == 1])
vectors_labels_new=vectors_labels.apply(get_column_names_train,axis=1).tolist()

In [314]:
mapk(vectors_labels_new,top_10_prediction_trainids,k=5)

0.26427279177013124

## Test movie_test

In [77]:
vectors_genre_test=[]
vectors_labels_test=[]
tree = KDTree(x_train[list_train], leaf_size=2) 
for i in range(len(data2_unique)):
    data2_each=data2[data2["movieid"]==int(data2_unique["movieid"].iloc[i])]
    data2_each=data2_each[(data2_each["rating"]==5) | (data2_each["rating"]==4) ]
    if len(data2_each)!=0:##get movies having rating
        dist, ind_train = tree.query(data2_each[list_train], k=5)
        x_train_new=x_train.iloc[np.ravel(ind_train)]
        x_train_new_unique=x_train_new.drop_duplicates(subset="userid",ignore_index=True)
        x_train_final=data1[data1["userid"].isin(x_train_new_unique["userid"])] 
        x_train_final_high=x_train_final[(x_train_final["rating"]==5) | (x_train_final["rating"]==4)  ]
        genre_vector = np.zeros(len(genre2idx))
        for genre in x_train_final_high.genre.tolist():
            for g in genre:
                genre_vector[genre2idx[g]] += 1
        vectors_genre_test.append(genre_vector.astype(int))
        vectors_labels_test.append(data2_unique["genre_vectors"].iloc[i].tolist())

In [None]:
vectors_genre_test

In [78]:
vectors_labels_test=np.array(vectors_labels_test)
scaler_test=MinMaxScaler()
vectors_test=scaler_test.fit_transform(vectors_genre_test)
vectors_labels_test=pd.DataFrame(vectors_labels_test)

In [264]:
submission_rating=pd.DataFrame()
for i in range(vectors_labels_test.shape[1]):
    y_pred_X = model_rating[i].predict_proba(vectors_test)[:,1]
    submission_rating[i]=y_pred_X
    print('Training accuracy is {}'.format(f1_score(vectors_labels_test[i], (y_pred_X>=hype_param_best[i]).astype(int))))


Training accuracy is 0.09105691056910571
Training accuracy is 0.28415300546448086
Training accuracy is 0.044117647058823525
Training accuracy is 0.2395543175487465
Training accuracy is 0.17846153846153845
Training accuracy is 0.48577680525164113
Training accuracy is 0.07627118644067797
Training accuracy is 0.12982998454404945
Training accuracy is 0.03773584905660377
Training accuracy is 0.17808219178082194
Training accuracy is 0.25585585585585585
Training accuracy is 0.5832402234636872
Training accuracy is 0.05882352941176471
Training accuracy is 0.08139534883720931
Training accuracy is 0.2929061784897025
Training accuracy is 0.09688581314878894
Training accuracy is 0.10666666666666666
Training accuracy is 0.23529411764705882


In [318]:
sorted_prediction_ids = np.argsort(-submission_rating,axis=1)
top_10_prediction_ids = sorted_prediction_ids[:,:5]

In [302]:
def get_column_names(row):
    return list(vectors_labels_test.columns[row == 1])
vectors_labels_test_new=vectors_labels_test.apply(get_column_names,axis=1).tolist()

In [319]:
mapk(vectors_labels_test_new,top_10_prediction_ids,k=5)

0.2008636788048553

# Package code

In [None]:
import sys
import importlib
sys.path.insert(0, './params')
sys.path.insert(1, './metrics')
import param_rating
import map_at_k
importlib.reload(param_rating)
importlib.reload(map_at_k)
import pickle
from sklearn.neighbors import KDTree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.preprocessing import MinMaxScaler

In [137]:
class ModelByRating:
    def __init__(self,movie_train,movie_test,user,rating):
        self.movie_train=movie_train
        self.movie_test=movie_test
        self.user=user
        self.rating=rating
        df_1 = pd.merge(movie_train,rating,how='inner',on='movieid')
        df_2 = pd.merge(movie_test,rating,how='inner',on='movieid')
        self.data1=pd.merge(df_1,user,how='inner',on='userid')
        self.data2 = pd.merge(df_2,user,how='inner',on='userid')
        self.data1_unique=self.movie_train.copy()
        self.data1_unique.reset_index(inplace=True)
        self.data2_unique=self.movie_test.copy()
        self.data2_unique.reset_index(inplace=True)
        self.dataMapping={
            "M":1,
            "F":2
        }
        # Convert gender data into numeric values
        self.data1["gender"]=self.data1["gender"].map(self.dataMapping)
        
        self.x_train=self.data1.drop_duplicates(subset="userid",ignore_index=True)
        self.x_train=self.x_train.drop(["movieid","title","timestamp","zip"],axis=1)
        #Condition
        self.isTrained=False
        self.isPredict=False
        self.isPreprocess=False
    def __preprocess_data_train(self):
        vectors_genre=[]
        for genre in self.data1_unique.genre.tolist():
            genre_vector = np.zeros(len(param_rating.genre2idx))
            for g in genre:
                genre_vector[param_rating.genre2idx[g]] = 1
            vectors_genre.append(genre_vector)
        self.data1_unique['genre_vectors']=vectors_genre

        self.vectors_genre1=[]
        self.vectors_labels=[]
        tree = KDTree(self.x_train[param_rating.list_train], leaf_size=2) 
        for i in range(len(self.data1_unique)):
            data1_each=self.data1[self.data1["movieid"]==int(self.data1_unique["movieid"].iloc[i])]
            data1_each=data1_each[(data1_each["rating"]==5) | (data1_each["rating"]==4) ]
            if len(data1_each)!=0:##get movies having rating
                _, ind_train = tree.query(data1_each[param_rating.list_train], k=5)
                x_train_new1=self.x_train.iloc[np.ravel(ind_train)]
                x_train_new_unique1=x_train_new1.drop_duplicates(subset="userid",ignore_index=True)
                x_train_final1=self.data1[self.data1["userid"].isin(x_train_new_unique1["userid"])] 
                x_train_final_high1=x_train_final1[(x_train_final1["rating"]==5) | (x_train_final1["rating"]==4)]
                genre_vector1 = np.zeros(len(param_rating.genre2idx))
                for genre in x_train_final_high1.genre.tolist():
                    for g in genre:
                        genre_vector1[param_rating.genre2idx[g]] += 1
                self.vectors_genre1.append(genre_vector1.astype(int))
                self.vectors_labels.append(self.data1_unique["genre_vectors"].iloc[i].tolist())
        
        self.vectors_labels=pd.DataFrame(self.vectors_labels)
        vectors_genre_table=pd.DataFrame(self.vectors_genre1)
        scaler=MinMaxScaler()
        self.vectors_train=scaler.fit_transform(vectors_genre_table)
    def __preprocess_data_test(self):
        self.data2["gender"]=self.data2["gender"].map(self.dataMapping)
        vectors_genre=[]
        for genre in self.data2_unique.genre.tolist():
            genre_vector = np.zeros(len(param_rating.genre2idx))
            for g in genre:
                genre_vector[param_rating.genre2idx[g]] = 1
            vectors_genre.append(genre_vector)
        self.data2_unique['genre_vectors']=vectors_genre

        self.vectors_genre_test=[]
        self.vectors_labels_test=[]
        tree = KDTree(self.x_train[param_rating.list_train], leaf_size=2) 
        for i in range(len(self.data2_unique)):
            data2_each=self.data2[self.data2["movieid"]==int(self.data2_unique["movieid"].iloc[i])]
            data2_each=data2_each[(data2_each["rating"]==5) | (data2_each["rating"]==4) ]
            if len(data2_each)!=0:##get movies having rating
                _, ind_train = tree.query(data2_each[param_rating.list_train], k=5)
                x_train_new=x_train.iloc[np.ravel(ind_train)]
                x_train_new_unique=x_train_new.drop_duplicates(subset="userid",ignore_index=True)
                x_train_final=self.data1[self.data1["userid"].isin(x_train_new_unique["userid"])] 
                x_train_final_high=x_train_final[(x_train_final["rating"]==5) | (x_train_final["rating"]==4)]
                genre_vector = np.zeros(len(param_rating.genre2idx))
                for genre in x_train_final_high.genre.tolist():
                    for g in genre:
                        genre_vector[param_rating.genre2idx[g]] += 1
                self.vectors_genre_test.append(genre_vector.astype(int))
                self.vectors_labels_test.append(self.data2_unique["genre_vectors"].iloc[i].tolist())
        
        self.vectors_labels_test=np.array(self.vectors_labels_test)
        self.vectors_labels_test=pd.DataFrame(self.vectors_labels_test)
        scaler_test=MinMaxScaler()
        self.vectors_test=scaler_test.fit_transform(self.vectors_genre_test)
    def preprocess_data(self):
        self.isPreprocess=True

        self.__preprocess_data_train()
        self.__preprocess_data_test()
    def train_model(self):
        if self.isPreprocess==False:
            raise Exception('preprocess_data() needs to be proceeded first!')
        self.isTrained=True

        model_rating=[]
        hype_param_best=[]
        for label in range(self.vectors_labels.shape[1]):
            logreg = LogisticRegression(C=0.5)
            smote=BorderlineSMOTE(random_state=27,k_neighbors=5)
            smote_x_train,smote_y_train=smote.fit_resample(self.vectors_train,self.vectors_labels[label])
            print('... Processing {}'.format(label))
            # train the model using X_dtm & y
            logreg.fit(smote_x_train, smote_y_train)
            model_rating.append(logreg)
            # compute the training accuracy
            y_pred_X = logreg.predict_proba(smote_x_train)[:,1]
            y_pred_X1 = logreg.predict(smote_x_train)
            print('Training accuracy(before) is {}'.format(f1_score(smote_y_train, y_pred_X1)))
            hype_param=0
            interval=0.01
            maxScore=0
            for _ in range(990):
                interval+=0.001
                y_pred_new=(y_pred_X>=interval).astype(int)
                temp_score=f1_score(smote_y_train,y_pred_new)
                if temp_score>maxScore:
                    maxScore=temp_score
                    hype_param=interval
            print(f"{label}: maxScore: {maxScore} best T:{hype_param}")
            hype_param_best.append(hype_param)
            print('Training accuracy(after) is {}'.format(f1_score(smote_y_train, (y_pred_X>=hype_param).astype(int))))

        with open('./trained_model_params/modelByRating.pkl', 'wb') as file:
            pickle.dump(model_rating, file)
        with open('./trained_model_params/hyperparamByRating.pkl', 'wb') as file:
            pickle.dump(hype_param_best, file)
    def predict(self):
        if self.isTrained==False and self.isPreprocess==True:
            raise Exception('train_model() needs to be proceeded!')
        if self.isPreprocess==False:
            raise Exception('preprocess_data() needs to be proceeded first!')
        self.isPredict=True
        
        self.submission_rating=pd.DataFrame()
        hype_param_best=pickle.load(open('./trained_model_params/hyperparamByRating.pkl', 'rb'))
        model_rating=pickle.load(open('./trained_model_params/modelByRating.pkl','rb'))
        for i in range(self.vectors_labels_test.shape[1]):
            y_pred_X = model_rating[i].predict_proba(self.vectors_test)[:,1]
            self.submission_rating[i]=y_pred_X
            print('Training accuracy is {}'.format(f1_score(vectors_labels_test[i], (y_pred_X>=hype_param_best[i]).astype(int))))
    def get_column_names(self,row):
        return list(self.vectors_labels_test.columns[row == 1])
    def evaluate_model(self):
        if self.isTrained==False and self.isPreprocess==True:
            raise Exception('train_model() needs to be proceeded!')
        if self.isPreprocess==False:
            raise Exception('preprocess_data() needs to be proceeded first!')
        if self.isPredict==False:
            self.predict()
        sorted_prediction_ids = np.argsort(-self.submission_rating,axis=1)
        top_5_prediction_ids = sorted_prediction_ids[:,:5]
        vectors_labels_test_new=vectors_labels_test.apply(self.get_column_names,axis=1).tolist()
        print(mapk(vectors_labels_test_new,top_5_prediction_ids,k=5))


In [138]:
model1=ModelByRating(movies_train,movies_test,users,ratings)

In [140]:
model1.preprocess_data()

In [142]:
model1.train_model()

... Processing 0
Training accuracy(before) is 0.648598820058997
0: maxScore: 0.6911290322580645 best T:0.3930000000000003
Training accuracy(after) is 0.6911290322580645
... Processing 1
Training accuracy(before) is 0.6441933691424339
1: maxScore: 0.6727688787185355 best T:0.34600000000000025
Training accuracy(after) is 0.6727688787185355
... Processing 2
Training accuracy(before) is 0.7375690607734807
2: maxScore: 0.7494363929146538 best T:0.4280000000000003
Training accuracy(after) is 0.7494363929146538
... Processing 3
Training accuracy(before) is 0.7760057992026095
3: maxScore: 0.7809319764327799 best T:0.4860000000000004
Training accuracy(after) is 0.7809319764327799
... Processing 4
Training accuracy(before) is 0.7399656946826758
4: maxScore: 0.7406137493571061 best T:0.4990000000000004
Training accuracy(after) is 0.7406137493571061
... Processing 5
Training accuracy(before) is 0.5691226243165842
5: maxScore: 0.6724351050679852 best T:0.3800000000000003
Training accuracy(after) is

In [130]:
model1.predict()

Training accuracy is 0.09105691056910571
Training accuracy is 0.28415300546448086
Training accuracy is 0.044117647058823525
Training accuracy is 0.2395543175487465
Training accuracy is 0.17846153846153845
Training accuracy is 0.48577680525164113
Training accuracy is 0.07627118644067797
Training accuracy is 0.12982998454404945
Training accuracy is 0.03773584905660377
Training accuracy is 0.17808219178082194
Training accuracy is 0.25585585585585585
Training accuracy is 0.5832402234636872
Training accuracy is 0.05882352941176471
Training accuracy is 0.08139534883720931
Training accuracy is 0.2929061784897025
Training accuracy is 0.09688581314878894
Training accuracy is 0.10666666666666666
Training accuracy is 0.23529411764705882


In [131]:
model1.evaluate_model()

0.25033535636476817


In [147]:
list(genre2idx.keys())[list(genre2idx.values()).index(0)]

'Crime'

In [None]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=1.9) #LogisticRegression(C=1.44)
submission_binary = pd.DataFrame()
hype_param_best=[]

for label in range(vectors_labels.shape[1]):
    hype_param=0
    interval=0.01
    maxScore=0
    print('... Processing {}'.format(label))
    y=vectors_labels[label]
    y_pred=vectors_train.T[label]
    for i in range(990):
        interval+=0.001
        y_pred_new=(y_pred>=interval).astype(int)
        temp_score=f1_score(y,y_pred_new)
        if temp_score>maxScore:
            maxScore=temp_score
            hype_param=interval
    print(f"{label}: maxScore: {maxScore} best T:{hype_param}")
    hype_param_best.append(hype_param)


In [33]:
vectors_genre1=[]
genre_vector1 = np.zeros(len(genre2idx))
for genre in x_train_final_high1.genre.tolist():
    for g in genre:
        genre_vector1[genre2idx[g]] += 1
vectors_genre1.append(genre_vector1.astype(int))
vectors_genre1

[array([1037, 2600,  398,  752, 1741, 3809,   98, 1506,  304,  487, 2297,
        5927,  278,  614, 2823,  587,  905,  871])]

In [15]:
data2_each=data2_each[(data2_each["rating"]==5) | (data2_each["rating"]==4) ]

In [16]:
data1_each=data1_each[(data1_each["rating"]==5) | (data1_each["rating"]==4) ]

In [20]:
data2_each

Unnamed: 0,movieid,title,genre,userid,rating,timestamp,gender,age,occupation,zip
0,3397,"Great Muppet Caper, The (1981)","[Children's, Comedy]",18,5,978155207,2,18,3,95825
46,3397,"Great Muppet Caper, The (1981)","[Children's, Comedy]",31,4,978121584,1,56,7,06840
138,3397,"Great Muppet Caper, The (1981)","[Children's, Comedy]",44,5,978020416,1,45,17,98052
282,3397,"Great Muppet Caper, The (1981)","[Children's, Comedy]",92,4,986189305,2,18,4,44243
360,3397,"Great Muppet Caper, The (1981)","[Children's, Comedy]",101,5,977577431,2,18,3,33314
...,...,...,...,...,...,...,...,...,...,...
30108,3397,"Great Muppet Caper, The (1981)","[Children's, Comedy]",5957,4,957075694,1,18,20,02038
30143,3397,"Great Muppet Caper, The (1981)","[Children's, Comedy]",5964,5,956997297,1,18,5,97202
30282,3397,"Great Muppet Caper, The (1981)","[Children's, Comedy]",5978,4,964186018,1,35,1,49307
30400,3397,"Great Muppet Caper, The (1981)","[Children's, Comedy]",5990,4,956868789,2,25,20,90046


In [46]:
data1_each

Unnamed: 0,movieid,title,genre,userid,rating,timestamp,gender,age,occupation,zip
374,1650,Washington Square (1997),[Drama],124,5,977446128,1,56,7,91356
391,1650,Washington Square (1997),[Drama],136,4,977422364,1,18,2,21202
705,1650,Washington Square (1997),[Drama],187,4,977234181,2,45,1,94061
1045,1650,Washington Square (1997),[Drama],195,4,977454799,1,25,12,10458
2738,1650,Washington Square (1997),[Drama],590,4,975913510,2,35,6,98032
2870,1650,Washington Square (1997),[Drama],593,4,975900657,2,50,1,91711
5184,1650,Washington Square (1997),[Drama],1078,5,974939470,2,45,9,95661
6347,1650,Washington Square (1997),[Drama],1203,4,975174809,2,25,1,6417
7508,1650,Washington Square (1997),[Drama],1448,4,976206610,2,25,3,17522
9245,1650,Washington Square (1997),[Drama],1897,4,1037584922,2,50,2,94530


In [17]:
tree = KDTree(x_train[list_train], leaf_size=2) 

dist, ind = tree.query(data2_each[list_train], k=5)


In [18]:
dist, ind_train = tree.query(data1_each[list_train], k=5)

In [22]:
np.ravel(ind).shape

(1145,)

In [23]:
x_train

Unnamed: 0,genre,userid,rating,gender,age,occupation
0,[Drama],5,3,1,25,20
1,[Drama],19,1,1,1,10
2,[Drama],124,5,1,56,7
3,[Drama],136,4,1,18,2
4,[Drama],187,4,2,45,1
...,...,...,...,...,...,...
6035,[Horror],642,3,2,35,1
6036,[Comedy],5919,3,1,18,0
6037,"[Action, Adventure]",311,5,1,18,4
6038,"[Horror, Thriller]",5828,1,1,25,17


In [19]:
x_train_new=x_train.iloc[np.ravel(ind)]

In [56]:
x_train


Unnamed: 0,genre,userid,rating,gender,age,occupation
0,[Drama],5,3,1,25,20
1,[Drama],19,1,1,1,10
2,[Drama],124,5,1,56,7
3,[Drama],136,4,1,18,2
4,[Drama],187,4,2,45,1
...,...,...,...,...,...,...
6035,[Horror],642,3,2,35,1
6036,[Comedy],5919,3,1,18,0
6037,"[Action, Adventure]",311,5,1,18,4
6038,"[Horror, Thriller]",5828,1,1,25,17


In [55]:
x_train_new

Unnamed: 0,genre,userid,rating,gender,age,occupation
991,"[Action, Adventure, Comedy, Crime]",3349,5,2,18,4
1653,[Comedy],1697,5,2,18,4
1640,[Comedy],1535,5,2,18,20
876,"[Action, Adventure, Comedy, Crime]",2188,5,2,18,19
1611,[Comedy],1232,5,2,18,4
...,...,...,...,...,...,...
28,[Drama],3067,4,2,25,0
122,"[Sci-Fi, Thriller]",854,4,2,25,16
18,[Drama],1448,4,2,25,3
16,[Drama],1203,4,2,25,1


In [20]:
x_train_new1=x_train.iloc[np.ravel(ind_train)]

In [21]:
x_train_new_unique=x_train_new.drop_duplicates(subset="userid",ignore_index=True)


In [22]:
x_train_new_unique1=x_train_new1.drop_duplicates(subset="userid",ignore_index=True)

In [23]:
x_train_final=data1[data1["userid"].isin(x_train_new_unique["userid"])] 

In [24]:
x_train_final1=data1[data1["userid"].isin(x_train_new_unique1["userid"])] 

In [25]:
x_train_final_high=x_train_final[(x_train_final["rating"]==5) | (x_train_final["rating"]==4)  ]
x_train_final_low=x_train_final[(x_train_final["rating"]!=5) & (x_train_final["rating"]!=4) ]

In [26]:
x_train_final_high1=x_train_final1[(x_train_final1["rating"]==5) | (x_train_final1["rating"]==4)  ]

In [29]:
x_train_final_low["rating"].value_counts()

rating
3    8999
2    2934
1    1441
Name: count, dtype: int64

In [30]:
data[genre2idx.keys()].sum().astype(int).sum()

5178

In [31]:
cof=np.divide(3106, data[genre2idx.keys()].sum().astype(int))
cof

Crime          17.255556
Thriller        8.046632
Fantasy        50.918033
Horror         11.589552
Sci-Fi         13.622807
Comedy          3.259182
Documentary    32.020619
Adventure      13.217021
Film-Noir      81.736842
Animation      36.976190
Romance         8.238727
Drama           2.400309
Western        57.518519
Musical        30.752475
Action          7.520581
Mystery        35.295455
War            26.322034
Children's     15.300493
dtype: float64

In [32]:
vectors_genre=[]
genre_vector = np.zeros(len(genre2idx))
for genre in x_train_final_high.genre.tolist():
    for g in genre:
        genre_vector[genre2idx[g]] += 1
vectors_genre.append(genre_vector.astype(int))
vectors_genre

[array([1621, 4194,  830, 1215, 3261, 6476,  133, 2880,  406,  931, 3305,
        8099,  410, 1020, 5096,  859, 1471, 1697])]

In [34]:
vectors_genre_low=[]
genre_vector_low = np.zeros(len(genre2idx))
for genre in x_train_final_low.genre.tolist():
    for g in genre:
        genre_vector_low[genre2idx[g]] += 1
vectors_genre_low.append(genre_vector_low.astype(int))
vectors_genre_low

[array([ 913, 2506,  678, 1366, 2277, 4690,   48, 2048,  120,  503, 1927,
        4212,  294,  565, 3470,  502,  596, 1306])]

In [35]:
sum_array=np.subtract(vectors_genre,vectors_genre_low)
sum_array

array([[ 708, 1688,  152, -151,  984, 1786,   85,  832,  286,  428, 1378,
        3887,  116,  455, 1626,  357,  875,  391]])

In [36]:
vectors_genre*np.array(cof)

array([[27971.25555556, 33747.57512953, 42261.96721311, 14081.30597015,
        44423.97368421, 21106.45960126,  4258.74226804, 38065.0212766 ,
        33185.15789474, 34424.83333333, 27228.99204244, 19440.10355487,
        23582.59259259, 31367.52475248, 38324.88135593, 30318.79545455,
        38719.71186441, 25964.93596059]])

In [38]:
genre2idx

{'Crime': 0,
 'Thriller': 1,
 'Fantasy': 2,
 'Horror': 3,
 'Sci-Fi': 4,
 'Comedy': 5,
 'Documentary': 6,
 'Adventure': 7,
 'Film-Noir': 8,
 'Animation': 9,
 'Romance': 10,
 'Drama': 11,
 'Western': 12,
 'Musical': 13,
 'Action': 14,
 'Mystery': 15,
 'War': 16,
 "Children's": 17}

In [None]:
def normalize(values):
    mn = values.min()
    mx = values.max()
    return(10.0/(mx - mn) * (values - mx)+10)

In [None]:
movies_rating = ratings.groupby('movieid')['rating'].mean()
movies_rating

movieid
1       4.146846
2       3.201141
3       3.016736
4       2.729412
5       3.006757
          ...   
3948    3.635731
3949    4.115132
3950    3.666667
3951    3.900000
3952    3.780928
Name: rating, Length: 3706, dtype: float64

In [None]:
movies_rating_count1 = ratings.groupby('movieid')['rating'].count()
unique_movies_list = ratings.movieid.unique()
movies_rating_count = pd.DataFrame({'rating_count': movies_rating_count1[unique_movies_list]})

movies_rating_count.head()

Unnamed: 0_level_0,rating_count
movieid,Unnamed: 1_level_1
1193,1725
661,525
914,636
3408,1315
2355,1703


In [None]:
unreliability1 = ratings.groupby('movieid')['rating'].std(ddof = -1)
unique_movies_list = ratings.movieid.unique()
unreliability = pd.DataFrame({'unreliability': unreliability1[unique_movies_list]})
unreliability.head()

Unnamed: 0_level_0,unreliability
movieid,Unnamed: 1_level_1
1193,0.789066
661,1.021255
914,0.872481
3408,0.895206
2355,0.878768


In [None]:
data1 = pd.merge(data1,movies_rating_count,how='inner',on='movieid')
data2 = pd.merge(data2,movies_rating_count,how='inner',on='movieid')

data1

Unnamed: 0,movieid,title,genre,userid,rating,timestamp,gender,age,occupation,zip,rating_count
0,1650,Washington Square (1997),[Drama],5,3,978245314,M,25,20,55455,50
1,1650,Washington Square (1997),[Drama],19,1,978554433,M,1,10,48073,50
2,1650,Washington Square (1997),[Drama],124,5,977446128,M,56,7,91356,50
3,1650,Washington Square (1997),[Drama],136,4,977422364,M,18,2,21202,50
4,1650,Washington Square (1997),[Drama],187,4,977234181,F,45,1,94061,50
...,...,...,...,...,...,...,...,...,...,...,...
817419,3890,Back Stage (2000),[Documentary],2104,3,974652640,M,45,15,60035,1
817420,1579,For Ever Mozart (1996),[Drama],511,3,976206196,F,45,4,15232,1
817421,1915,Voyage to the Beginning of the World (1997),[Drama],2440,4,974221429,M,50,17,01430,1
817422,624,Condition Red (1995),"[Action, Drama, Thriller]",4874,4,962781918,F,25,4,70808,1


In [None]:
data1 = pd.merge(data1,unreliability,how='inner',on='movieid')
data2 = pd.merge(data2,unreliability,how='inner',on='movieid')
data1['ratings'] = normalize(data1[['rating']])
data2['ratings'] = normalize(data2[['rating']])
data1 = data1[data1['ratings']>2]
data2 = data1[data1['ratings']>2]
data1 = data1[data1['rating_count']>7]
data2 = data1[data1['rating_count']>7]
print(type(data1['title']))

<class 'pandas.core.series.Series'>


In [None]:
data_model_train = data1[['rating']]
data_model_train[['age']] = data1[['age']]
data_model_train[['occupation']] = data1[['occupation']]
#data_model_train['rating_count'] = data1['rating_count']
#data_model_train['unreliability'] = data1['unreliability']
data_model_train


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_model_train[['age']] = data1[['age']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_model_train[['occupation']] = data1[['occupation']]


Unnamed: 0,rating,age,occupation
0,3,25,20
2,5,56,7
3,4,18,2
4,4,45,1
5,4,25,12
...,...,...,...
817351,4,35,1
817352,3,45,2
817353,4,25,2
817355,4,35,0


In [None]:
data_model_test = data2[['rating']]
data_model_test[['age']] = data2[['age']]
data_model_test[['occupation']] = data2[['occupation']]
#data_model_test['rating_count'] = data2['rating_count']
#data_model_test['unreliability'] = data2['unreliability']
data_model_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_model_test[['age']] = data2[['age']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_model_test[['occupation']] = data2[['occupation']]


Unnamed: 0,rating,age,occupation
0,3,25,20
2,5,56,7
3,4,18,2
4,4,45,1
5,4,25,12
...,...,...,...
817351,4,35,1
817352,3,45,2
817353,4,25,2
817355,4,35,0


In [None]:
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
scaler_data1 = scale.fit_transform(data_model_train)
scaler_data1 = data_model_train

In [None]:
scaler_data2 = scale.transform(data_model_test)
scaler_data2 = data_model_test

In [None]:
with open('/content/drive/MyDrive/INT3405E Final project/ml1m/content/dataset/genres.txt', 'r') as f:
            genre_all = f.readlines()
            genre_all = [x.replace('\n','') for x in genre_all]
genre2idx = {genre:idx for idx, genre in enumerate(genre_all)}
genre_all


['Crime',
 'Thriller',
 'Fantasy',
 'Horror',
 'Sci-Fi',
 'Comedy',
 'Documentary',
 'Adventure',
 'Film-Noir',
 'Animation',
 'Romance',
 'Drama',
 'Western',
 'Musical',
 'Action',
 'Mystery',
 'War',
 "Children's"]

In [None]:

df1 = pd.DataFrame(False, index=range(len(data1['genre'])), columns=genre_all)
for i, genres in enumerate(data1['genre']):
    df1.loc[i, genres] = True


In [None]:
df2 = pd.DataFrame(False, index=range(len(data2['genre'])), columns=genre_all)
for i, genres in enumerate(data2['genre']):
    df2.loc[i, genres] = True

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test,y_train , y_test = train_test_split(scaler_data1, df1, test_size=0.3, random_state = 42)

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
print(y_train['Drama'])

(539882, 3)
(539882, 18)
(231378, 3)
(231378, 18)
707851    False
277662    False
624033    False
213218    False
656        True
          ...  
259178    False
365838    False
131932    False
671155     True
121958    False
Name: Drama, Length: 539882, dtype: bool


In [None]:
list_max_tree = []
list_acc_score = []
list_f1_score = []
for genre in genre_all:
    label1 = y_train[genre]
    label2 = y_test[genre]
    list_acc = []
    list_f1 = []
    list_tree = []
    max_acc = -1.0
    best_tree = None
    for i in range(5, 20):
        tree = DecisionTreeClassifier(max_depth=i)
        tree.fit(X_train, label1)
        y_pred = tree.predict(X_test)
        acc_score = accuracy_score(y_pred, label2)
        f1_scor = f1_score(y_pred, label2)
        list_acc.append(acc_score)
        list_f1.append(f1_scor)
        if acc_score > max_acc:
            max_acc = acc_score
            best_tree = tree
    temp = {genre:best_tree}
    print(temp)
    list_max_tree.append(temp)
    list_acc_score.append(list_acc)
    list_f1_score.append(list_f1)

{'Crime': DecisionTreeClassifier(max_depth=5)}
{'Thriller': DecisionTreeClassifier(max_depth=5)}
{'Fantasy': DecisionTreeClassifier(max_depth=5)}
{'Horror': DecisionTreeClassifier(max_depth=11)}
{'Sci-Fi': DecisionTreeClassifier(max_depth=5)}
{'Comedy': DecisionTreeClassifier(max_depth=11)}
{'Documentary': DecisionTreeClassifier(max_depth=5)}
{'Adventure': DecisionTreeClassifier(max_depth=5)}
{'Film-Noir': DecisionTreeClassifier(max_depth=5)}
{'Animation': DecisionTreeClassifier(max_depth=5)}
{'Romance': DecisionTreeClassifier(max_depth=5)}
{'Drama': DecisionTreeClassifier(max_depth=8)}
{'Western': DecisionTreeClassifier(max_depth=5)}
{'Musical': DecisionTreeClassifier(max_depth=5)}
{'Action': DecisionTreeClassifier(max_depth=8)}
{'Mystery': DecisionTreeClassifier(max_depth=5)}
{'War': DecisionTreeClassifier(max_depth=5)}
{"Children's": DecisionTreeClassifier(max_depth=5)}


In [None]:
print(list_max_tree)
print(list_acc_score)
print(list_f1_score)

[{'Crime': DecisionTreeClassifier(max_depth=5)}, {'Thriller': DecisionTreeClassifier(max_depth=5)}, {'Fantasy': DecisionTreeClassifier(max_depth=5)}, {'Horror': DecisionTreeClassifier(max_depth=11)}, {'Sci-Fi': DecisionTreeClassifier(max_depth=5)}, {'Comedy': DecisionTreeClassifier(max_depth=11)}, {'Documentary': DecisionTreeClassifier(max_depth=5)}, {'Adventure': DecisionTreeClassifier(max_depth=5)}, {'Film-Noir': DecisionTreeClassifier(max_depth=5)}, {'Animation': DecisionTreeClassifier(max_depth=5)}, {'Romance': DecisionTreeClassifier(max_depth=5)}, {'Drama': DecisionTreeClassifier(max_depth=8)}, {'Western': DecisionTreeClassifier(max_depth=5)}, {'Musical': DecisionTreeClassifier(max_depth=5)}, {'Action': DecisionTreeClassifier(max_depth=8)}, {'Mystery': DecisionTreeClassifier(max_depth=5)}, {'War': DecisionTreeClassifier(max_depth=5)}, {"Children's": DecisionTreeClassifier(max_depth=5)}]
[[0.9136434751791441, 0.9136434751791441, 0.9136434751791441, 0.9136434751791441, 0.91364347517

In [None]:
list_predict = pd.DataFrame()
for tree in list_max_tree:
    tree_name = list(tree.keys())[0]
    label = df2[tree_name]
    y_pred = tree[tree_name].predict(scaler_data2)
    list_predict[tree_name] = y_pred
print(accuracy_score(list_predict, df2))


0.003442418898944584
