In [12]:
import requests
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
dtr_model_1 = DecisionTreeRegressor(random_state=1)
dtr_model_2 = DecisionTreeRegressor(criterion="friedman_mse",random_state=1)
dtr_model_3 = DecisionTreeRegressor(criterion="absolute_error",random_state=1)
dtr_model_4 = DecisionTreeRegressor(criterion="poisson",random_state=1)

forest_model_1 = RandomForestRegressor(n_estimators=50,criterion="absolute_error",random_state=1)
forest_model_2 = RandomForestRegressor(n_estimators=100,random_state=1)
forest_model_3 = RandomForestRegressor(n_estimators=100,criterion="friedman_mse",random_state=1)
forest_model_4 = RandomForestRegressor(n_estimators=100,criterion="absolute_error",random_state=1)
forest_model_5 = RandomForestRegressor(n_estimators=200,criterion="absolute_error",random_state=1)

dtr_models = [dtr_model_1,dtr_model_2,dtr_model_3,dtr_model_4]
forest_models = [forest_model_1,forest_model_2,forest_model_3,forest_model_4,forest_model_5]

encoder = LabelEncoder()

In [3]:
my_rating_df = pd.read_csv('ratings.csv')
my_rating_df.head()

Unnamed: 0,Date,Name,Year,Letterboxd URI,Rating
0,2021-11-29,Violet,2021,https://boxd.it/oMCE,4.0
1,2021-11-29,Hot Fuzz,2007,https://boxd.it/2416,3.5
2,2021-11-29,The World's End,2013,https://boxd.it/3EiO,3.0
3,2021-11-29,Scott Pilgrim vs. the World,2010,https://boxd.it/1Aq6,4.0
4,2021-11-29,La La Land,2016,https://boxd.it/a5fa,3.0


In [4]:
import json

with open("config.json","r") as file:
    config = json.load(file)

In [5]:
# TMDB API requests
API_KEY = config['TMDB_API_KEY']
# Selecionando a series contendo os nomes de filmes do meu dataframe
my_filmes_name = my_rating_df['Name']
tmdb_rating = []

# Header
headers = {
        "Authorization": f"Bearer {config['TMDB_ACCESS_TOKEN']}"
    }

In [6]:

for filme in my_filmes_name:
    search_url = f"https://api.themoviedb.org/3/search/movie?query={filme}&language=pt=BR"
    response_search = requests.get(search_url,headers=headers)
    
    if response_search.status_code == 200:
        data_search = response_search.json()
        if data_search['results']:
            # Seleciona o primeiro resultado da busca
            movie = data_search['results'][0]
            movie_id = movie['id']
            
            # Chamada para obter mais detalhes do filme
            movie_url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=pt-BR"
            response_movie = requests.get(movie_url,headers=headers)
            
            if response_movie.status_code == 200:
                data_movie = response_movie.json()
                movie.update({key: data_movie.get(key) for key in ['runtime', 'budget', 'revenue']})
                
                production_companies = data_movie.get('production_companies',[])
                
                if production_companies:
                    # Pegando apenas o ID da produtora
                    movie['production_company_id'] = production_companies[0].get('id')
                else:
                    movie['production_company_id'] = None
            else:
                movie['runtime'] = None
                movie['budget'] = None
                movie['revenue'] = None
                
            tmdb_rating.append(movie)
        else:
            print(f"Nenhum resultado encontrado para: {filme}")
    else:
        print(f"Erro na busca pelo filme {filme}: {response_search.status_code}")

        
tmdb_rating_df = pd.DataFrame(tmdb_rating)


In [61]:
tmdb_rating_df.to_csv('my_movies_tmdb.csv', index=False)

In [7]:
# Transformando a variável original_language em numérica
tmdb_rating_df['original_language'] = encoder.fit_transform(tmdb_rating_df['original_language'])

# Colocando a coluna minhas_notas e ano no dataframe tmdb
for i,filme in my_rating_df.iterrows():
    nome_f = filme['Name']
    nota = filme['Rating']
    ano = filme['Year']
    if nome_f in tmdb_rating_df['title'].values:
        tmdb_rating_df.loc[tmdb_rating_df['title'] == nome_f,'year'] = ano
        tmdb_rating_df.loc[tmdb_rating_df['title'] == nome_f,'minhas_notas'] = nota

#Escolhendo as variáveis de interesse
features = ['genre_ids','original_language','popularity','vote_average','vote_count',
            'runtime','budget','revenue','production_company_id','year','minhas_notas']
tmdb_feature_df = tmdb_rating_df[features]


tmdb_feature_df.head()

Unnamed: 0,genre_ids,original_language,popularity,vote_average,vote_count,runtime,budget,revenue,production_company_id,year,minhas_notas
0,[18],2,3.332,7.0,25,92,0,0,20443.0,2021.0,4.0
1,"[80, 28, 35]",2,36.204,7.562,7681,121,12000000,80600000,443.0,2007.0,3.5
2,"[35, 28, 878]",2,24.066,6.8,5466,109,20000000,46100000,443.0,2013.0,3.0
3,"[28, 35, 10749]",2,40.15,7.491,7985,113,85000000,51691156,2527.0,2010.0,4.0
4,"[35, 18, 10749, 10402]",2,76.745,7.9,17108,128,30000000,447407695,491.0,2016.0,3.0


In [8]:
tmdb_feature_df

Unnamed: 0,genre_ids,original_language,popularity,vote_average,vote_count,runtime,budget,revenue,production_company_id,year,minhas_notas
0,[18],2,3.332,7.000,25,92,0,0,20443.0,2021.0,4.0
1,"[80, 28, 35]",2,36.204,7.562,7681,121,12000000,80600000,443.0,2007.0,3.5
2,"[35, 28, 878]",2,24.066,6.800,5466,109,20000000,46100000,443.0,2013.0,3.0
3,"[28, 35, 10749]",2,40.150,7.491,7985,113,85000000,51691156,2527.0,2010.0,4.0
4,"[35, 18, 10749, 10402]",2,76.745,7.900,17108,128,30000000,447407695,491.0,2016.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...
799,"[18, 36]",2,10.093,6.500,639,132,0,0,20580.0,2021.0,1.0
800,"[18, 10749]",5,11.611,7.600,985,106,0,0,53.0,1962.0,2.0
801,"[28, 35, 12]",2,92.355,7.500,8781,132,185000000,168717425,128064.0,2021.0,2.0
802,"[18, 10402]",2,33.675,7.100,7361,111,41000000,242875078,24.0,2002.0,2.0


In [9]:
# Criando uma coluna para cada genre_id para que seja usada como uma variável numérica
all_genres_dif = set([genre for sublist in tmdb_feature_df['genre_ids'] for genre in sublist])
for genre in all_genres_dif:
    tmdb_feature_df.loc[:,f'genre_{genre}'] = tmdb_feature_df['genre_ids'].apply(lambda x: 1 if genre in x else 0)
tmdb_feature_df = tmdb_feature_df.drop(columns=['genre_ids'])
tmdb_feature_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmdb_feature_df.loc[:,f'genre_{genre}'] = tmdb_feature_df['genre_ids'].apply(lambda x: 1 if genre in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmdb_feature_df.loc[:,f'genre_{genre}'] = tmdb_feature_df['genre_ids'].apply(lambda x: 1 if genre in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

Unnamed: 0,original_language,popularity,vote_average,vote_count,runtime,budget,revenue,production_company_id,year,minhas_notas,...,genre_35,genre_36,genre_37,genre_9648,genre_53,genre_80,genre_99,genre_878,genre_10749,genre_10751
0,2,3.332,7.000,25,92,0,0,20443.0,2021.0,4.0,...,0,0,0,0,0,0,0,0,0,0
1,2,36.204,7.562,7681,121,12000000,80600000,443.0,2007.0,3.5,...,1,0,0,0,0,1,0,0,0,0
2,2,24.066,6.800,5466,109,20000000,46100000,443.0,2013.0,3.0,...,1,0,0,0,0,0,0,1,0,0
3,2,40.150,7.491,7985,113,85000000,51691156,2527.0,2010.0,4.0,...,1,0,0,0,0,0,0,0,1,0
4,2,76.745,7.900,17108,128,30000000,447407695,491.0,2016.0,3.0,...,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799,2,10.093,6.500,639,132,0,0,20580.0,2021.0,1.0,...,0,1,0,0,0,0,0,0,0,0
800,5,11.611,7.600,985,106,0,0,53.0,1962.0,2.0,...,0,0,0,0,0,0,0,0,1,0
801,2,92.355,7.500,8781,132,185000000,168717425,128064.0,2021.0,2.0,...,1,0,0,0,0,0,0,0,0,0
802,2,33.675,7.100,7361,111,41000000,242875078,24.0,2002.0,2.0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Uma função que cria um dataframe para a nota que queremos reavaliar e outra para o restante do data frame
def nota_alvo(df: pd.DataFrame ,rating: float):
    filmes_nota_alvo = df.loc[df['minhas_notas']==rating]
    filmes_dif_nota_alvo = df.loc[~df['minhas_notas'].isin(filmes_nota_alvo['minhas_notas'])]
    return (filmes_dif_nota_alvo,filmes_nota_alvo) 

In [25]:
K = tmdb_feature_df.dropna()
l = K.minhas_notas
K = K.drop(columns=['minhas_notas'])

K_train,K_valid,l_train,l_valid = train_test_split(K,l, train_size = 0.8, test_size = 0.2, random_state=0)

In [27]:

def score_model(model, X_t=K_train, X_v=K_valid, y_t=l_train, y_v=l_valid):
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

for i in range(0, len(dtr_models)):
    mae = score_model(dtr_models[i])
    print("Model %d MAE: %d" % (i+1, mae))

for i in range(0, len(forest_models)):
    mae = score_model(forest_models[i])
    print("Model %d MAE: %d" % (i+1, mae))

Model 1 MAE: 1
Model 2 MAE: 1
Model 3 MAE: 1
Model 4 MAE: 1
Model 1 MAE: 0
Model 2 MAE: 0
Model 3 MAE: 0
Model 4 MAE: 0
Model 5 MAE: 0


In [50]:
X,y = nota_alvo(df=tmdb_feature_df, rating=1.5)
X.isna().sum()

original_language         0
popularity                0
vote_average              0
vote_count                0
runtime                   0
budget                    0
revenue                   0
production_company_id    24
year                     30
minhas_notas             30
genre_10752               0
genre_12                  0
genre_14                  0
genre_16                  0
genre_18                  0
genre_10770               0
genre_27                  0
genre_28                  0
genre_10402               0
genre_35                  0
genre_36                  0
genre_37                  0
genre_9648                0
genre_53                  0
genre_80                  0
genre_99                  0
genre_878                 0
genre_10749               0
genre_10751               0
dtype: int64

In [51]:
X = X.dropna()
test_X = X.drop(columns=['minhas_notas'])
test_y = X['minhas_notas']
X.isna().sum()

original_language        0
popularity               0
vote_average             0
vote_count               0
runtime                  0
budget                   0
revenue                  0
production_company_id    0
year                     0
minhas_notas             0
genre_10752              0
genre_12                 0
genre_14                 0
genre_16                 0
genre_18                 0
genre_10770              0
genre_27                 0
genre_28                 0
genre_10402              0
genre_35                 0
genre_36                 0
genre_37                 0
genre_9648               0
genre_53                 0
genre_80                 0
genre_99                 0
genre_878                0
genre_10749              0
genre_10751              0
dtype: int64

In [52]:
val_x = y.drop(columns=['minhas_notas'])
val_x.loc[val_x['production_company_id'].isna(), 'production_company_id'] = 0
val_x.isna().sum()

original_language        0
popularity               0
vote_average             0
vote_count               0
runtime                  0
budget                   0
revenue                  0
production_company_id    0
year                     0
genre_10752              0
genre_12                 0
genre_14                 0
genre_16                 0
genre_18                 0
genre_10770              0
genre_27                 0
genre_28                 0
genre_10402              0
genre_35                 0
genre_36                 0
genre_37                 0
genre_9648               0
genre_53                 0
genre_80                 0
genre_99                 0
genre_878                0
genre_10749              0
genre_10751              0
dtype: int64

In [53]:
models_results = []
for i in range(0,len(dtr_models)):
    dtr_models[i].fit(test_X,test_y)
    dtr_predict = dtr_models[1].predict(val_x)
    models_results.append(dtr_predict)

for i in range(0,len(forest_models)):
    forest_models[i].fit(test_X,test_y)
    forest_predict = forest_models[i].predict(val_x)
    models_results.append(forest_predict)

In [54]:
for i in range(0,len(models_results)):
    if i <=4:
        val_x[f'new_rating_{i}'] = models_results[i]
    else:
        val_x[f'forest_rating_{i}'] = models_results[i]
    

In [55]:
result = pd.concat([tmdb_rating_df.iloc[[id]] for id in val_x.index], ignore_index=True)
for i in range(0,len(models_results)):
    if i <= 4:
        result[f'new_rating_{i}'] = val_x[f'new_rating_{i}'].values
    else:
        result[f'forest_rating_{i}'] = val_x[f'forest_rating_{i}'].values
result[['title','minhas_notas','new_rating_0','new_rating_1','new_rating_2','new_rating_3',
        'new_rating_4','forest_rating_5','forest_rating_6','forest_rating_7','forest_rating_8']]

Unnamed: 0,title,minhas_notas,new_rating_0,new_rating_1,new_rating_2,new_rating_3,new_rating_4,forest_rating_5,forest_rating_6,forest_rating_7,forest_rating_8
0,World War Z,1.5,1.5,2.5,2.5,2.5,1.49,1.535,1.51,1.69,1.845
1,Mr. Popper's Penguins,1.5,1.5,1.0,1.0,1.0,1.63,1.35,1.35,1.7,1.555
2,Avengers: Endgame,1.5,1.5,4.0,4.0,4.0,2.35,2.97,2.96,2.49,2.415
3,The Amazing Spider-Man 2,1.5,1.5,1.0,1.0,1.0,1.39,1.375,1.345,1.405,1.39
4,I Am Legend,1.5,1.5,1.0,1.0,1.0,2.24,2.325,2.285,2.18,2.3
5,Iron Man 2,1.5,1.5,2.5,2.5,2.5,1.59,1.335,1.31,1.485,1.525
6,Gonjiam: Haunted Asylum,1.5,1.5,3.5,3.5,3.5,3.19,3.555,3.58,3.21,3.1075
7,Kingsman: The Golden Circle,1.5,1.5,2.5,2.5,2.5,2.34,2.695,2.685,2.3,2.2825
8,Mulholland Drive,1.5,1.5,4.0,4.0,4.0,3.56,3.525,3.54,3.54,3.5975
9,American Beauty,1.5,1.5,3.5,3.5,3.5,3.61,3.62,3.625,3.595,3.6825


In [59]:
result[['title','minhas_notas','new_rating_1','forest_rating_5','forest_rating_6',
       'forest_rating_7','forest_rating_8']].sort_values(['forest_rating_7']).tail(10)

Unnamed: 0,title,minhas_notas,new_rating_1,forest_rating_5,forest_rating_6,forest_rating_7,forest_rating_8
11,Fear Street: 1978,1.5,2.5,2.42,2.445,2.77,2.645
15,Natural Born Killers,1.5,4.0,2.875,2.89,2.83,2.845
23,The Hangover,1.5,2.0,3.01,3.02,2.83,3.005
27,Limite,1.5,2.0,2.87,2.85,3.04,2.975
13,All the Bright Places,1.5,0.5,2.625,2.625,3.165,2.895
6,Gonjiam: Haunted Asylum,1.5,3.5,3.555,3.58,3.21,3.1075
30,The Evil Dead,1.5,5.0,3.175,3.17,3.3,3.2875
31,Shutter Island,1.5,3.0,3.685,3.68,3.535,3.5925
8,Mulholland Drive,1.5,4.0,3.525,3.54,3.54,3.5975
9,American Beauty,1.5,3.5,3.62,3.625,3.595,3.6825


In [60]:
result[['title','minhas_notas','new_rating_1','forest_rating_5','forest_rating_6',
       'forest_rating_7','forest_rating_8']].sort_values(['forest_rating_7']).head(10)

Unnamed: 0,title,minhas_notas,new_rating_1,forest_rating_5,forest_rating_6,forest_rating_7,forest_rating_8
3,The Amazing Spider-Man 2,1.5,1.0,1.375,1.345,1.405,1.39
10,The Amazing Spider-Man,1.5,1.0,1.58,1.555,1.41,1.4675
5,Iron Man 2,1.5,2.5,1.335,1.31,1.485,1.525
0,World War Z,1.5,2.5,1.535,1.51,1.69,1.845
1,Mr. Popper's Penguins,1.5,1.0,1.35,1.35,1.7,1.555
19,I Now Pronounce You Chuck & Larry,1.5,1.0,1.705,1.715,1.71,1.745
21,Night at the Museum,1.5,2.0,1.585,1.585,1.73,1.845
14,Tooth Fairy,1.5,1.0,1.49,1.49,1.755,1.735
26,G-Force,1.5,1.0,1.69,1.69,1.765,1.555
22,Norbit,1.5,1.0,1.55,1.545,1.815,1.7
