In [2]:
import requests
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import LabelEncoder

In [21]:
dtr_model = DecisionTreeRegressor(random_state=1)
forest_model = RandomForestRegressor(random_state=1)

encoder = LabelEncoder()

In [5]:
my_rating_df = pd.read_csv('ratings.csv')
my_rating_df.head()

Unnamed: 0,Date,Name,Year,Letterboxd URI,Rating
0,2021-11-29,Violet,2021,https://boxd.it/oMCE,4.0
1,2021-11-29,Hot Fuzz,2007,https://boxd.it/2416,3.5
2,2021-11-29,The World's End,2013,https://boxd.it/3EiO,3.0
3,2021-11-29,Scott Pilgrim vs. the World,2010,https://boxd.it/1Aq6,4.0
4,2021-11-29,La La Land,2016,https://boxd.it/a5fa,3.0


In [216]:
import json

with open("config.json","r") as file:
    config = json.load(file)

In [218]:
# TMDB API requests
API_KEY = config['TMDB_API_KEY']
# Selecionando a series contendo os nomes de filmes do meu dataframe
my_filmes_name = my_rating_df['Name']
tmdb_rating = []

# Header
headers = {
        "Authorization": f"Bearer {config['TMDB_ACCESS_TOKEN']}"
    }

In [12]:

for filme in my_filmes_name:
    search_url = f"https://api.themoviedb.org/3/search/movie?query={filme}&language=pt=BR"
    response_search = requests.get(search_url,headers=headers)
    
    if response_search.status_code == 200:
        data_search = response_search.json()
        if data_search['results']:
            # Seleciona o primeiro resultado da busca
            movie = data_search['results'][0]
            movie_id = movie['id']
            
            # Chamada para obter mais detalhes do filme
            movie_url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=pt-BR"
            response_movie = requests.get(movie_url,headers=headers)
            
            if response_movie.status_code == 200:
                data_movie = response_movie.json()
                movie.update({key: data_movie.get(key) for key in ['runtime', 'budget', 'revenue']})
                
                production_companies = data_movie.get('production_companies',[])
                
                if production_companies:
                    # Pegando apenas o ID da produtora
                    movie['production_company_id'] = production_companies[0].get('id')
                else:
                    movie['production_company_id'] = None
            else:
                movie['runtime'] = None
                movie['budget'] = None
                movie['revenue'] = None
                
            tmdb_rating.append(movie)
        else:
            print(f"Nenhum resultado encontrado para: {filme}")
    else:
        print(f"Erro na busca pelo filme {filme}: {response_search.status_code}")

        
tmdb_rating_df = pd.DataFrame(tmdb_rating)


In [32]:
# Transformando a variável original_language em numérica
tmdb_rating_df['original_language'] = encoder.fit_transform(tmdb_rating_df['original_language'])

# Colocando a coluna minhas_notas e ano no dataframe tmdb
for i,filme in my_rating_df.iterrows():
    nome_f = filme['Name']
    nota = filme['Rating']
    ano = filme['Year']
    if nome_f in tmdb_rating_df['title'].values:
        tmdb_rating_df.loc[tmdb_rating_df['title'] == nome_f,'year'] = ano
        tmdb_rating_df.loc[tmdb_rating_df['title'] == nome_f,'minhas_notas'] = nota

#Escolhendo as variáveis de interesse
features = ['genre_ids','original_language','popularity','vote_average','vote_count',
            'runtime','budget','revenue','production_company_id','year','minhas_notas']
tmdb_feature_df = tmdb_rating_df[features]


tmdb_feature_df.head()

Unnamed: 0,genre_ids,original_language,popularity,vote_average,vote_count,runtime,budget,revenue,production_company_id,year,minhas_notas
0,[18],2,5.446,7.0,25,92,0,0,20443.0,2021.0,4.0
1,"[80, 28, 35]",2,39.681,7.561,7680,121,12000000,80600000,443.0,2007.0,3.5
2,"[35, 28, 878]",2,18.752,6.8,5466,109,20000000,46100000,443.0,2013.0,3.0
3,"[28, 35, 10749]",2,39.321,7.5,7984,113,85000000,51691156,2527.0,2010.0,4.0
4,"[35, 18, 10749, 10402]",2,76.174,7.9,17107,128,30000000,447407695,491.0,2016.0,3.0


In [33]:
tmdb_feature_df

Unnamed: 0,genre_ids,original_language,popularity,vote_average,vote_count,runtime,budget,revenue,production_company_id,year,minhas_notas
0,[18],2,5.446,7.000,25,92,0,0,20443.0,2021.0,4.0
1,"[80, 28, 35]",2,39.681,7.561,7680,121,12000000,80600000,443.0,2007.0,3.5
2,"[35, 28, 878]",2,18.752,6.800,5466,109,20000000,46100000,443.0,2013.0,3.0
3,"[28, 35, 10749]",2,39.321,7.500,7984,113,85000000,51691156,2527.0,2010.0,4.0
4,"[35, 18, 10749, 10402]",2,76.174,7.900,17107,128,30000000,447407695,491.0,2016.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...
799,"[18, 36]",2,12.177,6.500,639,132,0,0,20580.0,2021.0,1.0
800,"[18, 10749]",5,12.403,7.600,985,106,0,0,53.0,1962.0,2.0
801,"[28, 35, 12]",2,98.942,7.491,8781,132,185000000,168717425,128064.0,2021.0,2.0
802,"[18, 10402]",2,34.492,7.138,7360,111,41000000,242875078,24.0,2002.0,2.0


In [34]:
# Criando uma coluna para cada genre_id para que seja usada como uma variável numérica
all_genres_dif = set([genre for sublist in tmdb_feature_df['genre_ids'] for genre in sublist])
for genre in all_genres_dif:
    tmdb_feature_df.loc[:,f'genre_{genre}'] = tmdb_feature_df['genre_ids'].apply(lambda x: 1 if genre in x else 0)
tmdb_feature_df = tmdb_feature_df.drop(columns=['genre_ids'])
tmdb_feature_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmdb_feature_df.loc[:,f'genre_{genre}'] = tmdb_feature_df['genre_ids'].apply(lambda x: 1 if genre in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmdb_feature_df.loc[:,f'genre_{genre}'] = tmdb_feature_df['genre_ids'].apply(lambda x: 1 if genre in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

Unnamed: 0,original_language,popularity,vote_average,vote_count,runtime,budget,revenue,production_company_id,year,minhas_notas,...,genre_35,genre_36,genre_37,genre_9648,genre_53,genre_80,genre_99,genre_878,genre_10749,genre_10751
0,2,5.446,7.000,25,92,0,0,20443.0,2021.0,4.0,...,0,0,0,0,0,0,0,0,0,0
1,2,39.681,7.561,7680,121,12000000,80600000,443.0,2007.0,3.5,...,1,0,0,0,0,1,0,0,0,0
2,2,18.752,6.800,5466,109,20000000,46100000,443.0,2013.0,3.0,...,1,0,0,0,0,0,0,1,0,0
3,2,39.321,7.500,7984,113,85000000,51691156,2527.0,2010.0,4.0,...,1,0,0,0,0,0,0,0,1,0
4,2,76.174,7.900,17107,128,30000000,447407695,491.0,2016.0,3.0,...,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799,2,12.177,6.500,639,132,0,0,20580.0,2021.0,1.0,...,0,1,0,0,0,0,0,0,0,0
800,5,12.403,7.600,985,106,0,0,53.0,1962.0,2.0,...,0,0,0,0,0,0,0,0,1,0
801,2,98.942,7.491,8781,132,185000000,168717425,128064.0,2021.0,2.0,...,1,0,0,0,0,0,0,0,0,0
802,2,34.492,7.138,7360,111,41000000,242875078,24.0,2002.0,2.0,...,0,0,0,0,0,0,0,0,0,0


In [208]:
# Uma função que cria um dataframe para a nota que queremos reavaliar e outra para o restante do data frame
def nota_alvo(df: pd.DataFrame ,rating: float):
    filmes_nota_alvo = df.loc[df['minhas_notas']==rating]
    filmes_dif_nota_alvo = df.loc[~df['minhas_notas'].isin(filmes_nota_alvo['minhas_notas'])]
    return (filmes_dif_nota_alvo,filmes_nota_alvo) 

In [209]:
X,y = nota_alvo(df=tmdb_feature_df, rating=1.5)

In [210]:
X = X.dropna()
test_X = X.drop(columns=['minhas_notas'])
test_X
test_y = X['minhas_notas']


In [211]:
dtr_model.fit(test_X,test_y)
forest_model.fit(test_X,test_y)
dtr_predict = dtr_model.predict(val_x)
forest_predict = forest_model.predict(val_x)

In [212]:
val_x = y.drop(columns=['minhas_notas'])
val_x

Unnamed: 0,original_language,popularity,vote_average,vote_count,runtime,budget,revenue,production_company_id,year,genre_10752,...,genre_35,genre_36,genre_37,genre_9648,genre_53,genre_80,genre_99,genre_878,genre_10749,genre_10751
46,2,94.738,6.819,15567,123,200000000,531865000,3281.0,2013.0,0,...,0,0,0,0,1,0,0,1,0,0
73,2,20.187,6.0,2991,94,55000000,187361754,1302.0,2011.0,0,...,1,0,0,0,0,0,0,0,0,1
76,2,133.255,8.24,26065,181,356000000,2799439100,420.0,2019.0,0,...,0,0,0,0,0,0,0,1,0,0
77,2,85.092,6.5,13403,141,200000000,708962323,5.0,2014.0,0,...,0,0,0,0,0,0,0,1,0,0
82,2,68.695,7.208,15904,101,150000000,585410052,174.0,2007.0,0,...,0,0,0,0,1,0,0,1,0,0
103,2,95.255,6.8,21221,124,200000000,623933331,420.0,2010.0,0,...,0,0,0,0,0,0,0,1,0,0
145,13,43.183,7.3,735,95,0,21041950,108532.0,2018.0,0,...,0,0,0,1,0,0,0,0,0,0
188,2,45.72,6.962,10510,141,104000000,410902662,25.0,2017.0,0,...,1,0,0,0,0,0,0,0,0,0
246,2,54.158,7.808,6397,147,15000000,20289986,694.0,2001.0,0,...,0,0,0,1,1,0,0,0,0,0
283,2,49.409,8.009,12203,122,15000000,356296601,7.0,1999.0,0,...,0,0,0,0,0,0,0,0,0,0


In [214]:
val_x['new_rating'] = dtr_predict

val_x['forest_rating'] = forest_predict

In [215]:
result = pd.concat([tmdb_rating_df.iloc[[id]] for id in val_x.index], ignore_index=True)
result['new_rating'] = val_x['new_rating'].values
result['forest_rating'] = val_x['forest_rating'].values
result[['title','minhas_notas','new_rating','forest_rating']]

Unnamed: 0,title,minhas_notas,new_rating,forest_rating
0,World War Z,1.5,1.0,1.485
1,Mr. Popper's Penguins,1.5,1.0,1.34
2,Avengers: Endgame,1.5,4.0,2.79
3,The Amazing Spider-Man 2,1.5,0.5,1.375
4,I Am Legend,1.5,4.0,2.425
5,Iron Man 2,1.5,1.0,1.315
6,Gonjiam: Haunted Asylum,1.5,3.5,2.885
7,Kingsman: The Golden Circle,1.5,2.0,2.45
8,Mulholland Drive,1.5,4.0,3.615
9,American Beauty,1.5,3.0,3.53


In [1]:
result[['title','minhas_notas','new_rating','forest_rating']].sort_values(['forest_rating'])

NameError: name 'result' is not defined