In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error

from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

from xgboost import XGBRegressor

In [2]:
tmdb_df = pd.read_csv('my_movies_tmdb.csv')
tmdb_df.head()

Unnamed: 0,title,tmdb_id,tmdb_title,year,genre_ids,original_language,popularity,vote_count,vote_average,budget,revenue,runtime,production_company_id,minhas_notas
0,Violet,665901,Violet,2021,[18],en,1.025,25,7.0,0,0,92,20443.0,4.0
1,Hot Fuzz,4638,Hot Fuzz,2007,"[80, 28, 35]",en,5.328,7691,7.562,12000000,80600000,121,443.0,3.5
2,The World's End,107985,The World's End,2013,"[35, 28, 878]",en,4.319,5473,6.807,20000000,46100000,109,443.0,3.0
3,Scott Pilgrim vs. the World,22538,Scott Pilgrim vs. the World,2010,"[28, 35, 10749]",en,8.117,7993,7.491,85000000,51691156,113,2527.0,4.0
4,La La Land,313369,La La Land,2016,"[35, 18, 10749, 10402]",en,12.232,17121,7.9,30000000,509183536,129,491.0,3.0


In [3]:
# Uma função que cria um dataframe para a nota que queremos reavaliar e outra para o restante do data frame
def nota_alvo(df: pd.DataFrame ,rating: float):
    filmes_nota_alvo = df.loc[df['minhas_notas']==rating]
    filmes_dif_nota_alvo = df.loc[~df['minhas_notas'].isin(filmes_nota_alvo['minhas_notas'])]
    return (filmes_dif_nota_alvo,filmes_nota_alvo) 

In [4]:
df_outras_notas, df_nota_alvo = nota_alvo(tmdb_df,1.5)

y_outras = df_outras_notas.minhas_notas
X_outras_full = df_outras_notas.drop(['minhas_notas','tmdb_id'],axis=1)

y_alvo = df_nota_alvo.minhas_notas
X_alvo_full = df_nota_alvo.drop(['minhas_notas','tmdb_id'],axis=1)

numerical_cols = [col_name for col_name in X_outras_full.columns if X_outras_full[col_name].dtype in ['int64','float64']]
my_cols = numerical_cols + ['original_language']

X_outras = X_outras_full[my_cols].copy()
X_alvo = X_alvo_full[my_cols].copy()

In [5]:
y = tmdb_df.minhas_notas
X = tmdb_df[my_cols]
X

Unnamed: 0,year,popularity,vote_count,vote_average,budget,revenue,runtime,production_company_id,original_language
0,2021,1.025,25,7.000,0,0,92,20443.0,en
1,2007,5.328,7691,7.562,12000000,80600000,121,443.0,en
2,2013,4.319,5473,6.807,20000000,46100000,109,443.0,en
3,2010,8.117,7993,7.491,85000000,51691156,113,2527.0,en
4,2016,12.232,17121,7.900,30000000,509183536,129,491.0,en
...,...,...,...,...,...,...,...,...,...
801,2010,4.262,421,7.190,700000,0,115,6897.0,ko
802,2001,3.997,1122,6.830,19800000,87754044,137,11487.0,en
803,2019,3.558,136,5.886,0,0,97,17653.0,fr
804,2022,2.073,57,6.600,0,0,109,97367.0,vi


In [6]:
label_X_outras = X_outras.copy()
label_X_alvo = X_alvo.copy()

ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1)
label_X_outras[['original_language']] = ordinal_encoder.fit_transform(X_outras[['original_language']])
label_X_alvo[['original_language']] = ordinal_encoder.transform(X_alvo[['original_language']])

In [7]:
company_imputer = SimpleImputer(strategy='constant',fill_value=0)

imp_X_outras = pd.DataFrame(company_imputer.fit_transform(label_X_outras))
imp_X_alvo = pd.DataFrame(company_imputer.transform(label_X_alvo))

imp_X_outras.columns = label_X_outras.columns
imp_X_alvo.columns = label_X_alvo.columns

imp_X_outras.isnull().sum()

year                     0
popularity               0
vote_count               0
vote_average             0
budget                   0
revenue                  0
runtime                  0
production_company_id    0
original_language        0
dtype: int64

In [8]:
xgb_model = XGBRegressor(n_estimators=500)
xgb_model.fit(imp_X_outras,y_outras)
predicts = xgb_model.predict(imp_X_alvo)
mean_absolute_error(predicts,y_alvo)

1.067035517655313

In [9]:
def score_model(model, X_train, X_valid, y_train, y_valid):
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [10]:
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [11]:
score_dataset(imp_X_outras,imp_X_alvo,y_outras,y_alvo)

0.93484375

In [19]:
X_alvo_full['new_rating'] = predicts
X_alvo_full[['title','new_rating']].sort_values('new_rating')

Unnamed: 0,title,new_rating
44,World War Z,0.460715
283,The Amazing Spider-Man,0.798419
101,Iron Man 2,0.935211
75,The Amazing Spider-Man 2,1.052499
418,Norbit,1.161462
432,G-Force,1.214483
382,I Now Pronounce You Chuck & Larry,1.301921
71,Mr. Popper's Penguins,1.470833
355,Tooth Fairy,1.511153
393,Night at the Museum,1.60781


In [13]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num',company_imputer,['production_company_id']),
        ('cat',ordinal_encoder,['original_language'])
    ])

In [14]:
xgb_pipeline = Pipeline(steps=[('preprocessor',preprocessor),
                              ('model',xgb_model)
                              ])

In [15]:
xgb_pipeline.fit(imp_X_outras,y_outras)

In [16]:
scores = -1 * cross_val_score(xgb_pipeline,X,y,
                             cv=5,
                             scoring='neg_mean_absolute_error')
scores.mean()

1.183016573243799