In [1]:
import pandas as pd
import boto3
import json
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

pd.set_option('display.max_columns', None)

# Data source: TMDB 5000 Movie Dataset (Kaggle)
# https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata/data?select=tmdb_5000_movies.csv

In [133]:
# Load the TMDB movie metadata from the local CSV file.
df = pd.read_csv('data/tmdb_5000_movies.csv')
# Display the full record of the row with the highest `popularity` value.
df.loc[df['popularity'].idxmax()]



budget                                                           74000000
genres                  [{"id": 10751, "name": "Family"}, {"id": 16, "...
homepage                                     http://www.minionsmovie.com/
id                                                                 211672
keywords                [{"id": 3487, "name": "assistant"}, {"id": 179...
original_language                                                      en
original_title                                                    Minions
overview                Minions Stuart, Kevin and Bob are recruited by...
popularity                                                     875.581305
production_companies    [{"name": "Universal Pictures", "id": 33}, {"n...
production_countries    [{"iso_3166_1": "US", "name": "United States o...
release_date                                                   2015-06-17
revenue                                                        1156730962
runtime                               

In [134]:
# Remove the popularity outlier shown above (id 211672, "Minions").
# .copy() makes the filtered frame independent of the original, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df = df[df['id'] != 211672].copy()


--------------


In [135]:
# Parse release dates; unparseable values become NaT and end up as release_year 0.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['release_year'] = df['release_date'].dt.year.fillna(0).astype(int)

# df['budget'] = pd.to_numeric(df['budget'], errors='coerce').fillna(0)
# Coerce the numeric features; non-numeric or missing entries become 0.
df['runtime'] = pd.to_numeric(df['runtime'], errors='coerce').fillna(0)
df['vote_average'] = pd.to_numeric(df['vote_average'], errors='coerce').fillna(0)
df['vote_count']   = pd.to_numeric(df['vote_count'], errors='coerce').fillna(0)


def _genre_names(raw):
    """Return the list of genre names held in one raw `genres` cell.

    Lists are passed through unchanged so this cell can be re-run after the
    column has already been parsed; missing values yield an empty list.
    """
    if isinstance(raw, list):
        return raw
    if pd.isnull(raw):
        return []
    return [g['name'] for g in ast.literal_eval(raw)]


# Turn the serialized genre records into plain lists of genre names.
df['genres'] = df['genres'].apply(_genre_names)
num_df = df[['runtime', 'vote_average', 'vote_count', 'release_year']]
num_df.head()

Unnamed: 0,runtime,vote_average,vote_count,release_year
0,162.0,7.2,11800,2009
1,169.0,6.9,4500,2007
2,148.0,6.3,4466,2015
3,165.0,7.6,9106,2012
4,132.0,6.1,2124,2012


In [136]:
# Multi-hot encode the genre lists: one indicator column per genre seen in the data.
mlb = MultiLabelBinarizer()
genre_ohe = mlb.fit_transform(df['genres'])
genre_cols = ["genre_{}".format(label) for label in mlb.classes_]
# Reuse df's index so the genre columns line up row-for-row with the other features.
genre_df = pd.DataFrame(data=genre_ohe, index=df.index, columns=genre_cols)
genre_df.head()

Unnamed: 0,genre_Action,genre_Adventure,genre_Animation,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Family,genre_Fantasy,genre_Foreign,genre_History,genre_Horror,genre_Music,genre_Mystery,genre_Romance,genre_Science Fiction,genre_TV Movie,genre_Thriller,genre_War,genre_Western
0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
4,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [137]:
# One-hot encode the original language as a dense array; with
# handle_unknown='ignore', categories not seen during fit encode as all zeros.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
lang_ohe = ohe.fit_transform(df[['original_language']])
lang_cols = ["lang_{}".format(code) for code in ohe.categories_[0]]
# Reuse df's index so the language columns line up row-for-row with the other features.
lang_df = pd.DataFrame(data=lang_ohe, index=df.index, columns=lang_cols)
lang_df.head()

Unnamed: 0,lang_af,lang_ar,lang_cn,lang_cs,lang_da,lang_de,lang_el,lang_en,lang_es,lang_fa,lang_fr,lang_he,lang_hi,lang_hu,lang_id,lang_is,lang_it,lang_ja,lang_ko,lang_ky,lang_nb,lang_nl,lang_no,lang_pl,lang_ps,lang_pt,lang_ro,lang_ru,lang_sl,lang_sv,lang_ta,lang_te,lang_th,lang_tr,lang_vi,lang_xx,lang_zh
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [140]:
# Combine everything: numeric, genre and language features side by side;
# any remaining gaps are filled with 0.
X = pd.concat([num_df, genre_df, lang_df], axis='columns').fillna(value=0)
# Target: popularity, with missing values replaced by 0.
y = df['popularity'].fillna(value=0)
X

Unnamed: 0,runtime,vote_average,vote_count,release_year,genre_Action,genre_Adventure,genre_Animation,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Family,genre_Fantasy,genre_Foreign,genre_History,genre_Horror,genre_Music,genre_Mystery,genre_Romance,genre_Science Fiction,genre_TV Movie,genre_Thriller,genre_War,genre_Western,lang_af,lang_ar,lang_cn,lang_cs,lang_da,lang_de,lang_el,lang_en,lang_es,lang_fa,lang_fr,lang_he,lang_hi,lang_hu,lang_id,lang_is,lang_it,lang_ja,lang_ko,lang_ky,lang_nb,lang_nl,lang_no,lang_pl,lang_ps,lang_pt,lang_ro,lang_ru,lang_sl,lang_sv,lang_ta,lang_te,lang_th,lang_tr,lang_vi,lang_xx,lang_zh
0,162.0,7.2,11800,2009,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,169.0,6.9,4500,2007,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,148.0,6.3,4466,2015,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,165.0,7.6,9106,2012,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,132.0,6.1,2124,2012,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,81.0,6.6,238,1992,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4799,85.0,5.9,5,2011,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4800,120.0,7.0,6,2013,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4801,98.0,5.7,7,2012,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [141]:

# 4) Train/test split: hold out 30% of the rows, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 5) Pipeline: feature scaling followed by the random forest model
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(max_depth=10, n_estimators=200, random_state=42)),
])

# 6) Train
pipeline.fit(X_train, y_train)

# 7) Evaluate (optional): R² on the training rows and on the held-out rows
print("R² treino:", pipeline.score(X_train, y_train))
print("R² teste:", pipeline.score(X_test, y_test))

R² treino: 0.9580814084357554
R² teste: 0.7430361878609464


In [159]:
# Pick a random row label from the test set
random_idx = np.random.choice(X_test.index)
# Select with a list so the sample stays a one-row DataFrame. The pipeline was
# fitted on a DataFrame, and passing a bare NumPy array drops the column names
# (scikit-learn warns about missing feature names in that case).
X_sample = X_test.loc[[random_idx]]
y_real = y_test.loc[random_idx]

# Make the prediction
y_pred = pipeline.predict(X_sample)[0]

# Print the movie title next to its actual and predicted popularity
print(f"Movie: {df.loc[random_idx, 'title']}")
print(f"Real popularity:     {y_real:.2f}")
print(f"Predicted popularity: {y_pred:.2f}")

Movie: Pirates of the Caribbean: On Stranger Tides
Real popularity:     135.41
Predicted popularity: 68.55




In [161]:
# Display the record that now has the highest `popularity` in the filtered frame.
df.loc[df['popularity'].idxmax()]

budget                                                          165000000
genres                                [Adventure, Drama, Science Fiction]
homepage                                http://www.interstellarmovie.net/
id                                                                 157336
keywords                [{"id": 83, "name": "saving the world"}, {"id"...
original_language                                                      en
original_title                                               Interstellar
overview                Interstellar chronicles the adventures of a gr...
popularity                                                     724.247784
production_companies    [{"name": "Paramount Pictures", "id": 4}, {"na...
production_countries    [{"iso_3166_1": "CA", "name": "Canada"}, {"iso...
release_date                                          2014-11-05 00:00:00
revenue                                                         675120017
runtime                               

In [None]:
def _parse_genre_names(raw):
    """Return the genre names stored in one raw `genres` cell.

    The cell is expected to hold a serialized list of records that each have
    a "name" key. JSON is tried first so that valid JSON literals
    (null/true/false) are accepted; Python-literal syntax is still handled
    through ast.literal_eval. Missing or empty cells give an empty list.
    """
    if pd.isnull(raw) or raw == '':
        return []
    try:
        entries = json.loads(raw)
    except (TypeError, ValueError):
        # Not valid JSON (json.JSONDecodeError is a ValueError): fall back to
        # the Python-literal parser.
        entries = ast.literal_eval(raw)
    return [entry['name'] for entry in entries]


def predict_popularity_on_new_csv(csv_path, pipeline, mlb, ohe):
    """
    Load a movies CSV, rebuild the training-time features and predict popularity.

    Parameters
    ----------
    csv_path : path of the CSV file to score. It must contain the columns
        release_date, runtime, vote_average, vote_count, genres,
        original_language and title.
    pipeline : fitted estimator exposing ``predict``.
    mlb : fitted genre binarizer exposing ``transform`` and ``classes_``.
    ohe : fitted language encoder exposing ``transform`` and ``categories_``.

    Returns
    -------
    DataFrame with the columns ``title`` and ``predicted_popularity``, one row
    per row of the CSV.
    """
    # Load new data
    new_df = pd.read_csv(csv_path)

    # Parse release_date and extract the year (unparseable dates -> year 0)
    new_df['release_date'] = pd.to_datetime(new_df['release_date'], errors='coerce')
    new_df['release_year'] = new_df['release_date'].dt.year.fillna(0).astype(int)

    # Numeric features: non-numeric or missing entries become 0
    for column in ('runtime', 'vote_average', 'vote_count'):
        new_df[column] = pd.to_numeric(new_df[column], errors='coerce').fillna(0)
    num_df_new = new_df[['runtime', 'vote_average', 'vote_count', 'release_year']]

    # Genres: parse each cell into a list of names, then encode with the fitted binarizer
    new_df['genres'] = new_df['genres'].apply(_parse_genre_names)
    genre_ohe_new = mlb.transform(new_df['genres'])
    genre_df_new = pd.DataFrame(
        genre_ohe_new,
        columns=[f"genre_{g}" for g in mlb.classes_],
        index=new_df.index,
    )

    # Language one-hot, using the categories learned by the fitted encoder
    lang_ohe_new = ohe.transform(new_df[['original_language']])
    lang_df_new = pd.DataFrame(
        lang_ohe_new,
        columns=[f"lang_{cat}" for cat in ohe.categories_[0]],
        index=new_df.index,
    )

    # Combine features in the same column order used for training
    X_new = pd.concat([num_df_new, genre_df_new, lang_df_new], axis=1).fillna(0)

    # Predict
    y_pred_new = pipeline.predict(X_new)

    # Attach the predictions and return them next to the titles
    new_df['predicted_popularity'] = y_pred_new
    return new_df[['title', 'predicted_popularity']]

# Example usage: score the movies listed in data/new_movies.csv, passing the
# `pipeline`, `mlb` and `ohe` objects created in the earlier cells.
new_predictions = predict_popularity_on_new_csv('data/new_movies.csv', pipeline, mlb, ohe)