# Importing Libraries

In [43]:
# Import Libraries

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib


  from .autonotebook import tqdm as notebook_tqdm


# Loading Data

In [39]:
# Loading the Dataset
df = pd.read_csv("..\Data\Movies.csv")

# Check columns
print(df.columns)


Index(['id', 'title', 'original_title', 'overview', 'adult',
       'original_language', 'popularity', 'release_date', 'vote_average',
       'vote_count'],
      dtype='object')


  df = pd.read_csv("..\Data\Movies.csv")


# Data Cleaning

In [40]:
# Fill missing values
df['overview'] = df['overview'].fillna('')
df['title'] = df['title'].fillna('')
df['original_language'] = df['original_language'].fillna('unknown')
df['adult'] = df['adult'].fillna(False)
df['popularity'] = df['popularity'].fillna(df['popularity'].median())

# Features and target
X = df[['overview', 'title', 'original_language', 'adult', 'popularity']]
y = df['vote_average']

# Data Splitting

In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# NLP Embeddings and Preprocessing

In [44]:
# Initialize SentenceTransformer model
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Function to embed text columns
def embed_text(df, column):
    return np.array([embedder.encode(text) for text in df[column]])

# FunctionTransformer for ColumnTransformer
overview_embed = FunctionTransformer(lambda x: embed_text(pd.DataFrame(x, columns=['overview']), 'overview'))
title_embed = FunctionTransformer(lambda x: embed_text(pd.DataFrame(x, columns=['title']), 'title'))

# Preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ('overview_emb', overview_embed, ['overview']),
    ('title_emb', title_embed, ['title']),
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['original_language', 'adult']),
    ('num', StandardScaler(), ['popularity'])
])


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


# Model Pipeline and Hyperparameter Tuning

In [45]:
# Pipeline with preprocessor + RandomForestRegressor
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Hyperparameter grid for RandomizedSearchCV
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10]
}

# Randomized Search CV
search = RandomizedSearchCV(
    model_pipeline,
    param_distributions=param_grid,
    n_iter=5,
    cv=3,
    scoring='r2',
    verbose=2,
    random_state=42
)

# Train the model
search.fit(X_train, y_train)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] END regressor__max_depth=30, regressor__min_samples_split=10, regressor__n_estimators=300; total time=32.2min
[CV] END regressor__max_depth=30, regressor__min_samples_split=10, regressor__n_estimators=300; total time=37.5min
[CV] END regressor__max_depth=30, regressor__min_samples_split=10, regressor__n_estimators=300; total time=47.5min
[CV] END regressor__max_depth=10, regressor__min_samples_split=5, regressor__n_estimators=200; total time=15.3min
[CV] END regressor__max_depth=10, regressor__min_samples_split=5, regressor__n_estimators=200; total time=17.8min
[CV] END regressor__max_depth=10, regressor__min_samples_split=5, regressor__n_estimators=200; total time=18.6min
[CV] END regressor__max_depth=20, regressor__min_samples_split=10, regressor__n_estimators=300; total time=36.7min
[CV] END regressor__max_depth=20, regressor__min_samples_split=10, regressor__n_estimators=300; total time=41.1min
[CV] END regressor__max

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_distributions,"{'regressor__max_depth': [None, 10, ...], 'regressor__min_samples_split': [2, 5, ...], 'regressor__n_estimators': [100, 200, ...]}"
,n_iter,5
,scoring,'r2'
,n_jobs,
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,transformers,"[('overview_emb', ...), ('title_emb', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,func,<function <la...0028F97F211C0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,func,<function <la...0028F97F20FE0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,30
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


# Model Evaluation

In [46]:
best_model = search.best_estimator_

y_pred = best_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Best Hyperparameters:", search.best_params_)
print(f"RMSE: {rmse:.2f}")
print(f"R2 Score: {r2:.2f}")


Best Hyperparameters: {'regressor__n_estimators': 300, 'regressor__min_samples_split': 10, 'regressor__max_depth': 30}
RMSE: 1.82
R2 Score: 0.31


# Save Model with JobLib

In [48]:
import dill

with open("vote_average_predictor.pkl", "wb") as f:
    dill.dump(best_model, f)


In [49]:
with open("vote_average_predictor.pkl", "rb") as f:
    best_model = dill.load(f)