In [None]:
# Import necessary libraries
import pickle
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the ratings data ('u.data' is expected in the current working directory).
# The file is tab-separated and read without a header row, so column names are supplied here.
ratings = pd.read_csv('u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])

# Load the movies data ('u.item' is expected in the current working directory).
# The first five columns are movie metadata; everything from 'unknown' onwards
# (item_cols[5:]) is a genre column that is later used as a model feature.
item_cols = ['item_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown',
             'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama',
             'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
             'War', 'Western']
movies = pd.read_csv('u.item', sep='|', names=item_cols, encoding='ISO-8859-1')

# Merge the datasets on 'item_id', keeping only the title and the genre columns from `movies`
data = pd.merge(ratings, movies[['item_id', 'movie_title'] + item_cols[5:]], on='item_id')

# Check for missing values; any that exist are replaced with 0 after a warning
if data.isnull().sum().any():
    print("Warning: Missing values detected and will be filled.")
    data = data.fillna(0)

# Encode 'movie_title' as integer labels using LabelEncoder
le_title = LabelEncoder()
data['movie_title_encoded'] = le_title.fit_transform(data['movie_title'])

# Save the LabelEncoder so titles can be mapped to the same integer codes later
with open('title_encoder.pkl', 'wb') as file:
    pickle.dump(le_title, file)

# Rescale 'user_id' and 'movie_title_encoded' with MinMaxScaler (fit on the full merged dataset).
# NOTE: tree-based models such as RandomForest split on thresholds, so this rescaling is not
# expected to change model quality. It is kept so the feature values stay the same as before.
scaler = MinMaxScaler()
data[['user_id', 'movie_title_encoded']] = scaler.fit_transform(data[['user_id', 'movie_title_encoded']])

# Save the fitted scaler as well: the model is trained on the scaled values, so new inputs
# must go through this exact scaler (after the title encoder) before calling predict().
with open('feature_scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

# Define features and target variable
features = ['user_id', 'movie_title_encoded'] + item_cols[5:]
target = 'rating'



In [15]:
# Split the data into training and testing sets.
# 20% of the rows are held out for the final evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data[features], data[target], test_size=0.2, random_state=42
)

# Initialize and optimize a RandomForestRegressor with GridSearchCV for better hyperparameters.
# The grid has 3 x 3 x 3 = 27 candidates, each fitted on 3 cross-validation folds (81 fits),
# followed by a refit of the best candidate on the whole training set.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}
# The candidate fits are independent of each other, so n_jobs=-1 spreads them over all
# available CPU cores. The estimator's random_state is fixed, so this should not change
# which model is selected. No `scoring` is passed, so candidates are ranked by the
# estimator's default score.
model = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, n_jobs=-1)
model.fit(X_train, y_train)

# Evaluate the model.
# The value printed as 'Best RMSE' is the RMSE of the selected estimator on the held-out
# test set, not a cross-validation score from the search itself.
best_model = model.best_estimator_
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Best RMSE:', rmse)
print('Best Parameters:', model.best_params_)

Best RMSE: 1.0254300123337268
Best Parameters: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 200}


In [None]:
# Persist the tuned estimator to disk so it can be reused without retraining
with open('optimized_movie_rating_model.pkl', 'wb') as model_out:
    pickle.dump(best_model, model_out)

# Round-trip check: read the estimator back and score it on the same held-out data.
# (pickle.load should only ever be used on files from a trusted source, such as the
# file written just above.)
with open('optimized_movie_rating_model.pkl', 'rb') as model_in:
    loaded_model = pickle.load(model_in)

y_pred_loaded = loaded_model.predict(X_test)
mse_loaded = mean_squared_error(y_test, y_pred_loaded)
rmse_loaded = np.sqrt(mse_loaded)
print('RMSE of loaded model:', rmse_loaded)

RMSE of loaded model: 1.0254300123337268


: 