In [3]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import joblib

# Class to store and save the model details along with evaluation metrics
class ModelObject:
    def __init__(self, model_name, model, params, best_params, evaluation_metrics, version):
        self.model_name = model_name
        self.model = model
        self.params = params
        self.best_params = best_params
        self.evaluation_metrics = evaluation_metrics
        self.version = version

    def log_details(self):
        log_message = f"Model: {self.model_name} (Version: {self.version})\n"
        log_message += f"Initial Parameters: {self.params}\n"
        log_message += f"Best Parameters after tuning: {self.best_params}\n"
        log_message += f"Evaluation Metrics: {self.evaluation_metrics}\n"
        return log_message

    def save(self, save_path):
        joblib.dump(self, save_path)
        print(f"Model saved at: {save_path}")

# Base Class for Dataset Handling
class Dataset:
    def __init__(self, num_users=1000, num_movies=500):
        self.num_users = num_users
        self.num_movies = num_movies
        self.data = None
        self.features = None
        self.target = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def generate_synthetic_data(self):
        print("Generating Synthetic Data...")
        # Generate Users
        user_ids = [f'u_{i}' for i in range(1, self.num_users + 1)]
        ages = np.random.randint(18, 70, size=self.num_users)
        genders = np.random.choice(['Male', 'Female', 'Other'], size=self.num_users, p=[0.48, 0.48, 0.04])

        users = pd.DataFrame({
            'user_id': user_ids,
            'age': ages,
            'gender': genders
        })

        # Generate Movies
        movie_ids = [f'm_{i}' for i in range(1, self.num_movies + 1)]
        genres = ['Action', 'Comedy', 'Drama', 'Horror', 'Romance', 'Sci-Fi', 'Documentary']
        movie_genres = np.random.choice(genres, size=self.num_movies)
        release_years = np.random.randint(1980, 2023, size=self.num_movies)

        movies = pd.DataFrame({
            'movie_id': movie_ids,
            'genre': movie_genres,
            'release_year': release_years
        })

        # Generate Ratings
        ratings = []
        for _ in range(self.num_users * 5):  # Each user rates ~5 movies on average
            user = users.sample(1).iloc[0]
            movie = movies.sample(1).iloc[0]

            # Base rating influenced by genre preference
            genre_preference = {
                'Action': 3,
                'Comedy': 3,
                'Drama': 3,
                'Horror': 2,
                'Romance': 2,
                'Sci-Fi': 3,
                'Documentary': 2
            }

            base_rating = genre_preference.get(movie['genre'], 3)

            # Adjust rating based on user's age
            if user['age'] < 25:
                if movie['genre'] in ['Action', 'Sci-Fi', 'Comedy']:
                    base_rating += 1
                elif movie['genre'] in ['Horror', 'Romance']:
                    base_rating -= 1
            elif user['age'] > 50:
                if movie['genre'] in ['Drama', 'Documentary', 'Romance']:
                    base_rating += 1
                elif movie['genre'] in ['Action', 'Sci-Fi']:
                    base_rating -= 1

            # Adjust rating based on user's gender
            if user['gender'] == 'Female':
                if movie['genre'] in ['Romance', 'Drama']:
                    base_rating += 1
                elif movie['genre'] in ['Horror', 'Action']:
                    base_rating -= 1
            elif user['gender'] == 'Male':
                if movie['genre'] in ['Action', 'Sci-Fi']:
                    base_rating += 1
                elif movie['genre'] in ['Romance', 'Documentary']:
                    base_rating -= 1

            # Add some noise
            rating = base_rating + np.random.normal(0, 0.5)
            rating = min(max(rating, 1), 5)  # Ratings between 1 and 5

            ratings.append({
                'user_id': user['user_id'],
                'movie_id': movie['movie_id'],
                'rating': round(rating, 1)
            })

        self.data = pd.DataFrame(ratings)
        print("Synthetic Data Generation Completed.")

    def preprocess(self):
        print("Preprocessing Data...")
        df = self.data.copy()

        # Check if 'gender' column exists
        print("Columns before encoding:", df.columns.tolist())
        
        # One-Hot Encode Categorical Features if they exist
        if 'gender' in df.columns:
            df = pd.get_dummies(df, columns=['gender'], drop_first=True)
        else:
            print("'gender' column not found in DataFrame.")

        # One-Hot Encode user_id and movie_id
        df = pd.get_dummies(df, columns=['user_id', 'movie_id'], drop_first=True)

        # Define features and target
        self.features = df.drop('rating', axis=1)
        self.target = df['rating']

        # Train-test split
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.features, self.target, test_size=0.2, random_state=42
        )
        print("Data Preprocessing Completed.")

# Base Class for Model Selection and Tuning
class ModelSelector:
    def __init__(self):
        self.models = {
            'RandomForest': RandomForestRegressor(),
            'KNN': KNeighborsRegressor()
        }
        self.best_model_object = None
        self.version = 1  # Versioning starts at 1

    def hyperparameter_tuning(self, model, param_grid, X_train, y_train):
        grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=1)
        grid_search.fit(X_train, y_train)
        return grid_search.best_estimator_, grid_search.best_params_

    def select_model(self, X_train, y_train, X_test, y_test):
        # Define parameter grids for each model
        param_grids = {
            'RandomForest': {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 7]},
            'KNN': {'n_neighbors': [3, 5, 10]}
        }

        best_score = float('inf')  # Initialize with infinity for regression
        for model_name, model in self.models.items():
            print(f"Tuning {model_name}...")
            tuned_model, best_params = self.hyperparameter_tuning(model, param_grids[model_name], X_train, y_train)
            
            # Evaluate on test data
            y_pred = tuned_model.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)

            print(f"{model_name} Test RMSE: {rmse}")

            # Save model object only if it is the best one
            if rmse < best_score:
                best_score = rmse
                self.best_model_object = ModelObject(
                    model_name=model_name,
                    model=tuned_model,
                    params=param_grids[model_name],
                    best_params=best_params,
                    evaluation_metrics={"rmse": rmse},
                    version=self.version
                )

        print(f"Best Model: {self.best_model_object.model_name} with RMSE: {self.best_model_object.evaluation_metrics['rmse']}")
        return self.best_model_object

    def save_best_model(self):
        if self.best_model_object:
            # Create the model's versioned file name
            save_path = f"models/{self.best_model_object.model_name}_v{self.version}.pkl"
            self.best_model_object.save(save_path)
            self.version += 1  # Increment the version for the next save

# Main AutoML Pipeline
class AutoMLPipeline:
    def __init__(self):
        self.dataset = Dataset()
        self.model_selector = ModelSelector()

    def run(self):
        # Generate and preprocess data
        print("Generating and Preprocessing Data...")
        self.dataset.generate_synthetic_data()
        self.dataset.preprocess()

        # Model Selection and Evaluation
        print("Selecting the best model...")
        best_model = self.model_selector.select_model(
            self.dataset.X_train, self.dataset.y_train, 
            self.dataset.X_test, self.dataset.y_test
        )

        # Save the best model with versioning
        self.model_selector.save_best_model()

# Run the AutoML pipeline
if __name__ == "__main__":
    # Create the models directory if it doesn't 
    # Create the models directory if it doesn't exist
    if not os.path.exists('models'):
        os.makedirs('models')

    # Execute the AutoML Pipeline
    pipeline = AutoMLPipeline()
    pipeline.run()


Generating and Preprocessing Data...
Generating Synthetic Data...
Synthetic Data Generation Completed.
Preprocessing Data...
Columns before encoding: ['user_id', 'movie_id', 'rating']
'gender' column not found in DataFrame.
Data Preprocessing Completed.
Selecting the best model...
Tuning RandomForest...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
RandomForest Test RMSE: 1.1417078882461078
Tuning KNN...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
KNN Test RMSE: 0.9841388113472611
Best Model: KNN with RMSE: 0.9841388113472611
Model saved at: models/KNN_v1.pkl
