<font size=5>
<b>Machine Learning and Data Analytics</b><br>
<b>Semester 1, 2024-2025</b><br>
<b>Module code</b>: 6COSC017C<br>
<b>Lecturer</b>: Hamid Reza Khosravani (h.r.khosravani@wiut.uz)<br>
<b>CW1</b><br>
</font>

# Import Modules

In [None]:
from typing import List, Dict, Tuple, Optional, Any, Callable
import numpy as np
import pandas as pd

from copy import deepcopy
from collections import defaultdict

from matplotlib import pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objects as go

from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso, Ridge, ElasticNet
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

from sklearn.feature_selection import SelectKBest, mutual_info_regression, r_regression

## Visualization

In [None]:
def plot_bar(
    df: pd.DataFrame,
    x: str,
    y: str,
    title: Optional[str] = None,
    figsize: Optional[Tuple] = (10, 5),
    xlabel: Optional[str] = None,
    ylabel: Optional[str] = None,
    orient: str = "v",
    show_value_on_bar=False,
) -> "plt.Figure":
    """
    Plot a seaborn bar chart on a new matplotlib figure.

    Args:
        df: Dataframe containing the columns to plot
        x: Column associated with the X axis
        y: Column associated with the Y axis
        title: Title of the plot
        figsize: Tuple denoting the figure size
        xlabel: Label of the X axis; the seaborn default is kept when empty
        ylabel: Label of the Y axis; the seaborn default is kept when empty
        orient: Bar orientation passed through to ``sns.barplot``
        show_value_on_bar: Whether the bar values are written on the bars

    Returns:
        Figure containing the bar chart
    """
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)
    sns.barplot(data=df, x=x, y=y, ax=ax, orient=orient)
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=90)
    if show_value_on_bar:
        # Label every bar container, not only the first one, and do nothing
        # (instead of raising IndexError) when no bars were drawn.
        for container in ax.containers:
            ax.bar_label(container)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    return fig


# Load Data

In [None]:
playstore_df = pd.read_csv("./googleplaystore.csv")
print("Dataset Shape:", playstore_df.shape)  # Rows and Columns
print("\nDataset Head:")
display(playstore_df.head())  # First 5 rows of the dataset

# Check basic information about dataset(data type)

In [None]:
playstore_df.info()

In [None]:
# Note:
# The dataset has 10841 entries, but some columns have fewer non-null values. For example,
# Rating has 9367 values, which means there are missing values in this column. The same applies to Android Ver,
# Type, Current Ver and Content Rating.
# Three columns that are expected to be numeric (Size, Installs, Price) are stored as object.

# Get descriptive statistics of data

In [None]:
# Numerical column statistics
print(playstore_df.describe())

# Categorical column statistics
print(playstore_df.describe(include=['O']))

In [None]:
# We can see that there is only one numeric column (Rating); the others are categorical.
# Installs, Size, Price and Reviews will be converted to numeric.

## Unique Values Count

In [None]:
# Count unique values for each column
def print_unique_values(playstore_df):
    """Print, for every column of the frame, how many distinct non-null values it holds."""
    distinct_per_column = playstore_df.nunique()
    for name, distinct in distinct_per_column.items():
        print(f"Column '{name}' has {distinct} unique values")

print_unique_values(playstore_df)

## Convert columns to numeric(Installs, Price, Size)

### Start with Installs

In [None]:
# first of all we will see the unique values in the installs column
playstore_df.Installs.unique()

In [None]:
playstore_df.Installs.value_counts() # value_counts counts how many times each unique value occurs in the column
# Reference: https://pandas.pydata.org/docs/reference/api/pandas.Series.value_counts.html

In [None]:
# We need to remove the '+' and ',' characters to convert this column to numeric.
# 'Free' is not a valid install count; replace it with '0' (treated as no installs) so the column can be parsed.
playstore_df['Installs'] = playstore_df['Installs'].replace('Free', '0')

In [None]:
# let's check the missing values in column
playstore_df.Installs.isnull().sum()

In [None]:
# Strip the '+' suffix and the ',' thousands separators from the Installs column.
# Vectorised string methods replace the per-element lambdas; regex=False makes
# '+' a literal character rather than a regex quantifier.
playstore_df['Installs'] = (
    playstore_df['Installs']
    .str.replace('+', '', regex=False)
    .str.replace(',', '', regex=False)
)
playstore_df.Installs.head()

In [None]:
# The Installs values are now plain digit strings, so cast the column to integers
playstore_df['Installs'] = playstore_df['Installs'].astype('int64')
playstore_df.Installs.dtypes

## Size Column Conversion

In [None]:
# let's see first unique values in Size 
playstore_df.Size.unique()

In [None]:
# Here we have values in :
# Values with M
# Values with k
# Values with "Varies with device"

In [None]:
# Before going to convert it let's check if there is any missing values in Size column
playstore_df.Size.isnull().sum()

In [None]:
def convert_size(Size):
    """
    Convert a size label such as '19M' or '512k' to a number of megabytes.

    Args:
        Size: Raw value from the Size column

    Returns:
        A float in MB for labels ending in 'M' (taken as-is) or 'k'
        (divided by 1024); NaN for 'Varies with device'; the input unchanged
        for non-string values and for strings that cannot be parsed.
    """
    if not isinstance(Size, str):
        return Size
    label = Size.strip()
    if 'Varies with device' in label:
        return np.nan  # No single size is available for these apps
    # Match the unit on the suffix only: a substring test would send any text
    # that merely contains 'k' or 'M' (e.g. 'Unknown') into float() and crash.
    for suffix, units_per_mb in (('k', 1024.0), ('M', 1.0)):
        if label.endswith(suffix):
            try:
                return float(label[:-1]) / units_per_mb
            except ValueError:
                break  # Not a number followed by a unit; fall through
    return Size
playstore_df['Size'] = playstore_df['Size'].apply(convert_size)
playstore_df.head()

In [None]:
playstore_df.Size.value_counts()

In [None]:
playstore_df['Size'] = pd.to_numeric(playstore_df['Size'], errors='coerce')

## Price Column numeric conversion

In [None]:
# first of all, usually we need to check the unique values 
playstore_df.Price.unique()

In [None]:
playstore_df.Price.value_counts()

In [None]:
# We need to remove only dollar sign.

In [None]:
# Replace the non-price label 'Everyone' with '0', then work on the text form of every value
price_text = playstore_df['Price'].replace('Everyone', '0').astype(str)
# Drop the dollar sign and convert the Price column to a numeric (float) column
playstore_df['Price'] = price_text.str.replace('$', '', regex=False).astype(float)

In [None]:
playstore_df.Price.value_counts()

## Reviews Column numeric conversion

In [None]:
playstore_df[~playstore_df.Reviews.str.isnumeric()]

In [None]:
# Drop every row whose Reviews entry is not a plain integer string (the rows
# displayed by the previous cell) instead of relying on a hard-coded position.
# This is also safe to re-run: it never removes a valid row.
valid_reviews = playstore_df["Reviews"].astype(str).str.isnumeric()
playstore_df = playstore_df[valid_reviews].copy()

In [None]:
playstore_df["Reviews"] = playstore_df["Reviews"].astype(int)
playstore_df.info()

## Last Updated Column conversion

In [None]:
playstore_df['Last Updated'] = pd.to_datetime(playstore_df['Last Updated'])
playstore_df['Last Updated']

In [None]:
## create 2 different columns for last updated month and year and drop existing last updated column

In [None]:
playstore_df['Updated_Month']=playstore_df['Last Updated'].dt.month
playstore_df['Updated_Year']=playstore_df['Last Updated'].dt.year
playstore_df.drop('Last Updated', axis=1, inplace=True)
playstore_df.head()

## Basic Statistics after conversion

In [None]:
playstore_df.info()

## Check missing values and visualize

In [None]:
print("\nMissing Values:")
missing_values = playstore_df.isnull().sum()
print(missing_values)

# Visualize missing values
plt.figure(figsize=(12, 8))
sns.heatmap(playstore_df.isnull(), cbar=False, cmap='viridis')
plt.title("Missing Data Heatmap")
plt.show()

## Handle missing values before EDA and data preparation

### Fill missing values in Rating with median

In [None]:
rating_median = playstore_df['Rating'].median()
playstore_df['Rating'] = playstore_df['Rating'].fillna(rating_median)

### Fill missing values in Type with mode

In [None]:
type_mode = playstore_df['Type'].mode()[0]
playstore_df['Type'] = playstore_df['Type'].fillna(type_mode)

### Fill missing values in Content Rating with mode

In [None]:
content_rating_mode = playstore_df['Content Rating'].mode()[0]
playstore_df['Content Rating'] = playstore_df['Content Rating'].fillna(content_rating_mode)

### Replace missing values in 'Current Ver' with a placeholder 'Unknown'

In [None]:
playstore_df['Current Ver'] = playstore_df['Current Ver'].fillna('Unknown')

### Replace missing values in 'Android Ver' with a placeholder 'Varies with device'

In [None]:
playstore_df['Android Ver'] = playstore_df['Android Ver'].fillna('Varies with device')

### Replace missing values in 'Size' with the column median

In [None]:
playstore_df['Size'] = playstore_df['Size'].fillna(playstore_df['Size'].median())

### Check dataset that no missing values left

In [None]:
print("\nMissing Values After Imputation:")
print(playstore_df.isnull().sum())

## check and remove duplicates in dataset

In [None]:
duplicate = playstore_df.duplicated()
print(duplicate.sum())

In [None]:
playstore_df.drop_duplicates(inplace=True)
duplicate = playstore_df.duplicated()
print(duplicate.sum())

In [None]:
## Extract numerical and categorical features before splitting the data

In [None]:
num_features=[col for col in playstore_df.columns if playstore_df[col].dtype!='O']
num_features

In [None]:
cat_features=[col for col in playstore_df.columns if playstore_df[col].dtype=='O']
cat_features

## Outliers. Detection and Handling

In [None]:
# Detecting and Handling Outliers
def detect_outliers_iqr(data, column, factor=1.5):
    """
    Find the rows of ``data`` whose ``column`` value lies outside the IQR fences.

    Args:
        data: Dataframe to inspect
        column: Name of the numeric column to check
        factor: Multiple of the IQR added beyond Q1/Q3 to form the fences
            (1.5 is the usual boxplot rule)

    Returns:
        Tuple ``(outliers, lower_bound, upper_bound)`` where ``outliers`` is
        the sub-frame of rows strictly below ``lower_bound`` or strictly above
        ``upper_bound``.
    """
    values = data[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR
    outliers = data[(values < lower_bound) | (values > upper_bound)]
    return outliers, lower_bound, upper_bound

# Numeric columns inspected for outliers
numeric_columns = ['Rating', 'Reviews', 'Installs', 'Size']

# Report pass: compute the IQR fences once per column, print a summary and
# draw a boxplot. The fences are remembered for the capping pass below.
iqr_fences = {}
for col in numeric_columns:
    print(f"\nDetecting outliers in '{col}':")
    outliers, lower_bound, upper_bound = detect_outliers_iqr(playstore_df, col)
    iqr_fences[col] = (lower_bound, upper_bound)
    print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")
    print(f"Number of Outliers: {len(outliers)}")

    plt.figure(figsize=(10, 5))
    sns.boxplot(data=playstore_df, x=col)
    plt.title(f"Outliers in {col}")
    plt.show()

# Capping pass: pull every value outside the fences back onto the nearest fence
for col, (lower_bound, upper_bound) in iqr_fences.items():
    playstore_df[col] = playstore_df[col].clip(lower=lower_bound, upper=upper_bound)

### Check dataset after handling outliers

In [None]:
# Verify outliers have been handled
print("\nAfter handling outliers:")
for col in numeric_columns:
    print(f"{col}: Min = {playstore_df[col].min()}, Max = {playstore_df[col].max()}")

## Feature Engineering

In [None]:
features_removal = ['App', 'Genres', 'Current Ver', 'Android Ver']

## Data Splitting

In [None]:
target = 'Rating'
X = playstore_df.copy().drop(features_removal+[target], axis=1)
y = playstore_df.copy()[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [None]:
# Label encoding

In [None]:
# Fitted encoders keyed by column name (a plain dict: defaultdict() without a
# factory behaves exactly like one)
le_dict = {}
features_to_encode = X_train.select_dtypes(include=['category', 'object']).columns

for col in features_to_encode:
    le = LabelEncoder()

    # Learn the label vocabulary from both splits. Fitting on the train split
    # only makes le.transform raise ValueError for any category that happens to
    # appear only in the test split. When every test category is also in train,
    # the resulting codes are unchanged.
    le.fit(pd.concat([X_train[col], X_test[col]]))

    X_train[col] = le.transform(X_train[col])
    X_train[col] = X_train[col].astype('category') # Keep the encoded feature categorical in pandas

    X_test[col] = le.transform(X_test[col])
    X_test[col] = X_test[col].astype('category') # Keep the encoded feature categorical in pandas

    le_dict[col] = le # Saving the label encoder for individual features

## Normalization and Scaling

In [None]:
# Converting and adding "Updated_Month" to categorical features.
# features_to_encode is a pandas Index, so it is turned into a list first:
# Index + list is element-wise addition, not concatenation.
categorical_features = list(features_to_encode) + ['Updated_Month']
X_train['Updated_Month'] = X_train['Updated_Month'].astype('category')
X_test['Updated_Month'] = X_test['Updated_Month'].astype('category')

# Listing numeric features to scale
numeric_features = X_train.select_dtypes(exclude=['category', 'object']).columns

In [None]:
numeric_features

In [None]:
scaler = StandardScaler()

# Fitting and transforming the Training data
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
# X_train = scaler.fit_transform(X_train)

# Only transforming the Test data
X_test[numeric_features] = scaler.transform(X_test[numeric_features])
# X_test = scaler.transform(X_test)

## Numerical feature analysis for some columns

In [None]:
# Numerical Feature Analysis
numerical_features = ['Rating', 'Reviews', 'Size', 'Installs']

# Descriptive Statistics
print("\nDescriptive Statistics for Numerical Features:")
display(playstore_df[numerical_features].describe())

# Visualizations: Histograms and Boxplots
for feature in numerical_features:
    plt.figure(figsize=(12, 6))
    
    # Histogram
    plt.subplot(1, 2, 1)
    plt.hist(playstore_df[feature].dropna(), bins=20, color='skyblue', edgecolor='black')
    plt.title(f"Distribution of {feature}")
    plt.xlabel(feature)
    plt.ylabel("Frequency")
    
    # Boxplot
    plt.subplot(1, 2, 2)
    sns.boxplot(y=feature, data=playstore_df, color='lightblue')
    plt.title(f"Boxplot of {feature}")
    plt.ylabel(feature)
    
    plt.tight_layout()
    plt.show()

## Categorical Analysis(Type)

In [None]:
# Categorical Analysis: Distribution of App Type
playstore_df['Type'].value_counts().plot.pie(
    autopct='%1.1f%%', startangle=90, figsize=(8, 8), colors=['skyblue', 'lightcoral']
)
plt.title('Distribution of App Type (Free vs Paid)')
plt.ylabel('')
plt.show()


## Correlation Matrix(Relationship between variables)

In [None]:
# Filter numeric columns for correlation
numeric_columns = playstore_df.select_dtypes(include=['number']).columns

# Correlation Matrix
correlation_matrix = playstore_df[numeric_columns].corr()

# Heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

## Pairplot for numerical variables

In [None]:
# Ensure only numeric columns are selected
numerical_cols = ['Rating', 'Size', 'Installs', 'Reviews']
pairplot_data = playstore_df[numerical_cols].select_dtypes(include=['number'])

# Plot pairplot
sns.pairplot(pairplot_data, kind='scatter', plot_kws={'alpha': 0.6})
plt.suptitle('Pairplot of Numerical Variables', fontsize=16)
plt.tight_layout()
plt.show()

## Exploring Relationships(install by category, etc)

### Rating for App Types

In [None]:
# Boxplot of Ratings for Free vs Paid Apps
plt.figure(figsize=(8, 6))
sns.boxplot(x='Type', y='Rating', data=playstore_df, color='lightblue')  # Use color for a single color
plt.title('Rating Distribution by App Type')
plt.xlabel('App Type')
plt.ylabel('Rating')
plt.show()

## Average Rating by Genres

In [None]:
# Average Rating by Genres
rating_by_genres = playstore_df.groupby('Genres')['Rating'].mean().sort_values(ascending=False).head(10)
rating_by_genres.plot(kind='barh', color='mediumaquamarine', figsize=(10, 6), edgecolor='black')
plt.title('Top 10 Genres by Average Rating')
plt.xlabel('Average Rating')
plt.ylabel('Genre')
plt.gca().invert_yaxis()
plt.show()

## Splitting the data for training and testing

In [None]:
print(f"Training Set: {X_train.shape}, Testing Set: {X_test.shape}")

# Model Training

In [None]:
import matplotlib.pyplot as plt

# Function to plot actual vs predicted values
def plot_real_pred(y_real, y_pred, title="Actual vs Predicted Values"):
    """
    Scatter predicted values against actual values with a y = x reference line.

    Args:
        y_real: Array-like of actual target values
        y_pred: Array-like of predicted values, aligned with ``y_real``
        title: Title of the plot
    """
    # Convert up front so plain lists work as well as Series/ndarrays
    actual = np.asarray(y_real)
    predicted = np.asarray(y_pred)

    # Let the reference line cover both ranges, so predictions that fall
    # outside the range of the actual values can still be compared against it
    low = min(actual.min(), predicted.min())
    high = max(actual.max(), predicted.max())

    plt.figure(figsize=(10, 6))
    plt.scatter(actual, predicted, alpha=0.7, color='blue')
    plt.plot([low, high], [low, high], 'r--', lw=2)  # Diagonal = perfect predictions
    plt.title(title)
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.show()

In [None]:
# Initialize a DataFrame to store metrics
models = ['Linear', 'Tuned SGD', 'SGD with Feature Selection']
metrics = ['train_R2', 'test_R2', 'train_MAE', 'test_MAE', 'train_RMSE', 'test_RMSE']
df_metrics_reg = pd.DataFrame(index=models, columns=metrics)

## Model 1 - Linear Regression

In [None]:
# Scenario 1: Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)

### Prediction

In [None]:
y_train_pred_lr = lr.predict(X_train)
y_test_pred_lr = lr.predict(X_test)

### Metrics for Linear regression

In [None]:
# Store metrics for Linear Regression
df_metrics_reg.loc['Linear', 'train_R2'] = r2_score(y_train, y_train_pred_lr)
df_metrics_reg.loc['Linear', 'test_R2'] = r2_score(y_test, y_test_pred_lr)

df_metrics_reg.loc['Linear', 'train_MAE'] = mean_absolute_error(y_train, y_train_pred_lr)
df_metrics_reg.loc['Linear', 'test_MAE'] = mean_absolute_error(y_test, y_test_pred_lr)

# Calculate RMSE manually using np.sqrt
df_metrics_reg.loc['Linear', 'train_RMSE'] = np.sqrt(mean_squared_error(y_train, y_train_pred_lr))
df_metrics_reg.loc['Linear', 'test_RMSE'] = np.sqrt(mean_squared_error(y_test, y_test_pred_lr))

### Scenario 2 - Hyperparameter tuning

In [None]:
param_grid = {
    'penalty': ['l2', 'elasticnet'],  # Regularization techniques
    'alpha': [0.0001, 0.001],  # Regularization strength
    'max_iter': [1000, 5000]  # Iterations for convergence
}

# Use K-Fold Cross-Validation
cv = KFold(n_splits=5, shuffle=True, random_state=1)
grid_search = GridSearchCV(estimator=SGDRegressor(random_state=1), param_grid=param_grid, cv=cv, scoring="neg_mean_squared_error")

grid_search.fit(X_train, y_train)

In [None]:
# Get the best tuned model
best_sgd_model = deepcopy(grid_search.best_estimator_)
y_train_pred_sgd = best_sgd_model.predict(X_train)
y_test_pred_sgd = best_sgd_model.predict(X_test)

# Store metrics for Tuned SGD Regressor
df_metrics_reg.loc['Tuned SGD', 'train_R2'] = r2_score(y_train, y_train_pred_sgd)
df_metrics_reg.loc['Tuned SGD', 'test_R2'] = r2_score(y_test, y_test_pred_sgd)

df_metrics_reg.loc['Tuned SGD', 'train_MAE'] = mean_absolute_error(y_train, y_train_pred_sgd)
df_metrics_reg.loc['Tuned SGD', 'test_MAE'] = mean_absolute_error(y_test, y_test_pred_sgd)

# Calculate RMSE using np.sqrt
df_metrics_reg.loc['Tuned SGD', 'train_RMSE'] = np.sqrt(mean_squared_error(y_train, y_train_pred_sgd))
df_metrics_reg.loc['Tuned SGD', 'test_RMSE'] = np.sqrt(mean_squared_error(y_test, y_test_pred_sgd))

### Scenario 3 -  Feature Selection(SGD Regressor)

In [None]:
# r_regression returns the signed Pearson correlation, and SelectKBest keeps
# the highest scores, so strongly negative correlations would be ranked last.
# Score by the absolute value instead.
def abs_pearson_r(X, y):
    """Score each feature by the magnitude of its Pearson correlation with y."""
    return np.abs(r_regression(X, y))

# Keep a strict subset of the features: selecting all of them (k equal to the
# number of columns) would make this scenario identical to the tuned SGD above.
n_features_to_keep = min(6, X_train.shape[1])
feature_selector = SelectKBest(abs_pearson_r, k=n_features_to_keep)
feature_selector.fit(X_train, y_train)
print("Selected features:", list(X_train.columns[feature_selector.get_support()]))

X_train_selected = feature_selector.transform(X_train)
X_test_selected = feature_selector.transform(X_test)

# Perform grid search on the selected features
grid_search_selected = GridSearchCV(estimator=SGDRegressor(random_state=1), param_grid=param_grid, cv=cv, scoring="neg_mean_squared_error")
grid_search_selected.fit(X_train_selected, y_train)

# Get the best SGD model from grid search with feature selection
selected_sgd_model = deepcopy(grid_search_selected.best_estimator_)

# Predict on selected features
y_train_pred_fs = selected_sgd_model.predict(X_train_selected)
y_test_pred_fs = selected_sgd_model.predict(X_test_selected)

### Metrics for SGD

In [None]:
# Store metrics for SGD with Feature Selection
df_metrics_reg.loc['SGD with Feature Selection', 'train_R2'] = r2_score(y_train, y_train_pred_fs)
df_metrics_reg.loc['SGD with Feature Selection', 'test_R2'] = r2_score(y_test, y_test_pred_fs)

df_metrics_reg.loc['SGD with Feature Selection', 'train_MAE'] = mean_absolute_error(y_train, y_train_pred_fs)
df_metrics_reg.loc['SGD with Feature Selection', 'test_MAE'] = mean_absolute_error(y_test, y_test_pred_fs)

# Calculate RMSE using np.sqrt
df_metrics_reg.loc['SGD with Feature Selection', 'train_RMSE'] = np.sqrt(mean_squared_error(y_train, y_train_pred_fs))
df_metrics_reg.loc['SGD with Feature Selection', 'test_RMSE'] = np.sqrt(mean_squared_error(y_test, y_test_pred_fs))

### Visualize model evaluation metrics

In [None]:
# Display Model Performance Metrics
print("\nModel Performance Metrics:")
display(df_metrics_reg)

# Visualization for the final model (SGD with Feature Selection)
plot_real_pred(y_test, y_test_pred_fs, title="SGD with Feature Selection: Actual vs Predicted")

## Model 2 - Random Forest Regression

### Train the model

In [None]:
# Initialize the Random Forest Regressor with default parameters
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

### Prediction

In [None]:
# Make predictions on both the training and the test set
y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)

### Evaluate the model

In [None]:
train_r2_rf = r2_score(y_train, y_train_pred_rf)
test_r2_rf = r2_score(y_test, y_test_pred_rf)

train_mae_rf = mean_absolute_error(y_train, y_train_pred_rf)
test_mae_rf = mean_absolute_error(y_test, y_test_pred_rf)

train_rmse_rf = np.sqrt(mean_squared_error(y_train, y_train_pred_rf))
test_rmse_rf = np.sqrt(mean_squared_error(y_test, y_test_pred_rf))

### Display evaluation metrics

In [None]:
# Display evaluation metrics
print(f"Random Forest Regressor - Training Metrics:")
print(f"Train R2: {train_r2_rf:.4f}")
print(f"Test R2: {test_r2_rf:.4f}")
print(f"Train MAE: {train_mae_rf:.4f}")
print(f"Test MAE: {test_mae_rf:.4f}")
print(f"Train RMSE: {train_rmse_rf:.4f}")
print(f"Test RMSE: {test_rmse_rf:.4f}")

### Define important features for further analysis

In [None]:
# Get feature importance from the trained model
importances = rf_model.feature_importances_

# Create a DataFrame with feature names and their importance values
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Display the most important features
print("Feature Importance:")
display(feature_importance_df)

# Plot feature importances
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
plt.title('Feature Importance - Random Forest')
plt.show()

### Hyperparameter tuning

In [None]:
# Hyperparameter tuning using RandomizedSearchCV
param_dist = {
    "criterion": ["squared_error", "absolute_error", "friedman_mse"],
    "n_estimators": randint(100, 200),  # Range of number of trees
    "max_depth": [None, 10, 20],  # Range of tree depth
    "min_samples_split": randint(2, 6),  # Minimum number of samples to split node
    "min_samples_leaf": randint(1, 5),  # Minimum number of samples per leaf
    "max_features": ['sqrt', 'log2', None]  # Max features per split
}

# Initialize RandomizedSearchCV
grid_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    cv=3,  # Cross-validation folds
    n_iter=20,  # Number of iterations
    scoring='neg_mean_squared_error',  # Same criterion as the SGD and XGBoost searches
    n_jobs=-1,  # Use all available cores
    verbose=1,  # Show progress
    random_state=42  # Make the sampled parameter sets reproducible
)

# Fit the RandomizedSearchCV
grid_search.fit(X_train, y_train)

# Best parameters from RandomizedSearchCV
print(f"Best Parameters from RandomizedSearchCV: {grid_search.best_params_}")


In [None]:
# After tuning, I decided to train the best model found by grid search - AI usage

In [None]:
# Train the best model found by the grid search
best_rf_model = grid_search.best_estimator_

# Make predictions with the tuned model
y_train_pred_rf_tuned = best_rf_model.predict(X_train)
y_test_pred_rf_tuned = best_rf_model.predict(X_test)

### Evaluate tuned model

In [None]:
train_r2_rf_tuned = r2_score(y_train, y_train_pred_rf_tuned)
test_r2_rf_tuned = r2_score(y_test, y_test_pred_rf_tuned)

train_mae_rf_tuned = mean_absolute_error(y_train, y_train_pred_rf_tuned)
test_mae_rf_tuned = mean_absolute_error(y_test, y_test_pred_rf_tuned)

train_rmse_rf_tuned = np.sqrt(mean_squared_error(y_train, y_train_pred_rf_tuned))
test_rmse_rf_tuned = np.sqrt(mean_squared_error(y_test, y_test_pred_rf_tuned))

### Display evaluation metrics

In [None]:
# Display evaluation metrics for the tuned model
print(f"\nRandom Forest Regressor (Tuned) - Training Metrics:")
print(f"Train R2: {train_r2_rf_tuned:.4f}")
print(f"Test R2: {test_r2_rf_tuned:.4f}")
print(f"Train MAE: {train_mae_rf_tuned:.4f}")
print(f"Test MAE: {test_mae_rf_tuned:.4f}")
print(f"Train RMSE: {train_rmse_rf_tuned:.4f}")
print(f"Test RMSE: {test_rmse_rf_tuned:.4f}")

In [None]:
# Feature importance for the tuned model
importances_tuned = best_rf_model.feature_importances_

# Create a DataFrame for feature importance
feature_importance_df_tuned = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances_tuned
}).sort_values(by='Importance', ascending=False)

# Display the most important features
print("Feature Importance (Tuned):")
display(feature_importance_df_tuned)

# Plot feature importances for the tuned model
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df_tuned)
plt.title('Feature Importance - Tuned Random Forest')
plt.show()

## Model 3 - XGBoost regressor

### Label encoding

In [None]:
# Apply Label Encoding for categorical features so XGBoost receives plain integer columns
categorical_features = ['Category', 'Type', 'Content Rating', 'Updated_Month']

# Apply label encoding to each categorical feature
for col in categorical_features:
    # Use a fresh encoder per column and fit it on the values of both splits:
    # an encoder fitted on the train split only raises ValueError for any value
    # that occurs only in the test split.
    le = LabelEncoder()
    le.fit(np.concatenate([X_train[col].to_numpy(), X_test[col].to_numpy()]))
    X_train[col] = le.transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

In [None]:
# Define the parameter distributions for RandomizedSearchCV.
# scale_pos_weight is deliberately not searched: it balances positive and
# negative classes in binary classification and is not a tuning knob for a
# regression target.
param_dist = {
    "n_estimators": randint(100, 500),  # Number of boosting rounds (trees)
    "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],  # Step size
    "max_depth": randint(3, 15),  # Depth of each tree
    "min_child_weight": randint(1, 10),  # Minimum sum of instance weight in a child
    "subsample": [0.6, 0.7, 0.8, 0.9, 1.0],  # Fraction of samples
    "colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0],  # Fraction of features per tree
    "gamma": [0, 0.1, 0.2, 0.3, 0.4],  # Minimum loss reduction
}

### Model training

In [None]:
# Initialize the XGBoost Regressor
xgb_model = XGBRegressor(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=50,  # Number of iterations
    cv=5,  # Cross-validation folds
    scoring='neg_mean_squared_error',  # Use MSE for evaluation
    n_jobs=-1,  # Use all processors
    verbose=1,  # Show search progress
    random_state=42
)

# Fit the RandomizedSearchCV
random_search.fit(X_train, y_train)

In [None]:
# Get best parameter from random search 

In [None]:
# Get the best parameters from the random search
best_params = random_search.best_params_
print("Best Parameters:", best_params)


### Prediction

In [None]:
y_train_pred_xgb = random_search.best_estimator_.predict(X_train)
y_test_pred_xgb = random_search.best_estimator_.predict(X_test)

### Evaluate the model

In [None]:
train_r2_xgb = r2_score(y_train, y_train_pred_xgb)
test_r2_xgb = r2_score(y_test, y_test_pred_xgb)

train_mae_xgb = mean_absolute_error(y_train, y_train_pred_xgb)
test_mae_xgb = mean_absolute_error(y_test, y_test_pred_xgb)

train_rmse_xgb = np.sqrt(mean_squared_error(y_train, y_train_pred_xgb))
test_rmse_xgb = np.sqrt(mean_squared_error(y_test, y_test_pred_xgb))

In [None]:
# Evaluation metrics

In [None]:
# Print evaluation metrics
print(f"XGBoost Regressor - Training Metrics (after tuning):")
print(f"Train R2: {train_r2_xgb:.4f}")
print(f"Test R2: {test_r2_xgb:.4f}")
print(f"Train MAE: {train_mae_xgb:.4f}")
print(f"Test MAE: {test_mae_xgb:.4f}")
print(f"Train RMSE: {train_rmse_xgb:.4f}")
print(f"Test RMSE: {test_rmse_xgb:.4f}")

### Important features

In [None]:
# Feature importance visualization
importances = random_search.best_estimator_.feature_importances_

# Create a DataFrame for feature importances
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Display the most important features
print("Feature Importance:")
display(feature_importance_df)

# Plot feature importances
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
plt.title('Feature Importance - XGBoost (after tuning)')
plt.show()

## Compare models

In [None]:
# Apply Label Encoding for categorical features
categorical_features = ['Category', 'Type', 'Content Rating', 'Updated_Month']

# Apply label encoding to each categorical feature in both train and test sets
for col in categorical_features:
    # Fresh encoder per column, fitted on the values of both splits so that a
    # value present only in the test split cannot raise ValueError
    le = LabelEncoder()
    le.fit(np.concatenate([X_train[col].to_numpy(), X_test[col].to_numpy()]))
    X_train[col] = le.transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

# Convert data to numeric (in case there are any non-numeric columns after encoding)
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Initialize models
lr = LinearRegression()
rf = RandomForestRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)

# Define a function to calculate metrics with cross-validation
def evaluate_model_with_cv(model, X, y, cv_folds=5):
    """
    Cross-validate ``model`` and return its mean R2, MAE and RMSE.

    Args:
        model: Unfitted scikit-learn compatible regressor
        X: Feature matrix
        y: Target values
        cv_folds: Number of shuffled K-Fold splits (seeded with 42)

    Returns:
        Tuple ``(avg_r2, avg_mae, avg_rmse)`` averaged over the validation
        folds; MAE and RMSE are returned as positive values.
    """
    # Local import: cross_validate is not among the notebook's top-level imports
    from sklearn.model_selection import cross_validate

    cv = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

    # Score all three metrics in one pass so each fold is fitted once, instead
    # of refitting the model separately for every metric
    scores = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=('r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error'),
    )

    avg_r2 = np.mean(scores['test_r2'])
    # The error scorers are negated by scikit-learn; flip the sign back
    avg_mae = -np.mean(scores['test_neg_mean_absolute_error'])
    avg_rmse = -np.mean(scores['test_neg_root_mean_squared_error'])

    return avg_r2, avg_mae, avg_rmse

# Collect one row of cross-validated scores per model
results = []

# Run cross-validation for each model
for model in [lr, rf, xgb]:
    avg_r2, avg_mae, avg_rmse = evaluate_model_with_cv(model, X_train, y_train, cv_folds=5)
    results.append([model.__class__.__name__, avg_r2, avg_mae, avg_rmse])

# Convert to DataFrame for better readability. The scores are averages over
# the held-out validation folds, so they are labelled as CV scores rather
# than training scores.
comparison_df = pd.DataFrame(results, columns=['Model', 'Avg CV R2', 'Avg CV MAE', 'Avg CV RMSE'])

# Display results
print("Model Comparison with Cross-Validation:")
display(comparison_df)
