In [None]:
import os
import warnings

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")

In [None]:
# Load the raw Rotten Tomatoes movie table and preview its first rows.
# The CSV location can be overridden with the ROTTEN_TOMATOES_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original path is used unchanged.
DATA_PATH = os.environ.get(
    'ROTTEN_TOMATOES_CSV',
    '/Users/sourabhpandey/Desktop/ML_Assignment/data/Rotten_Tomatoes_Movies3.csv',
)
df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')
df.head()

In [None]:
# Print the column names, non-null counts and dtypes of the dataset
df.info()

In [None]:
# Get summary statistics of the dataset
df.describe()

In [None]:
# dtype of every column
df.dtypes

In [None]:
# (rows, columns) of the frame as loaded
df.shape

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
df.dropna(subset=['audience_rating'], inplace=True)

In [None]:
df.isna().sum()

In [None]:
df.drop('critics_consensus', axis=1, inplace=True)

In [None]:
df.dropna(inplace=True, axis=0)

In [None]:
# (rows, columns) remaining after the drops above
df.shape

In [None]:
# Missing values per column, largest count first
df.isnull().sum().sort_values(ascending=False)

In [None]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# dtype of every remaining column
df.dtypes

In [None]:
# Names of the remaining columns
df.columns

In [None]:
# Outlier screening on the numeric rating / runtime columns.
#
# This cell works on the `df` prepared by the cells above. It previously re-read
# the CSV here, which silently discarded the missing-value handling already
# applied to `df` and pushed rows containing NaN into every later cell.


def plot_boxplots(frame, columns, title):
    """Draw one box plot per entry of `columns` on a 2x2 grid under `title`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding the columns to plot.
    columns : list of str
        Column names to plot; at most four fit on the grid.
    title : str
        Figure-level title.
    """
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    fig.suptitle(title, fontsize=16)
    panels = list(axes.flat)
    for ax, column in zip(panels, columns):
        frame.boxplot(column=[column], ax=ax)
        ax.set_title(f'{column}')
    # Hide grid slots that received no column.
    for ax in panels[len(columns):]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()


# Set the dark_background style (applies to every figure drawn afterwards)
plt.style.use('dark_background')

# Numeric columns screened for outliers
numerical_columns = ['runtime_in_minutes', 'tomatometer_rating', 'audience_rating', 'tomatometer_count']

plot_boxplots(df, numerical_columns, 'Box Plots Before Removing Outliers')

# Column-wise z-scores: (value - column mean) / column standard deviation
numeric_values = df[numerical_columns]
z_scores = (numeric_values - numeric_values.mean()) / numeric_values.std()

# A row is an outlier when any of its z-scores exceeds this magnitude
z_score_threshold = 2

# Boolean mask computed once and reused for both the report and the filter.
# A NaN z-score compares False, so such a row is not flagged by that column.
is_outlier_row = (z_scores.abs() > z_score_threshold).any(axis=1)
outliers = z_scores[is_outlier_row]

# Display the number of rows with outliers before removal
print("Number of rows with outliers before removal:", len(outliers))

# Keep only the rows that were not flagged.
# NOTE(review): no later cell visible here reads `df_cleaned`; the modelling
# cells continue to use `df` - confirm that is intended.
df_cleaned = df[~is_outlier_row]

plot_boxplots(df_cleaned, numerical_columns, 'Box Plots After Removing Outliers')

# Display the number of rows after removing outliers
print("Number of rows after removing outliers:", len(df_cleaned))

## Univariate Analysis

In [None]:
# Histogram of the target variable with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['audience_rating'], kde=True, color='blue', ax=ax)
ax.set_title('Audience Rating Distribution')
ax.set_xlabel('Audience Rating')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Box plot of audience rating for each value of the `genre` column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='genre', y='audience_rating', ax=ax)
ax.set_title('Audience Rating vs Genre')
ax.set_xlabel('Genre')
ax.set_ylabel('Audience Rating')
# Vertical tick labels so the category names do not overlap.
ax.tick_params(axis='x', labelrotation=90)
plt.show()


In [None]:
# Scatter plot: runtime (x) against audience rating (y).
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='runtime_in_minutes', y='audience_rating', color='green', ax=ax)
ax.set_title('Runtime vs Audience Rating')
ax.set_xlabel('Runtime in Minutes')
ax.set_ylabel('Audience Rating')
plt.show()

In [None]:
# Scatter plot: tomatometer rating (x) against audience rating (y).
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='tomatometer_rating', y='audience_rating', color='red', ax=ax)
ax.set_title('Tomatometer Rating vs Audience Rating')
ax.set_xlabel('Tomatometer Rating')
ax.set_ylabel('Audience Rating')
plt.show()

## Bivariate Analysis

In [None]:
# Audience rating against tomatometer rating: scatter plot, then the
# correlation coefficient between the two columns.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='tomatometer_rating', y='audience_rating', color='purple', ax=ax)
ax.set_title('Audience Rating vs Tomatometer Rating')
ax.set_xlabel('Tomatometer Rating')
ax.set_ylabel('Audience Rating')
plt.show()

# Series.corr with its default method
tomatometer_scores = df['tomatometer_rating']
correlation = tomatometer_scores.corr(df['audience_rating'])
print(f'Correlation between Tomatometer Rating and Audience Rating: {correlation}')


In [None]:
# Audience rating per studio: one box per `studio_name` value, followed by
# the mean rating of each studio.
# NOTE(review): studio_name may have many distinct values - confirm the
# figure stays readable.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='studio_name', y='audience_rating', ax=ax)
ax.set_title('Audience Rating vs Studio')
ax.set_xlabel('Studio')
ax.set_ylabel('Audience Rating')
ax.tick_params(axis='x', labelrotation=90)
plt.show()

# Mean audience rating for every studio
ratings_by_studio = df.groupby('studio_name')['audience_rating']
studio_rating = ratings_by_studio.mean()
print("Average Audience Rating by Studio:")
print(studio_rating)


In [None]:
# Audience rating against runtime: scatter plot, then the correlation
# coefficient between the two columns.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='runtime_in_minutes', y='audience_rating', color='orange', ax=ax)
ax.set_title('Audience Rating vs Runtime')
ax.set_xlabel('Runtime in Minutes')
ax.set_ylabel('Audience Rating')
plt.show()

# Series.corr with its default method
runtimes = df['runtime_in_minutes']
runtime_correlation = runtimes.corr(df['audience_rating'])
print(f'Correlation between Runtime and Audience Rating: {runtime_correlation}')

## Multivariate Analysis

In [None]:
# 1. Correlation matrix of the numeric columns, printed and drawn as a heatmap
print("Correlation Matrix:")

# `numeric_columns` is the numeric sub-frame of df (a DataFrame, not a list of names)
numeric_columns = df.select_dtypes(include=['number'])
correlation_matrix = numeric_columns.corr()

print(correlation_matrix)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# 2. Pairplot of the same numeric columns
print("\nPairplot Variables:")
print(numeric_columns.columns.tolist())

# `palette` only takes effect together with `hue`; no hue variable is used
# here, so the argument was dropped rather than left as a silent no-op.
sns.pairplot(numeric_columns, diag_kind='kde', kind='scatter')
plt.show()

## Data Preprocessing

In [None]:
# Split the frame into the prediction target (y) and everything else (X).
target_column = 'audience_rating'
y = df[target_column]
X = df.drop(target_column, axis=1)

# Sanity check: both shapes should report the same number of rows
print("Shape of independent variables (X):", X.shape)
print("Shape of target variable (y):", y.shape)

In [None]:
pip install statsmodels

In [None]:
# Standardise the numeric feature columns and put them back next to the
# non-numeric ones. StandardScaler is imported in the notebook's import cell;
# its absence there caused the NameError this cell used to end with.
numerical_columns = X.select_dtypes(include=['number']).columns

scaler = StandardScaler()

# fit_transform returns a plain array, so the row labels are lost here
X_scaled = scaler.fit_transform(X[numerical_columns])

# Reattach X's own index: concat aligns on row labels, and a fresh 0..n-1
# index would pair the wrong rows (and introduce NaN) once rows of df have
# been dropped.
X_scaled_df = pd.DataFrame(X_scaled, columns=numerical_columns, index=X.index)

# Non-numeric columns of X followed by the scaled numeric columns
X_final_scaled = pd.concat([X.drop(columns=numerical_columns), X_scaled_df], axis=1)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# Numeric features only, with infinities turned into NaN and every row that
# contains a NaN removed.
X_numeric = (
    X.select_dtypes(include=['number'])
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Add the constant column used for the statsmodels VIF calculation
X_numeric = sm.add_constant(X_numeric)

# One VIF per column of the design matrix (the added constant included)
design_matrix = X_numeric.values
vif_data = pd.DataFrame({
    "Feature": X_numeric.columns,
    "VIF": [
        variance_inflation_factor(design_matrix, position)
        for position in range(design_matrix.shape[1])
    ],
})

print("Variance Inflation Factor (VIF) for each feature:")
print(vif_data)

## One-hot encode the categorical variables

In [None]:
# One-hot encode every object-dtype column of X into a dense 0/1 matrix.
# NOTE(review): all object columns are encoded, including any free-text or
# high-cardinality ones - confirm the resulting width is acceptable.
ohe = OneHotEncoder(sparse_output=False)

# Categorical columns are taken to be the object-dtype columns
categorical_columns = X.select_dtypes(include=['object']).columns

X_categorical_encoded = ohe.fit_transform(X[categorical_columns])

# The encoder builds the "<column>_<category>" names itself; this replaces the
# hand-written nested loop that produced the same names.
feature_names = list(ohe.get_feature_names_out(categorical_columns))

# Encoded features as a DataFrame (built once; the duplicated statement was removed)
X_categorical_encoded_df = pd.DataFrame(X_categorical_encoded, columns=feature_names)
X_categorical_encoded_df.head()

In [None]:
# Rebuild X / y from df and min-max scale the numeric feature columns.
# MinMaxScaler and pandas come from the notebook's import cell.
y = df['audience_rating']
X = df.drop('audience_rating', axis=1)

# Numeric feature columns (the target is already excluded from X)
numerical_columns = X.select_dtypes(include=['number']).columns

# Fit the scaler on the numeric columns and transform them in one step
scaler = MinMaxScaler()
numeric_features = X[numerical_columns]
X_numeric_scaled = scaler.fit_transform(numeric_features)

# Wrap the scaled array in a DataFrame carrying the original column names
X_numeric_scaled_df = pd.DataFrame(data=X_numeric_scaled, columns=numerical_columns)

print(X_numeric_scaled_df.head())

In [None]:
# Final feature matrix: scaled numeric columns followed by the one-hot columns.
feature_blocks = [X_numeric_scaled_df, X_categorical_encoded_df]
X_final = pd.concat(feature_blocks, axis='columns')
X_final.head()

In [None]:
# Min-max scale the target column 'audience_rating'.
# MinMaxScaler and pandas come from the notebook's import cell.
# Note: `scaler` is rebound here, so from this point on it refers to the
# target scaler rather than the feature scaler created earlier.
scaler = MinMaxScaler()

# MinMaxScaler expects a 2-D array, hence the single-column reshape
target_values = df['audience_rating'].values.reshape(-1, 1)
y_scaled = scaler.fit_transform(target_values)

# `y` becomes a one-column DataFrame holding the scaled target
y = pd.DataFrame(data=y_scaled, columns=['audience_rating'])

print(y.head())

## Train-Test Split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.3,
    random_state=42,
)

## Modeling
#### Baseline Model - Linear Regression

In [34]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares fitted on the training split
# (fit returns the estimator itself, so construction and fitting can be chained).
lr_model = LinearRegression().fit(X_train, y_train)

# Predictions for both splits, used by the evaluation cell that follows
y_pred_train = lr_model.predict(X_train)
y_pred_test = lr_model.predict(X_test)

In [None]:
# Score the current predictions on both splits: mean squared error and R^2.
r2_train = r2_score(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)

print(
    f"Test Mean Squared Error : {mse_test}",
    f"Test R-squared (R^2) Score: {r2_test}",
    sep="\n",
)

print(
    f"Train Mean Squared Error : {mse_train}",
    f"Train R-squared (R^2) Score: {r2_train}",
    sep="\n",
)

In [None]:
# Actual against predicted target values on the test split.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred_test, alpha=0.5, color='blue', label='Actual vs Predicted (Test Data)')
ax.set_title('Actual vs Predicted Values')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.legend()
ax.grid(True)

plt.show()

## Second Model: Random Forest Model

In [None]:
# Random forest regressor with a fixed seed so results are repeatable
rf_model = RandomForestRegressor(random_state=42)

# The target built earlier in the notebook is a one-column DataFrame; the
# estimator expects a 1-D target, so flatten it instead of relying on an
# implicit conversion (np.ravel also leaves a Series or 1-D array 1-D).
rf_model.fit(X_train, np.ravel(y_train))

# Predictions for both splits, used by the evaluation cell that follows
y_pred_test = rf_model.predict(X_test)
y_pred_train = rf_model.predict(X_train)

In [None]:
# Score the random forest predictions on both splits: mean squared error and R^2.
r2_train = r2_score(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)

print(
    "Random Forest Regressor Model Evaluation:",
    f"Test Mean Squared Error: {mse_test}",
    f"Test R-squared (R^2) Score: {r2_test}",
    f"Train Mean Squared Error: {mse_train}",
    f"Train R-squared (R^2) Score: {r2_train}",
    sep="\n",
)

## Third Model: Gradient Boosting Regressor

In [None]:
# Gradient boosting regressor with a fixed seed so results are repeatable
gb_model = GradientBoostingRegressor(random_state=42)

# The target built earlier in the notebook is a one-column DataFrame; the
# estimator expects a 1-D target, so flatten it instead of relying on an
# implicit conversion (np.ravel also leaves a Series or 1-D array 1-D).
gb_model.fit(X_train, np.ravel(y_train))

# Predictions for both splits, used by the evaluation cell that follows
y_pred_test = gb_model.predict(X_test)
y_pred_train = gb_model.predict(X_train)

In [None]:
# Score the gradient boosting predictions on both splits: mean squared error and R^2.
r2_train = r2_score(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)

print(
    "Gradient Boosting Regressor Model Evaluation:",
    f"Test Mean Squared Error: {mse_test}",
    f"Test R-squared (R^2) Score: {r2_test}",
    f"Train Mean Squared Error: {mse_train}",
    f"Train R-squared (R^2) Score: {r2_train}",
    sep="\n",
)