In [1]:
#libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mount Google Drive at /content/drive so files stored there can be opened by path.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook anywhere else.
from google.colab import drive
drive.mount('/content/drive')

In [3]:
# Read the Rotten Tomatoes movies CSV from the mounted Drive folder into a DataFrame.
# The file is decoded as ISO-8859-1 instead of pandas' default UTF-8.
file_path = '/content/drive/MyDrive/ZOHO/Rotten_Tomatoes_Movies.csv'
data = pd.read_csv(filepath_or_buffer=file_path, encoding='ISO-8859-1')

In [None]:
# First look at the raw data: column dtypes and non-null counts, followed by
# the first five rows (shown as the cell's output because it is the last expression).
data.info()
data.head(n=5)


In [6]:
# Handle Missing Values
# Rows whose target is missing are dropped instead of imputed: filling
# 'audience_rating' with its mean would make the model train on, and be
# scored against, fabricated labels.
if 'audience_rating' in data.columns:
    data = data.dropna(subset=['audience_rating']).reset_index(drop=True)

# Mean-impute the numerical *feature* columns only (the target is excluded).
numerical_cols = data.select_dtypes(include=np.number).columns.drop(
    'audience_rating', errors='ignore'
)
# Skip columns without a single observed value: by default SimpleImputer drops
# such columns from its output, so the column-wise assignment below would fail
# with a shape mismatch.
has_observed_values = data[numerical_cols].notna().any().to_numpy(dtype=bool)
numerical_cols = numerical_cols[has_observed_values]

imputer = SimpleImputer(strategy='mean')
if len(numerical_cols) > 0:  # fit_transform rejects an empty column selection
    data[numerical_cols] = imputer.fit_transform(data[numerical_cols])

# NOTE(review): the fill values are learned from the full dataset before the
# train/test split further down, so test rows influence them. Fitting the imputer
# inside the model pipeline would remove that leakage.

In [7]:
# Encode categorical columns (if any)
# Each object-dtype column is mapped to integer codes by its own LabelEncoder.
# Values are normalised to strings first, with missing entries replaced by an
# explicit placeholder label, so a column that mixes text with NaN or numbers is
# always encoded the same way instead of depending on how the installed
# scikit-learn version treats mixed input.
# The fitted encoders are kept in `label_encoders` (keyed by column name) so the
# integer codes can later be mapped back with `inverse_transform`.
# NOTE(review): scikit-learn documents LabelEncoder for target labels; OrdinalEncoder
# is the feature-matrix equivalent if this step is ever moved into the pipeline.
MISSING_LABEL = '<missing>'
categorical_cols = data.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].fillna(MISSING_LABEL).astype(str))
    label_encoders[col] = le

In [None]:
# Sanity check after cleaning: print a header line followed by the first five rows.
print("\nCleaned Dataset:", data.head(), sep="\n")

In [9]:
# Separate the predictors (X) from the regression target (y).
# Fail early with a clear message when the target column is absent.
if 'audience_rating' not in data:
    raise ValueError("The target column 'audience_rating' is missing in the dataset!")

y = data['audience_rating']
X = data.drop('audience_rating', axis=1)  # every remaining column is used as a feature

In [10]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Model pipeline: standardise the features, then fit a 100-tree random forest
# regressor. The fixed random_state makes the forest reproducible.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42)),
])

In [None]:
# Fit the scaler and the random forest on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [13]:
# Predict audience ratings for the held-out test rows.
y_pred = pipeline.predict(X=X_test)

In [None]:
# Evaluate the Model
# RMSE is taken as the square root of the MSE instead of calling
# mean_squared_error(..., squared=False): the `squared` argument was deprecated
# in scikit-learn 1.4 and removed in 1.6, where passing it raises a TypeError.
# This form gives the same value on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\nModel Performance:")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R2 Score: {r2:.2f}")

In [None]:
# 5-fold cross-validation of the whole pipeline on the full X / y, scored with R2.
# Prints the per-fold scores and then their mean.
cv_scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring='r2', cv=5)
print(f"\nCross-Validation R2 Scores: {cv_scores}")
print(f"Average CV R2 Score: {cv_scores.mean():.2f}")

In [None]:
# Scatter the model's predictions against the true ratings of the test set.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, ax=ax)
ax.set_xlabel("Actual Audience Rating")
ax.set_ylabel("Predicted Audience Rating")
ax.set_title("Actual vs Predicted Audience Rating")
plt.show()