In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset
# `file_path` is a placeholder; point it at the real CSV before running.
file_path = 'your_dataset_path_here.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Check for missing values and data types.
# DataFrame.info() writes its summary directly to stdout and returns None, so it
# is called as its own statement after the header. Passing it to print() would
# emit the summary before the header and then print a stray "None".
print("Dataset Info:")
df.info()
print("Missing Values:\n", df.isnull().sum())

In [None]:
# Step 1: Data Preprocessing
# Fill missing values (if any) by propagating the last valid observation forward.
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated in recent pandas releases and emits a FutureWarning.
# Forward fill leaves NaNs in any leading rows that have no earlier value.
# NOTE(review): forward fill copies values from the preceding row; if rows are
# independent records, a per-column imputation may be more appropriate - confirm.
df = df.ffill()

In [None]:
# Column groups used as model inputs.
# NOTE(review): some names ('critics_conse', 'tomatometer_', 'runtime_in_n')
# look truncated - confirm they match the CSV header exactly.
categorical_cols = [
    'movie_title',
    'movie_info',
    'critics_conse',
    'rating',
    'genre',
    'directors',
    'writers',
    'cast',
    'studio_name',
]
numerical_cols = [
    'tomatometer',
    'tomatometer_',
    'tomatometer_audience_ratio',
    'runtime_in_n',
]

# Integer-encode each categorical column.
# One encoder per column is kept in `label_encoders` so the mapping can be
# reused or inverted later. Each column is cast to str before fitting and the
# encoded values overwrite the original column of `df` in place.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col].astype(str))

# Step 2: Separate the target from the feature matrix
# Features are the categorical columns followed by the numerical columns.
y = df['audience_rating']
X = df[[*categorical_cols, *numerical_cols]]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Step 3: Build a Pipeline
# Preprocessing stage: standardise the numerical columns and pass every
# remaining column through unchanged.
preprocessor = ColumnTransformer(
    [('num', StandardScaler(), numerical_cols)],
    remainder='passthrough',
)

# Full model: preprocessing followed by a random-forest regressor
# (100 trees, fixed seed for reproducibility).
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42, n_estimators=100)),
])

In [None]:
# Step 4: Train the Model
# Fits the preprocessing step and the regressor on the training split only.
# Left as the final expression so the notebook displays the fitted pipeline.
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Step 5: Validate the Model
# Predict the target for the held-out test rows.
y_pred = pipeline.predict(X=X_test)

In [None]:
# Evaluation metrics on the held-out test split.
# RMSE is derived from MSE so it is reported in the same units as the target.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One print call, one line per metric (newline-separated).
print(
    "Model Performance:",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R-Squared (R2): {r2:.2f}",
    sep="\n",
)


In [None]:
# Step 6: Cross-Validation
# 5-fold cross-validation of the whole pipeline, scored with R2.
# This runs on the full X / y, so the folds include the rows that were held out
# as the test split.
cv_scores = cross_val_score(
    pipeline,
    X,
    y,
    scoring='r2',
    cv=5,
)
print(f"Cross-Validation R2 Scores: {cv_scores}")
print(f"Mean CV R2 Score: {cv_scores.mean():.2f}")
