<a href="https://colab.research.google.com/github/Shreya7931/Data-Science-Challenge/blob/main/Pedictive_ML_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score
import pandas as pd
import numpy as np

# Load data
data = pd.read_csv('/content/p1_movie_metadata (1).csv')

# Feature engineering
# Create interaction features
data['budget_duration_interaction'] = data['budget'] * data['duration']

# Genre Frequency
genre_frequency = data['genres'].str.get_dummies(sep='|').mean()
data['genre_frequency'] = data['genres'].apply(lambda x: sum([genre_frequency[g] for g in x.split('|')]) / len(x.split('|')))

# Director's Hit Rate
director_hit_rate = data.groupby('director_name')['imdb_score'].mean().fillna(0)
data['director_hit_rate'] = data['director_name'].map(director_hit_rate)

# Lead Actor's Fame
lead_actor_fame = (data['actor_1_name'].map(data.groupby('actor_1_name')['actor_1_facebook_likes'].mean()) +
                   data['actor_2_name'].map(data.groupby('actor_2_name')['actor_2_facebook_likes'].mean())) / 2
data['lead_actor_fame'] = lead_actor_fame.fillna(0)

# Budget-to-Gross Ratio
data['budget_to_gross_ratio'] = data['budget'] / data['gross']

# Handling missing values
numerical_columns = data.select_dtypes(include=['number']).columns
categorical_columns = data.select_dtypes(exclude=['number']).columns

data[numerical_columns] = data[numerical_columns].fillna(data[numerical_columns].median())
data[categorical_columns] = data[categorical_columns].fillna(data[categorical_columns].mode().iloc[0])

# Separate features and target variables
X = data.drop(['title_year', 'genres', 'movie_title', 'movie_imdb_link'], axis=1)
y_release_year = data['title_year']
y_genres = data['genres'].str.get_dummies(sep='|')

# Get the most probable genre for each movie
y_train_genres_labels = y_genres.idxmax(axis=1)

# Split data into train and test sets for release year prediction
X_train_release_year, X_test_release_year, y_train_release_year, y_test_release_year = train_test_split(
    X, y_release_year, test_size=0.2, random_state=42)

# Split data into train and test sets for genre prediction
X_train_genres, X_test_genres, y_train_genres_labels, y_test_genres_labels = train_test_split(
    X, y_train_genres_labels, test_size=0.2, random_state=42)

# Define numerical and categorical features
numerical_features = ['num_critic_for_reviews', 'duration', 'director_facebook_likes',
                      'actor_1_facebook_likes', 'gross', 'num_voted_users',
                      'cast_total_facebook_likes', 'facenumber_in_poster',
                      'num_user_for_reviews', 'budget', 'actor_2_facebook_likes',
                      'imdb_score', 'aspect_ratio', 'movie_facebook_likes',
                      'budget_duration_interaction', 'genre_frequency',
                      'director_hit_rate', 'lead_actor_fame', 'budget_to_gross_ratio']
categorical_features = ['color', 'language', 'country', 'content_rating']

# Preprocessing pipeline for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing pipeline for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

# Preprocess the data for release year prediction
X_train_release_year_preprocessed = preprocessor.fit_transform(X_train_release_year)
X_test_release_year_preprocessed = preprocessor.transform(X_test_release_year)

# Define XGBoost regressor for release year prediction
xgb_regressor = GradientBoostingRegressor()

# Fit XGBoost regressor for release year prediction
xgb_regressor.fit(X_train_release_year_preprocessed, y_train_release_year)

# Predict release year
release_year_predictions = xgb_regressor.predict(X_test_release_year_preprocessed)
release_year_mae = mean_absolute_error(y_test_release_year, release_year_predictions)
print("Mean Absolute Error (Release Year Prediction) - XGBoost Regressor:", release_year_mae)

# Preprocess the data for genre prediction
X_train_genres_preprocessed = preprocessor.fit_transform(X_train_genres)
X_test_genres_preprocessed = preprocessor.transform(X_test_genres)

# Define XGBoost classifier for genre prediction
xgb_classifier = GradientBoostingClassifier()

# Fit XGBoost classifier for genre prediction
xgb_classifier.fit(X_train_genres_preprocessed, y_train_genres_labels)

# Predict genres
genres_predictions = xgb_classifier.predict(X_test_genres_preprocessed)

# Evaluate accuracy
genres_accuracy = accuracy_score(y_test_genres_labels, genres_predictions)
print("Accuracy (Genres Prediction) - XGBoost Classifier:", genres_accuracy)


Mean Absolute Error (Release Year Prediction) - XGBoost Regressor: 4.685647034153494
Accuracy (Genres Prediction) - XGBoost Classifier: 0.7819623389494549
