In [2]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [3]:

# Load the data
# The CSV lives in the Kaggle input directory attached to this notebook.
csv_path = '/kaggle/input/movie-1/final_data.csv'
data = pd.read_csv(csv_path)


In [4]:
# Preprocessing steps
def safe_literal_eval(val):
    """Parse a stringified Python literal, falling back to an empty list.

    Parameters
    ----------
    val : object
        Typically a string such as ``"['Action', 'Drama']"``. Values that
        are already a list, tuple, set or dict are returned unchanged.

    Returns
    -------
    object
        The value produced by ``ast.literal_eval``, the input itself when
        it is already a container, or ``[]`` when the input cannot be
        parsed (for example ``None``, ``NaN`` or free text).
    """
    # Already-parsed containers are passed through so that applying this
    # function a second time (e.g. re-running the cell) does not replace
    # every value with an empty list.
    if isinstance(val, (list, tuple, set, dict)):
        return val
    try:
        return ast.literal_eval(val)
    # Only the errors literal_eval raises for bad input are caught, so
    # KeyboardInterrupt / SystemExit still propagate.
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return []

# Columns whose cells are parsed with safe_literal_eval below
list_columns = [
    'Genres',
    'Film makers',
    'Actors',
    'Earliest Release region',
]

# Replace each of those columns with its parsed counterpart, one at a time
for col in list_columns:
    parsed_column = data[col].apply(safe_literal_eval)
    data[col] = parsed_column

In [5]:
# Feature engineering
# Count features: the length of each parsed list column.
data['Num_Genres'] = data['Genres'].apply(len)
data['Num_Film_Makers'] = data['Film makers'].apply(len)
data['Num_Actors'] = data['Actors'].apply(len)
data['Num_Release_Regions'] = data['Earliest Release region'].apply(len)

# Running time: keep the first run of digits found in each value.
# - astype(str) makes the cell re-runnable (the column is numeric after the
#   first execution, and `.str` would otherwise raise).
# - the raw-string pattern avoids the invalid "\d" escape warning.
# - expand=False yields a Series, and to_numeric(errors='coerce') leaves
#   NaN for missing / digit-less values instead of raising in astype(int).
# NOTE(review): only the first number is used; if values look like
# "2 hr 10 min" this captures 2, not 130 - confirm the column's format.
running_time_text = data['Running Time'].astype(str)
data['Running Time'] = pd.to_numeric(
    running_time_text.str.extract(r'(\d+)', expand=False),
    errors='coerce',
)

# Convert 'Earliest Release date' to datetime and extract year, month, day.
# errors='coerce' turns unparseable dates into NaT, so the derived
# year/month/day values are NaN for those rows.
data['Earliest Release date'] = pd.to_datetime(data['Earliest Release date'], errors='coerce')
data['Release_Year'] = data['Earliest Release date'].dt.year
data['Release_Month'] = data['Earliest Release date'].dt.month
data['Release_Day'] = data['Earliest Release date'].dt.day


In [6]:
# Fill missing numeric values with 0.
# Restricted to numeric columns: a blanket fillna(0) would also write the
# integer 0 into string and datetime columns.
numeric_columns = data.select_dtypes(include='number').columns
data[numeric_columns] = data[numeric_columns].fillna(0)

# Categorical model inputs get an explicit string placeholder so each column
# keeps a single value type (OneHotEncoder rejects mixed str/int columns).
categorical_columns = ['Domestic Distributor', 'MPAA']
data[categorical_columns] = data[categorical_columns].fillna('Unknown')

# Define target and features
# NOTE(review): rows with a missing 'Worldwide' value are still treated as
# 0, as before - consider dropping them instead of training on them.
target = 'Worldwide'
features = ['Domestic Distributor', 'MPAA', 'Running Time', 'Num_Genres', 
            'Num_Film_Makers', 'Num_Actors', 'Num_Release_Regions', 
            'Release_Year', 'Release_Month', 'Release_Day']

X = data[features]
y = data[target]

In [7]:
# Column groups, split by the transformer each one is routed to
numeric_features = [
    'Running Time',
    'Num_Genres',
    'Num_Film_Makers',
    'Num_Actors',
    'Num_Release_Regions',
    'Release_Year',
    'Release_Month',
    'Release_Day',
]
categorical_features = ['Domestic Distributor', 'MPAA']

# Preprocessor: standardise the numeric group, one-hot encode the
# categorical group (categories unseen during fit are ignored at transform)
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Model: preprocessing followed by a seeded 100-tree random forest
forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', forest),
])


In [14]:



# Hold out 20% of the rows for evaluation; the seed fixes the shuffle
split_parts = train_test_split(X, y, test_size=0.2, random_state=39)
X_train, X_test, y_train, y_test = split_parts



In [15]:

# Fit the pipeline on the training split, then predict the held-out rows
# (Pipeline.fit returns the fitted pipeline, so the calls can be chained)
y_pred = model.fit(X_train, y_train).predict(X_test)

# Score the predictions: R-squared, and RMSE derived from the MSE
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report both metrics
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

Root Mean Squared Error: 164877991.02100205
R-squared: 0.20048221366664598
