<a href="https://colab.research.google.com/github/PrasadKasabe/CODSOFT/blob/main/movie_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Load the dataset
df = pd.read_csv('Movies_India.csv', encoding='latin-1')

# --- Data Preprocessing ---

# Drop rows where 'Rating' is missing (our target variable)
df.dropna(subset=['Rating'], inplace=True)

# Clean and convert 'Year', 'Duration', and 'Votes' columns to numeric:
#   'Year'     -> the four digits found inside parentheses
#   'Duration' -> the value with the ' min' suffix removed
#   'Votes'    -> the value with thousands separators (',') removed
# NOTE(review): 'Year' and 'Votes' are not imputed below; confirm they contain
# no NaN after conversion, or that the estimator in use tolerates NaN.
df['Year'] = df['Year'].str.extract(r'\((\d{4})\)').astype(float)
df['Duration'] = df['Duration'].str.replace(' min', '', regex=False).astype(float)
df['Votes'] = df['Votes'].str.replace(',', '', regex=False).astype(float)

# Impute missing 'Duration' values with the median.
# Assign the result back instead of calling fillna(inplace=True) on the
# df['Duration'] selection: the chained in-place form operates on an
# intermediate object and stops updating `df` in pandas 3.0.
df['Duration'] = df['Duration'].fillna(df['Duration'].median())

# Fill missing values in categorical columns with 'Unknown' (one assignment
# for all columns, again avoiding chained in-place fillna).
categorical_cols = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
df[categorical_cols] = df[categorical_cols].fillna('Unknown')

# Encode 'Genre' using MultiLabelBinarizer (for multiple genres per movie).
# Each value is split on ', ' into a list of genre labels; missing genres were
# already replaced with 'Unknown' above, so every row yields at least one label.
df['Genre'] = df['Genre'].apply(lambda x: str(x).split(', '))
mlb = MultiLabelBinarizer()
genre_encoded = pd.DataFrame(
    mlb.fit_transform(df['Genre']),
    columns=mlb.classes_,
    index=df.index,  # keep row alignment with df for the concat below
)
df = pd.concat([df, genre_encoded], axis=1)
df.drop('Genre', axis=1, inplace=True)

# Encode 'Director', 'Actor 1', 'Actor 2', 'Actor 3' using LabelEncoder
label_encoders = {}  # Store encoders for potential future use (e.g., predicting new movies)
for col in ['Director', 'Actor 1', 'Actor 2', 'Actor 3']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# --- Model Building and Evaluation ---

# Define features (X) and target (y)
X = df.drop(['Name', 'Rating'], axis=1)  # 'Name' is an identifier, 'Rating' is the target
y = df['Rating']

# Split the data into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print evaluation results
print("--- Model Evaluation Results ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R2): {r2:.4f}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Duration'].fillna(df['Duration'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)


--- Model Evaluation Results ---
Mean Absolute Error (MAE): 0.8187
Mean Squared Error (MSE): 1.1742
Root Mean Squared Error (RMSE): 1.0836
R-squared (R2): 0.3684
