<a href="https://colab.research.google.com/github/Prawin2005/CODSOFT-internship/blob/main/TASK_2_(Movie).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Function to load data with encoding handling
def load_data(file_path, encodings=None):
    """Read a CSV file into a DataFrame, trying several text encodings.

    The encodings are attempted in order; the first one that decodes the
    file without a ``UnicodeDecodeError`` wins. A progress line is printed
    for every attempt.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        encodings: Optional iterable of encoding names to try, in order.
            Defaults to ``('utf-8', 'ISO-8859-1', 'cp1252')``.

    Returns:
        The ``pandas.DataFrame`` parsed with the first working encoding.

    Raises:
        ValueError: If none of the encodings can decode the file.
    """
    if encodings is None:
        # NOTE: ISO-8859-1 maps every possible byte, so it never raises
        # UnicodeDecodeError; any entry listed after it (cp1252 here) is
        # only reached if the caller supplies a different order.
        encodings = ('utf-8', 'ISO-8859-1', 'cp1252')

    for encoding in encodings:
        try:
            data = pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError:
            print(f"Failed to load file with encoding: {encoding}")
        else:
            print(f"File loaded successfully with encoding: {encoding}")
            return data
    raise ValueError("Failed to load file with any of the provided encodings.")

# Load the dataset
file_path = '/content/sample_data/IMDb Movies India.csv'  # Replace with your dataset file path
data = load_data(file_path)

# Display the first few rows to understand the structure
print(data.head())

# Data cleaning and type conversion
# Strip any leading/trailing whitespace characters from columns
data.columns = data.columns.str.strip()

# 'Year' is stored like "(2019)" and 'Duration' like "109 min":
# keep only the digits and convert them to floats (NaN when no digits match).
data['Year'] = data['Year'].str.extract(r'(\d{4})').astype(float)
data['Duration'] = data['Duration'].str.extract(r'(\d+)').astype(float)

# 'Votes' is read as text. Remove thousands separators (e.g. "1,086") before
# converting; otherwise errors='coerce' would turn those counts into NaN and
# the rows would be discarded below. Anything still non-numeric becomes NaN.
data['Votes'] = pd.to_numeric(
    data['Votes'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)

# Drop rows with missing values in essential columns
data.dropna(subset=['Year', 'Duration', 'Votes', 'Rating'], inplace=True)

# Features and target variable
X = data[['Year', 'Duration', 'Genre', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']]
y = data['Rating']

# Column groups, split by the kind of preprocessing each one receives
numeric_features = ['Year', 'Duration', 'Votes']
categorical_features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

# Numeric columns are standardised
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns are one-hot encoded; handle_unknown='ignore' lets the
# encoder accept categories at predict time that were not seen during fit
categorical_transformer = Pipeline(
    [('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each column group to its transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by a random forest regressor
# (fixed random_state so repeated runs build the same forest)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# Check if dataset is empty before splitting
if X.shape[0] == 0:
    raise ValueError("The dataset is empty after cleaning. Check data preprocessing.")

# Split the data into training and testing sets (80% train / 20% test,
# fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
try:
    pipeline.fit(X_train, y_train)
except KeyboardInterrupt:
    print("Model training was interrupted. Check if the dataset is too large or if there are other issues.")
    # Re-raise: the pipeline is not fitted at this point, so carrying on to
    # predict() would only fail later with a less helpful error.
    raise

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Print the true and predicted rating for every row of the test set
print("Predictions:")
for true_rating, predicted_rating in zip(y_test, y_pred):
    print(f"True Rating: {true_rating}, Predicted Rating: {predicted_rating}")


Failed to load file with encoding: utf-8
File loaded successfully with encoding: ISO-8859-1
                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3