In [1]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

ModuleNotFoundError: No module named 'pandas'

In [None]:

# Step 2: Load the dataset
# We use the provided 'IMDb Movies India.csv' file for our data.
# The file is looked up relative to the current working directory and decoded as latin-1.
try:
    df = pd.read_csv('IMDb Movies India.csv', encoding='latin-1')
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() is an interactive-shell helper that would
    # end the whole session (and report success) rather than fail just this cell.
    # Raising stops "Run All" here with a clear message and keeps the original
    # traceback attached as the cause.
    raise FileNotFoundError(
        "Error: 'IMDb Movies India.csv' not found. Please make sure the file is in the correct directory."
    ) from err

# Show a small preview so the reader can sanity-check the columns that were loaded.
print("Original DataFrame (first 5 rows):")
print(df.head())
print("-" * 50)


In [None]:
# Step 3: Data Preprocessing and Cleaning
# Real-world data often requires cleaning before it can be used for modeling.

# Define features (X) and target (y)
# We will use 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3' to predict 'Rating'.
features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
target = 'Rating'

# Drop the columns that won't be used as features, then drop every row that still has
# a missing value.
# - `df` is rebound to a new frame instead of being mutated in place, and
#   `errors='ignore'` makes the column drop a no-op when the columns are already gone,
#   so re-running this cell no longer raises KeyError.
# - A single dropna() covers all remaining columns, including the target 'Rating',
#   so a separate target-only dropna is not needed.
df = (
    df
    .drop(columns=['Name', 'Year', 'Duration', 'Votes'], errors='ignore')
    .dropna()
)

X = df[features]
y = df[target]

print(f"Dataset shape after cleaning: {df.shape}")
print("-" * 50)


In [None]:
# Step 4: Hold out a test set
# 20% of the rows are set aside so the model can later be scored on data it never saw
# during training; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 5: Build the preprocessing + model pipeline and fit it
# Bundling the encoder and the regressor in one Pipeline means the same
# preprocessing is applied at fit time and at predict time.

# Every feature we selected is categorical, so all of them get one-hot encoded.
categorical_features = features

# One-hot encoding turns each category into its own numeric indicator column.
# handle_unknown='ignore' lets prediction proceed when a category was not seen
# during fitting.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'onehot',
            OneHotEncoder(handle_unknown='ignore'),
            categorical_features,
        ),
    ],
)

# Chain the encoder with a random forest regressor (100 trees, fixed seed).
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
    ],
)

# Fit the whole pipeline on the training split.
print("Training the model...")
model.fit(X_train, y_train)
print("Model training complete.", "-" * 50, sep="\n")

In [None]:
# Step 6: Score the fitted pipeline on the held-out test split
print("Evaluating the model on the test set...")
y_pred = model.predict(X_test)

# Mean absolute error and R-squared of the predictions against the true ratings.
mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both metrics rounded to two decimals, followed by a separator line.
print(
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"R-squared (R²): {r2:.2f}",
    "-" * 50,
    sep="\n",
)

In [7]:


# Step 7: Predict the rating of a single hand-written movie
# The record below uses the same feature columns the pipeline was trained on;
# it is wrapped in a one-row DataFrame because the pipeline selects columns by name.
new_movie = pd.DataFrame([
    {
        'Genre': 'Action',
        'Director': 'Rohit Shetty',
        'Actor 1': 'Ajay Devgn',
        'Actor 2': 'Kareena Kapoor',
        'Actor 3': 'Arshad Warsi',
    },
])

# predict() returns one value per input row; we only have one row.
predicted_rating = model.predict(new_movie)

print(
    f"Features of the new movie: {new_movie.iloc[0].to_dict()}",
    f"Predicted rating for the new movie: {predicted_rating[0]:.2f}",
    "-" * 50,
    sep="\n",
)

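# This is a complete end-to-end example, but it is not dependency-free: pandas and
# scikit-learn must be installed, and 'IMDb Movies India.csv' must be in the working directory.
# To run it outside a notebook, save the code cells to a file and run it with `python your_file_name.py`.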


NameError: name 'pd' is not defined