In [None]:
# 1. Import necessary libraries
import pandas as pd                # For data manipulation and analysis.
import numpy as np                 # For numerical operations.
from sklearn.model_selection import train_test_split  # To split the dataset into training and testing sets.
from sklearn.linear_model import LinearRegression       # Our regression model to predict continuous values.
from sklearn.preprocessing import LabelEncoder          # To convert text (categorical) data to numbers.
from sklearn.metrics import mean_squared_error          # To measure the model's performance.

# 2. Load the dataset
# The CSV location and its text encoding live in named constants so they are easy to find and change.
# NOTE(review): this is an absolute path on one specific Windows machine; adjust it before running elsewhere.
DATA_PATH = r"C:\Users\Sahil\Downloads\IMDb Movies India.csv"
DATA_ENCODING = "ISO-8859-1"

df = pd.read_csv(DATA_PATH, encoding=DATA_ENCODING)

# 3. Data Preprocessing

# (a) Missing values.
# 'Rating' is the prediction target, so rows without it are dropped.
# The three text features used later get the placeholder "Unknown" wherever they are missing.
df = df.dropna(subset=['Rating']).fillna(
    {
        'Genre': "Unknown",
        'Director': "Unknown",
        'Actor 1': "Unknown",
    }
)

# (b) Feature engineering.
# Each text column is mapped to integer codes with its own LabelEncoder. The encoders are kept
# as separate objects so the code -> label mapping of every column can be looked up afterwards.
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder. The integer codes
# carry no meaningful order, yet a linear model will treat them as ordered magnitudes; consider
# a different encoding for these nominal features.
le_genre = LabelEncoder()
le_director = LabelEncoder()
le_actor1 = LabelEncoder()

# One new *_encoded column per text feature, appended in this order.
df = df.assign(
    Genre_encoded=le_genre.fit_transform(df['Genre']),
    Director_encoded=le_director.fit_transform(df['Director']),
    Actor1_encoded=le_actor1.fit_transform(df['Actor 1']),
)

# (c) Features and target.
# The model uses the three encoded columns as inputs and predicts the movie rating.
features = ['Genre_encoded', 'Director_encoded', 'Actor1_encoded']
target = 'Rating'

# (d) Train/test split.
# 20% of the rows are held out for testing; the fixed random_state makes the split repeatable.
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Model Building
# Create a LinearRegression estimator and fit it on the training split in one step.
# fit() returns the fitted estimator, so `model` is ready for predict() afterwards.
model = LinearRegression().fit(X_train, y_train)

# 5. Model Evaluation
# Predict ratings for the held-out test rows and score them with Mean Squared Error:
# the average squared gap between predicted and actual rating (lower is better).
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Optional: put actual and predicted ratings side by side and show the first few rows
# to inspect individual predictions.
comparison_columns = {
    "Actual": y_test,
    "Predicted": y_pred,
}
comparison = pd.DataFrame(comparison_columns)
print(comparison.head())
