In [None]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset

In [None]:
# Load the raw CSV into a DataFrame, decoding the file as ISO-8859-1.
# NOTE(review): the path is absolute (looks like a Colab upload location) —
# adjust it when running the notebook elsewhere.
data = pd.read_csv('/content/IMDb Movies India.csv', encoding='ISO-8859-1')

In [None]:
# Display the loaded DataFrame for a first visual check.
data

In [None]:
# Summary statistics of the frame (pandas describes numeric columns by default).
data.describe()

In [None]:
# Column dtypes as parsed by read_csv, checked before any cleaning is applied.
data.dtypes

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Total number of missing cells across the whole frame.
data.isnull().sum().sum()

In [None]:
# (rows, columns) of the raw frame.
data.shape

In [None]:
# Data Cleaning
# Extract the four-digit year from "Year" and store it as an integer (0 when no
# year is present). Converting to int keeps the column a single numeric dtype
# instead of mixing digit strings with the integer placeholder 0.
year_digits = data['Year'].astype(str).str.extract(r'(\d{4})', expand=False)
data['Year'] = pd.to_numeric(year_digits).fillna(0).astype(int)

# Extract the first run of digits from "Duration" as an integer (0 when absent)
duration_digits = data['Duration'].astype(str).str.extract(r'(\d+)', expand=False)
data['Duration'] = pd.to_numeric(duration_digits).fillna(0).astype(int)

# Remove thousands separators and convert "Votes" to integers.
# astype(str) keeps the .str accessor usable even when the column is not
# string-typed; anything that is not a number is coerced to NaN and becomes 0.
votes_text = data['Votes'].astype(str).str.replace(',', '', regex=False)
data['Votes'] = pd.to_numeric(votes_text, errors='coerce').fillna(0).astype(int)

# Encode categorical features as integer codes.
# The fitted encoders are kept in `label_encoders` so codes can be mapped back
# to the original strings later (missing values are encoded as the string 'nan').
label_encoders = {}
categorical_features = ["Genre", "Director", "Actor 1", "Actor 2", "Actor 3"]
for feature in categorical_features:
    data[feature] = data[feature].astype(str)  # Convert all values to strings
    label_encoders[feature] = LabelEncoder()
    data[feature] = label_encoders[feature].fit_transform(data[feature])

# Drop rows with missing "Rating" values (the regression target)
data = data.dropna(subset=['Rating'])


# Model

Split the data into features and target

In [None]:
# Target: the rating to predict.
y = data['Rating']
# Features: every remaining column except the target and the free-text title.
X = data.drop(['Rating', 'Name'], axis=1)

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target series.
y

Split the data into training and testing sets

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the split so the
# results are reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Choose a regression model

In [None]:
# Random forest regressor with 100 trees and a fixed seed for reproducibility.
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model

In [None]:
# Fit the forest on the training split only.
model.fit(X_train, y_train)

Make predictions

In [None]:
# Predict ratings for the held-out test rows.
y_pred = model.predict(X_test)

# Visualizations

Correlation Analysis

In [None]:
# Pairwise Pearson correlations between the numeric columns.
# The frame still contains the free-text "Name" column, which DataFrame.corr()
# cannot convert to float (pandas >= 2.0 raises ValueError), so it is dropped
# first. The remaining columns are coerced to numeric because "Year" may be
# stored as digit strings after cleaning.
# Note: Genre/Director/Actor columns hold arbitrary label-encoder codes, so
# their correlation coefficients have no ordinal meaning.
numeric_view = data.drop(columns=['Name']).apply(pd.to_numeric, errors='coerce')
correlation_matrix = numeric_view.corr()
print(correlation_matrix)


In [None]:
# Annotated heatmap of the correlation matrix computed above.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()

Categorical Feature Analysis

In [None]:
# Number of movies per genre.
# "Genre" holds label-encoder codes at this point, so the codes are mapped back
# to the original genre strings to make the printed table readable
# (missing genres appear under the string 'nan').
genre_counts = data['Genre'].value_counts()
genre_counts.index = label_encoders['Genre'].inverse_transform(genre_counts.index)
print(genre_counts)


Feature Importance

In [None]:
# Rank the features by the importance the fitted forest assigns to them.
feature_names = X.columns
feature_importance = model.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)
print(feature_importance_df)


Residual Analysis

In [None]:
# Residuals (actual minus predicted) against the predictions; the dashed red
# line marks zero error.
residuals = y_test - y_pred
fig, ax = plt.subplots()
ax.scatter(y_pred, residuals)
ax.set_xlabel('Predicted Ratings')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
ax.axhline(0, color='red', linestyle='--')
plt.show()


Annual Movie Release Counts Over Time (Countplot for years)

In [None]:
# Number of movies per release year.
# Bars are ordered chronologically (key=int handles years stored either as
# digit strings or as integers) instead of by first appearance in the frame,
# and the figure is widened with rotated ticks so the year labels stay legible.
year_order = sorted(data['Year'].unique(), key=int)
plt.figure(figsize=(16, 6))
sns.countplot(data=data, x='Year', order=year_order)
plt.xticks(rotation=90)
plt.xlabel('Year')
plt.ylabel('Number of Movies')
plt.title('Annual Movie Release Counts Over Time')
plt.show()

Average Movie Duration Trends Over the Years

In [None]:
# Mean movie duration per release year (lineplot aggregates repeated x values).
# "Year" may be stored as digit strings, so it is converted to numbers to get a
# continuous time axis. The cleaning step uses 0 as the placeholder for a
# missing year or duration; those rows are excluded so they do not distort the
# axis range or drag the average down.
year_numeric = pd.to_numeric(data['Year'])
has_values = (year_numeric > 0) & (data['Duration'] > 0)
sns.lineplot(x=year_numeric[has_values], y=data.loc[has_values, 'Duration'])
plt.xlabel('Year')
plt.ylabel('Duration')
plt.title('Average Movie Duration Trends Over the Years')
plt.show()

Box Plot of Movie Ratings

In [None]:
# Box plot of the rating distribution (median, quartiles and outliers).
plt.title('Box Plot of Movie Ratings')
sns.boxplot(x=data['Rating'])
# Render explicitly, like the other plotting cells, instead of leaving the
# Axes repr as the cell output.
plt.show()


Feature Distributions by Rating

In [None]:
# Scatter of movie duration against rating, to compare durations across the
# rating scale.
fig, ax = plt.subplots()
ax.scatter(data['Rating'], data['Duration'])
ax.set(xlabel='Rating', ylabel='Duration', title='Duration vs. Rating')
plt.show()


Total Votes per Rating

In [None]:
# Total number of votes received by movies at each rating value.
# sns.barplot aggregates with the mean by default, which would contradict the
# "Total" in the title, so the sums are computed explicitly before plotting.
votes_per_rating = data.groupby('Rating')['Votes'].sum()
plt.figure(figsize=(16, 6))
plt.title('Total Votes per Rating')
sns.barplot(x=votes_per_rating.index, y=votes_per_rating.values)
plt.xticks(rotation=90)
plt.xlabel('Rating')
plt.ylabel('Total Votes')
plt.show()

Top 20 Actors with Total Number of Movies

In [None]:
# Top 20 actors by number of credited movies across the three actor columns.
# Each actor column was encoded with its own LabelEncoder, so the same integer
# code means a different person in each column; the codes are decoded back to
# names before the columns are pooled. pd.concat replaces Series.append, which
# was removed in pandas 2.0.
actor_columns = ["Actor 1", "Actor 2", "Actor 3"]
actor_counts = pd.concat(
    [
        pd.Series(label_encoders[column].inverse_transform(data[column]))
        for column in actor_columns
    ],
    ignore_index=True,
)
# Missing actors were encoded as the string 'nan' during cleaning; skip them.
actor_counts = actor_counts[actor_counts != 'nan']
top_actors = actor_counts.value_counts().head(20)
sns.barplot(x=top_actors.values, y=top_actors.index)
plt.xlabel('Number of Movies')
plt.ylabel('Actor')
plt.title('Top 20 Actors with Total Number of Movies')
plt.show()

Top 20 Directors by Frequency of Movies

In [None]:

# Top 20 directors by number of movies.
# "Director" holds label-encoder codes, so the codes are decoded back to names:
# this gives readable labels and a categorical y-axis for the horizontal bars.
director_names = pd.Series(label_encoders['Director'].inverse_transform(data['Director']))
# Missing directors were encoded as the string 'nan' during cleaning; skip them.
director_names = director_names[director_names != 'nan']
top_directors = director_names.value_counts().head(20)
plt.title('Top 20 Directors by Frequency of Movies')
sns.barplot(x=top_directors.values, y=top_directors.index)
plt.xlabel('Number of Movies')
plt.ylabel('Director')
plt.show()

Plot the distribution of another numerical feature (e.g., 'Votes')

Plot the distribution of the target variable

In [None]:
# Histogram of the target variable using 20 equal-width bins.
fig, ax = plt.subplots()
ax.hist(data['Rating'], bins=20)
ax.set(xlabel='Rating', ylabel='Frequency', title='Distribution of Ratings')
plt.show()

# Model Validation

In [None]:
# Predicted against actual ratings on the test set; points on the dashed red
# diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel('Actual Ratings')
ax.set_ylabel('Predicted Ratings')
ax.set_title('Actual vs. Predicted Ratings')
ax.grid(True)
rating_range = [min(y_test), max(y_test)]
ax.plot(rating_range, rating_range, color='red', linestyle='--', linewidth=2)
plt.show()


In [None]:
# Hold-out metrics on the 20% test split.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 5-fold cross-validation on the full data; sklearn returns negated MSE, so the
# sign is flipped before taking the square root to obtain per-fold RMSE.
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-scores)

print(
    f'Cross-Validation RMSE: {rmse_scores.mean()}',
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
    f"R-squared: {r2}",
    sep='\n',
)

