Predicting Movie Success

Import Libraries:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix , classification_report
from sklearn import svm

In [2]:
# Absolute location of the raw movie CSV; adjust when running elsewhere.
DATA_PATH = "/content/movie_dataset capstone.csv"
data = pd.read_csv(DATA_PATH)

In [None]:
# Inspect the last rows of the frame right after loading.
data.tail()

In [4]:
# Remove columns that are not referenced anywhere later in the notebook.
# The frame is rebound instead of being mutated in place.
unused_columns = [
    'movie_title',
    'plot_keywords',
    'movie_imdb_link',
    'facenumber_in_poster',
]
data = data.drop(columns=unused_columns)

In [None]:
# NOTE(review): a bare frame renders a (truncated) dump of the whole table;
# data.head() would be a lighter preview.
data

In [None]:
# Summarise the columns that remain after the drop above.
data.info()

EDA

In [None]:
# Count missing values per column.
data.isnull().sum()

In [8]:
# Fill gaps in the listed numeric columns with each column's own mean.
# A single imputer is re-fitted column by column, so after the loop `si`
# holds the statistics of the last column processed.
si = SimpleImputer(strategy='mean')
mean_imputed_columns = [
    'num_critic_for_reviews',
    'duration',
    'director_facebook_likes',
    'actor_3_facebook_likes',
    'actor_1_facebook_likes',
    'gross',
    'num_user_for_reviews',
    'budget',
    'title_year',
    'actor_2_facebook_likes',
    'imdb_score',
    'aspect_ratio',
    'movie_facebook_likes',
]
for column in mean_imputed_columns:
    data[column] = si.fit_transform(data[[column]]).ravel()


In [9]:
# Rows with a missing value in any object-dtype column are dropped
# rather than imputed.
categorical_columns = data.select_dtypes(include=['object']).columns
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
data = data.dropna(subset=categorical_columns).copy()

In [None]:
# List the remaining column names.
data.columns

In [None]:

def classify_movie(score):
    """Map an IMDB score to a success label.

    Parameters
    ----------
    score : float
        IMDB rating of a movie.

    Returns
    -------
    str
        "Flop Movie" for scores up to and including 3,
        "Average Movie" for scores above 3 up to and including 6,
        "Hit Movie" for anything higher.

    Notes
    -----
    The lowest band has no lower bound, so a score below 1 is labelled
    "Flop Movie" instead of falling through to "Hit Movie".
    A NaN score compares false against both thresholds and still ends up
    in the final branch.
    """
    if score <= 3:
        return "Flop Movie"
    elif score <= 6:
        return "Average Movie"
    else:
        return "Hit Movie"


# Derive the target label for every movie from its IMDB score.
imdb_scores = data['imdb_score']
data['Classify'] = imdb_scores.apply(classify_movie)


In [None]:
# Check the last rows now that the Classify column has been added.
data.tail()

In [None]:
# Re-check per-column missing-value counts after imputation and row dropping.
data.isnull().sum()

In [None]:
# Number of movies in each derived class.
data['Classify'].value_counts()

In [None]:
# Number of movies per director.
data['director_name'].value_counts()

In [None]:
# Number of rows per actor_2_name value.
data['actor_2_name'].value_counts()

In [None]:
# Number of rows per distinct genres string.
data['genres'].value_counts()

In [None]:
# Number of rows per actor_1_name value.
data['actor_1_name'].value_counts()

In [None]:
# Number of rows per actor_3_name value.
data['actor_3_name'].value_counts()

In [None]:
# Number of movies per language.
data['language'].value_counts()

In [None]:
# Number of movies per country.
data['country'].value_counts()

In [None]:
# Number of movies per content rating.
data['content_rating'].value_counts()

In [None]:
# NOTE(review): repeats the actor_3_name frequency table shown a few cells earlier.
data['actor_3_name'].value_counts()

In [None]:
# List the column names (same call as earlier in the notebook).
data.columns

In [25]:
# Numeric columns whose pairwise correlations are examined in the heatmap cell.
special_features = [
    'num_critic_for_reviews',
    'duration',
    'gross',
    'num_user_for_reviews',
    'budget',
    'title_year',
    'imdb_score',
    'aspect_ratio',
    'movie_facebook_likes',
]

Visualization:

In [None]:

# Bar chart of the number of movies in each class, drawn in the first slot
# of a 3x2 grid.
# NOTE(review): the colours follow the order value_counts() returns, not a
# fixed class, so which class is red/orange/green depends on the data.
fig = plt.figure(figsize=(14, 17))
ax = fig.add_subplot(3, 2, 1)

class_counts = data['Classify'].value_counts()
class_counts.plot(kind='bar', color=['red', 'orange', 'green'], ax=ax)

ax.set_title('Movie Classification Counts')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()


In [None]:
# Histogram of IMDB scores (20 bins), drawn in the second slot of a 3x2 grid.
fig = plt.figure(figsize=(16, 20))
ax = fig.add_subplot(3, 2, 2)
ax.hist(data['imdb_score'], bins=20, color='blue', edgecolor='black')
ax.set_title('Distribution of IMDB Scores')
ax.set_xlabel('IMDB Score')
ax.set_ylabel('Frequency')

In [None]:
# Pie chart of how the movies split across the values of the `color` column.
fig, ax = plt.subplots(figsize=(6, 6))

colors = data['color']
labels = colors.unique()
# Convert to a list once, then count occurrences of every distinct value.
color_values = colors.tolist()
sizes = [color_values.count(label) for label in labels]

ax.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    colors=sns.color_palette("Set3"),
)
ax.set_title('Movie Color Distribution', fontsize=16)
fig.tight_layout()
plt.show()

In [None]:
# Horizontal bars of the mean IMDB score per content rating, sorted
# ascending, drawn in the third slot of a 3x2 grid.
fig = plt.figure(figsize=(16, 20))
content_rating_avg = (
    data.groupby('content_rating')['imdb_score']
    .mean()
    .sort_values()
)
ax = fig.add_subplot(3, 2, 3)
content_rating_avg.plot(kind='barh', color='purple', ax=ax)
ax.set_title('Average IMDB Score by Content Rating')
ax.set_xlabel('Average IMDB Score')
ax.set_ylabel('Content Rating')

In [None]:
# The ten directors with the most rows in the data, drawn in the first slot
# of a 4x5 grid.
fig = plt.figure(figsize=(16, 20))
top_directors = data['director_name'].value_counts().head(10)
ax = fig.add_subplot(4, 5, 1)
sns.barplot(
    x=top_directors.values,
    y=top_directors.index,
    palette="viridis",
    ax=ax,
)
ax.set_title('Top 10 Directors with Most Movies')
ax.set_xlabel('Number of Movies')
ax.set_ylabel('Director')

In [None]:
# Mean IMDB score per title_year as a line with markers, drawn in the fourth
# slot of a 3x2 grid.
fig = plt.figure(figsize=(16, 20))
ax = fig.add_subplot(3, 2, 4)
yearly_scores = data.groupby('title_year')['imdb_score'].mean()
yearly_scores.plot(color='orange', marker='o', ax=ax)
ax.set_title('IMDB Scores Over the Years')
ax.set_xlabel('Year')
ax.set_ylabel('Average IMDB Score')

In [None]:
# Bar per language showing the aggregated IMDB score, without error bars.
# NOTE(review): recent seaborn releases may warn about `ci=None` and about
# `palette` without `hue` — confirm against the installed version.
fig, ax = plt.subplots(figsize=(18, 10))
sns.barplot(
    x='language',
    y='imdb_score',
    data=data,
    palette='viridis',
    ci=None,
    ax=ax,
)
# Rotate the language names so they do not overlap.
plt.xticks(rotation=90)
ax.set_title('Average IMDB Scores by Language', fontsize=16)
ax.set_xlabel('Language', fontsize=14)
ax.set_ylabel('IMDB Score', fontsize=14)

In [None]:
# Scatter of cast Facebook likes against IMDB score, coloured by class,
# drawn in the third slot of a 3x2 grid.
fig = plt.figure(figsize=(30, 20))
ax = fig.add_subplot(3, 2, 3)
sns.scatterplot(
    x='cast_total_facebook_likes',
    y='imdb_score',
    data=data,
    alpha=0.6,
    hue='Classify',
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Cast Total Facebook Likes vs IMDB Score')
ax.set_xlabel('Cast Total Facebook Likes')
ax.set_ylabel('IMDB Score')



In [None]:
# The ten genres strings with the highest mean IMDB score, drawn in the first
# slot of a 3x2 grid.
fig = plt.figure(figsize=(30, 20))
ax = fig.add_subplot(3, 2, 1)
genre_scores = (
    data.groupby('genres')['imdb_score']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)
genre_scores.plot(kind='bar', color='teal', ax=ax)
ax.set_title('Top 10 Genres by Average IMDB Score')
ax.set_xlabel('Genres')
ax.set_ylabel('Average IMDB Score')

In [None]:
# Pairwise correlations between the columns listed in `special_features`,
# rendered as an annotated heatmap.
special_features_data = data[special_features]
correlation_matrix = special_features_data.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Collinearity Heatmap (Special Features Correlation Matrix)")
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the `color` and `Classify` text columns into integer codes.
# The same encoder is re-fitted for each column, so afterwards `lb` holds
# the classes of `Classify`.
lb = LabelEncoder()
for label_column in ('color', 'Classify'):
    data[label_column] = lb.fit_transform(data[label_column])

In [None]:
# Frequency encoding: replace every value in the listed text columns with
# the number of rows that share that value.
frequency_encoded_columns = [
    'director_name',
    'actor_2_name',
    'actor_1_name',
    'genres',
    'actor_3_name',
    'country',
    'language',
    'content_rating',
]
for column in frequency_encoded_columns:
    frequency_encoding = data[column].value_counts().to_dict()
    data[column] = data[column].map(frequency_encoding)

In [None]:
# Preview the frame after the encoding steps.
data.head()

In [None]:
# Class counts after label encoding (the index now shows integer codes).
data['Classify'].value_counts()

In [None]:
# Summarise the columns again after the encoding steps.
data.info()

In [None]:
# NOTE(review): repeats the Classify counts shown two cells earlier.
data['Classify'].value_counts()

Random Forest Model

In [42]:
# Feature matrix and target.
# `imdb_score` is removed together with the label: `Classify` is computed
# directly from it, so leaving it in X would hand the model the answer
# (target leakage).
X = data.drop(columns=['Classify', 'imdb_score'])
y = data['Classify']

In [None]:
# Train and score a random forest on the movie features.
# The split is taken from the existing X / y. Regenerating X and y with
# make_classification at this point would swap the movie data for an
# unrelated synthetic dataset, so the model would never see the movies.
# NOTE(review): X must be fully numeric and free of NaN here — confirm that
# every remaining column was imputed or encoded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Depth and leaf-size limits keep the individual trees from growing fully.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42
)

rf.fit(X_train, y_train)

# Evaluate on the 30% hold-out split.
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')

In [None]:
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and reports support of the predictions.
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import mean_absolute_percentage_error, r2_score

# NOTE(review): MAPE and R² are regression metrics. Applied to class labels
# they measure numeric distance between label codes, not classification
# quality, and MAPE becomes very large whenever a true label is 0. Treat the
# accuracy and the classification report as the meaningful scores.
y_pred = rf.predict(X_test)

# Named `mape` because the value is a mean absolute *percentage* error.
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"MAPE: {mape * 100:.2f}%")

r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2 * 100 :.2f}%")

Logistic Regression Model

In [46]:
# Standardise the features. The scaler learns mean and variance from the
# training split only; the test split is transformed with those same
# statistics. Re-fitting on the test set would scale the two splits
# differently and leak test-set information into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [47]:

# Prepare the data for logistic regression from the existing X / y.
# Generating a fresh make_classification dataset here would replace the
# features being studied and also throw away any scaling already applied.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardise with statistics learned from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# The explicit multi_class argument is deprecated in recent scikit-learn
# releases. With the lbfgs solver the default already fits a multinomial
# model when there are more than two classes, so the argument is omitted.
lr = LogisticRegression(solver='lbfgs', max_iter=10000)
lr.fit(X_train, y_train)

In [49]:
# Predict labels for the held-out split with the fitted logistic regression.
y_pred = lr.predict(X_test)

In [None]:
# Share of held-out samples whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

In [None]:
# Per-class report for the current predictions (true labels passed first).
print(classification_report(y_test, y_pred))

SVM Model

In [52]:
from sklearn import svm

In [53]:
# Support vector classifier with a linear kernel and C=1.
svm_class = svm.SVC(kernel = 'linear', C=1)

In [None]:
# Fit the SVM on the current training split.
svm_class.fit(X_train,y_train)

In [55]:
# Store the SVM predictions under `y_pred`, the name the evaluation cells
# below read. With the capitalised `Y_pred`, those cells kept scoring the
# predictions left over from the previous model.
y_pred = svm_class.predict(X_test)

In [None]:
# Arguments in the documented (y_true, y_pred) order, matching the other
# evaluation cells. Accuracy itself is symmetric in its arguments.
accuracy_score(y_test, y_pred)

In [None]:
# classification_report expects (y_true, y_pred); the reversed order swaps
# precision with recall in the printed table.
print(classification_report(y_test, y_pred))

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are true classes, columns
# are predicted classes. The reversed order prints the transposed matrix.
print(confusion_matrix(y_test, y_pred))