In [None]:
# Importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Read the movies CSV into the DataFrame that every later cell works from.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv(
    '../data/movies.csv',
)

In [None]:

# Display basic information about the dataset
# df.info() prints its report (columns, non-null counts, dtypes) directly.
df.info()
# More statements follow in this cell, so a bare `df.head()` here would be evaluated
# and silently discarded (only a cell's final expression is rendered). display() is
# provided by the Jupyter/IPython kernel and renders the preview explicitly.
display(df.head())

# Data Cleaning and Preparation
# Columns the later cells rely on.
required_columns = ['title', 'genres', 'budget', 'revenue', 'vote_average', 'vote_count', 'release_date']
# 'director' may or may not be in the CSV. The "Top Directors by Revenue" analysis checks
# for it, so carry it along when it exists instead of always discarding it here.
optional_columns = [column for column in ['director'] if column in df.columns]

# Build the cleaned frame as one chain of non-mutating steps. The previous version
# modified a column slice in place (SettingWithCopyWarning) and used
# `df[col].fillna(..., inplace=True)`, a chained in-place call that pandas deprecates
# and that has no effect on `df` once Copy-on-Write is enabled.
df = (
    df[required_columns + optional_columns]
    # Removing rows without titles or vote averages
    .dropna(subset=['title', 'vote_average'])
    # Missing budget/revenue become 0.
    # NOTE(review): these zeros then take part in any average computed later - confirm intended.
    .fillna({'revenue': 0, 'budget': 0})
    .assign(
        # Unparseable dates become NaT rather than raising.
        release_date=lambda frame: pd.to_datetime(frame['release_date'], errors='coerce'),
        # Extracting release year (NaN where the date is NaT).
        release_year=lambda frame: frame['release_date'].dt.year,
    )
)


In [None]:

# Sanity-check the cleaned frame: print the schema report, then render the first rows.
df.info()
df.head(n=5)


In [None]:

# Genre Analysis - how many movies carry each genre label
# Build one indicator column per label (labels inside a cell are split on '|'), then
# sum each column to get per-genre movie counts, most frequent first.
genre_indicators = df['genres'].dropna().str.get_dummies(sep='|')
genres = genre_indicators.sum().sort_values(ascending=False)

# Horizontal bar chart: counts along x, genre names along y.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=genres.values, y=genres.index, ax=ax)
ax.set_title('Distribution of Movie Genres')
ax.set_xlabel('Number of Movies')
ax.set_ylabel('Genre')
plt.show()


In [None]:

# Revenue vs Rating Analysis - one point per movie, rating on x and revenue on y
plt.figure(figsize=(10, 6))
# Semi-transparent markers (alpha=0.5) keep overlapping points distinguishable.
rating_revenue_ax = sns.scatterplot(data=df, x='vote_average', y='revenue', alpha=0.5)
rating_revenue_ax.set(
    title='Revenue vs. Rating',
    xlabel='Average Rating',
    ylabel='Revenue',
)
plt.show()


In [None]:

# Top Directors by Revenue
# The ranking needs a 'director' column; report and skip the plot when it is absent.
if 'director' not in df.columns:
    print("No 'director' column available in the dataset to analyze.")
else:
    # Total revenue per director, keeping the ten largest totals.
    revenue_by_director = df.groupby('director')['revenue'].sum()
    top_directors = revenue_by_director.sort_values(ascending=False).head(10)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=top_directors.values, y=top_directors.index, ax=ax)
    ax.set(title='Top 10 Directors by Revenue', xlabel='Revenue', ylabel='Director')
    plt.show()

# Summary of Findings
# Headline numbers: the mean of three numeric columns plus the genre label with the
# highest count in `genres`.
# NOTE(review): budget/revenue values that were missing were filled with 0 during
# cleaning, so they are included as zeros in these means - confirm that is intended.
average_columns = {
    'Average Budget': 'budget',
    'Average Revenue': 'revenue',
    'Average Rating': 'vote_average',
}
summary = {label: np.mean(df[column]) for label, column in average_columns.items()}
summary['Top Genre'] = genres.idxmax()
# Final expression of the cell, so the dict is rendered as its output.
summary