In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd 


In [None]:
# Load the raw IMDB export from the working directory and preview its first rows.
data = pd.read_csv("imdb_movie_data_2023.csv")
data.head()

In [None]:
# Discard the first column by position and keep every remaining column.
# NOTE(review): presumably the first column is a saved index from the CSV
# export — confirm against the file.
# NOTE(review): this rebinds `data` from itself, so running the cell more than
# once removes one additional column per run; re-run the load cell first.
data = data.iloc[:, 1:]

In [None]:
# Print a structural summary of the frame (columns, non-null counts, dtypes).
data.info()

In [None]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns that describe() would otherwise leave out.
data.describe(include='all')

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Count movies per director and keep the 20 most frequent ones.
# rename_axis pins the label column to 'Director' no matter how the installed
# pandas names the value_counts() index, so the 'Director' lookups below do not
# raise KeyError on older pandas releases.
top_directors = (
    data['Director']
    .value_counts()
    .head(20)
    .rename_axis('Director')
    .reset_index(name='count')
)
# Rows of the full dataset that belong to those directors; the per-director
# rating and vote summaries are computed from this frame.
df_top_directors = data[data['Director'].isin(top_directors['Director'])]

fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=top_directors, y='Director', x='count', ax=ax)

# Write each bar's value just past its end. bar_label positions the text
# itself, so no sign has to be derived by dividing by the bar width.
for container in ax.containers:
    ax.bar_label(
        container,
        labels=[f'{value:,.0f}' for value in container.datavalues],
        padding=3,
        fontsize=10,
        color='black',
    )
ax.set_title('Movie Count per Top Directors')
plt.show()

In [None]:
# Average rating for each of the top directors, highest average first.
mean_rating_per_director = (
    df_top_directors
    .groupby('Director')['Rating']
    .mean()
    .reset_index(name='mean_rating')
    .sort_values('mean_rating', ascending=False)
)

fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=mean_rating_per_director, y='Director', x='mean_rating', ax=ax)

# Write each bar's value just past its end. bar_label positions the text
# itself, so a zero or missing mean cannot break an offset computed by
# dividing by the bar width.
for container in ax.containers:
    ax.bar_label(
        container,
        labels=[f'{value:,.2f}' for value in container.datavalues],
        padding=3,
        fontsize=10,
        color='black',
    )
ax.set_title('Mean Rating per Top Directors')
plt.show()

In [None]:
# Average vote count for each of the top directors, highest average first.
mean_votes_per_director = (
    df_top_directors
    .groupby('Director')['Votes']
    .mean()
    .reset_index(name='mean_votes')
    .sort_values('mean_votes', ascending=False)
)

fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=mean_votes_per_director, y='Director', x='mean_votes', ax=ax)

# Write each bar's value just past its end. bar_label positions the text
# itself, so a zero or missing mean cannot break an offset computed by
# dividing by the bar width.
for container in ax.containers:
    ax.bar_label(
        container,
        labels=[f'{value:,.0f}' for value in container.datavalues],
        padding=3,
        fontsize=10,
        color='black',
    )
ax.set_title('Mean Votes per Top Directors')
plt.show()


In [None]:
# Movies rated strictly above 8, ordered from the highest rating down.
top_movies = data.loc[data['Rating'] > 8].sort_values(by='Rating', ascending=False)
top_movies

In [None]:
# Rows that actually have a cast listing.
df_no_cast_na = data.dropna(subset=['Cast'])

# Each 'Cast' cell is a comma-separated list of names. Split it, put one name
# per row, trim surrounding whitespace, and count how often each name occurs.
# astype(str) keeps the .str accessor usable even if a cell is not text.
actor_counts = (
    df_no_cast_na['Cast']
    .astype(str)
    .str.split(',')
    .explode()
    .str.strip()
    .value_counts()
)

# value_counts() already orders by frequency (descending); keep the top 20.
top_actors_df = (
    actor_counts
    .head(20)
    .rename_axis('actor_name')
    .reset_index(name='frequency')
)

fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=top_actors_df, y='actor_name', x='frequency', ax=ax)

# Write each bar's value just past its end. bar_label positions the text
# itself, so no sign has to be derived by dividing by the bar width.
for container in ax.containers:
    ax.bar_label(
        container,
        labels=[f'{value:,.0f}' for value in container.datavalues],
        padding=3,
        fontsize=10,
        color='black',
    )
ax.set_title('Top Actors by Movie Count')
plt.show()

In [None]:
# Frequency of each distinct value in the 'PG Rating' column.
data['PG Rating'].value_counts()

In [None]:
min_year = data['Year'].min()
max_year = data['Year'].max()

# One bin per calendar year. The span is cast to int because pandas may parse
# 'Year' as float (for example when values are missing), and histplot rejects
# a non-integer bin count.
n_year_bins = int(max_year - min_year) + 1

# binrange is widened by half a year on each side so every bin is exactly one
# year wide and centred on its year.
sns.histplot(
    data=data,
    x='Year',
    bins=n_year_bins,
    binrange=(min_year - 0.5, max_year + 0.5),
)
plt.title('Movies by Year distribution')
plt.show()

In [None]:
# Scatter of critic score against audience rating; the title is applied
# through the Axes object that seaborn returns.
sns.scatterplot(x='Rating', y='Meta Score', data=data).set_title('Meta Score by IMDB Rating')

In [None]:
# Display the per-director mean vote table built for the mean-votes bar chart.
mean_votes_per_director

In [None]:
# Hand-picked directors whose movies are listed, best-rated first.
my_favorite_directors = [
    'Christopher Nolan',
    'Quentin Tarantino',
    'Guy Ritchie',
]
data.loc[data['Director'].isin(my_favorite_directors)].sort_values(by='Rating', ascending=False)