In [None]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the movies dataset from the CSV file into a DataFrame
df = pd.read_csv('movies.csv')

In [None]:
# Display the loaded DataFrame
df

In [None]:
# Summary statistics of the DataFrame's columns
df.describe()

In [None]:
# First five rows of the dataset
df.head()

In [None]:
# Column names, non-null counts and dtypes
df.info()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column labels of the dataset
df.columns

In [None]:
# Number of missing values in each column
df.isna().sum()

# Exploratory Data Analysis (EDA)

Determine the distribution of movie ratings.

In [None]:
# Histogram of the 'rating' column using 20 bins
fig, ax = plt.subplots()
ax.hist(df['rating'], bins=20)
ax.set_xlabel('ratings')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Movie Rating')
plt.show()

In [None]:
# Count how many movies carry each distinct value of the 'genre' column.
# NOTE(review): if a movie lists several genres in one string, each distinct
# combination is counted as its own value here — confirm against the data.
genre_counts = df['genre'].value_counts()

# The 10 most common genre values
print(genre_counts.head(10))

In [None]:
# Tally the movies per release year, most frequent year first
year_counts = df['year'].value_counts()

# Show only the ten years with the most movies
top_years = year_counts.head(10)
print("The number of movies released in each year:\n", top_years)

In [None]:
# Movies per release year: count of non-null 'name' entries within each 'year' group
year_counts = df.groupby('year')['name'].count()

# Draw the yearly counts as a line chart on seaborn's dark grid theme
sns.set(style='darkgrid')
fig, ax = plt.subplots()
ax.plot(year_counts.index, year_counts.values)
ax.set_xlabel('Year')
ax.set_ylabel('No.of movies released')
ax.set_title('No.of movies released each year')
plt.show()

In [None]:
# Share of movies per release period.
# The dataset's years are expected to range from 1921 to 2022; they are grouped
# into the periods 1921-1940, 1941-1960, 1961-1980, 1981-2000 and 2001-2022.
label = ['1921 - 1940', '1941-1960', '1961-1980', '1981-2000', '2001-2022']
period_edges = [1920, 1940, 1960, 1980, 2000, 2022]

# pd.cut builds right-closed intervals: (1920, 1940], (1940, 1960], ...
# so the boundary years 1940/1960/1980/2000 are counted in the period that ends
# with them. Years outside 1921-2022 (or missing) are left out rather than
# being silently assigned to a wrong period.
period_counts = (
    pd.cut(df['year'], bins=period_edges, labels=label)
      .value_counts()
      .reindex(label)  # keep the slices in chronological order
)
data = period_counts.tolist()

# Keep the per-period counters available under their original names
_21to40, _41to60, _61to80, _81to00, _01to22 = data

plt.figure(figsize=(12,6))
plt.pie(data,labels=label,colors=sns.color_palette('Paired'),autopct='%.2f%%')
plt.title("Percentage of Movies based on Year")
plt.show()

# Distribution of top 5 Certificates

In [None]:
# Five most frequent certificates, largest group first
top_certificates = (
    df.groupby('certificate')
      .size()
      .sort_values(ascending=False)
      .head(5)
)

# Bar chart of the certificate frequencies
fig, ax = plt.subplots(figsize=(14, 6))
top_certificates.plot(kind='bar', ax=ax, color=sns.color_palette('Set2'))
ax.set_xlabel('Certificate')
ax.set_ylabel('Frequency')
ax.set_title("Top 5 Certificates")
plt.show()

# Distribution of movies by certificate (rating category)

In [None]:
# Tally the movies per value of the 'certificate' column
rating_counts = df['certificate'].value_counts()

# Pie chart with one slice per certificate, annotated with its percentage share
fig, ax = plt.subplots()
ax.pie(rating_counts, labels=rating_counts.index, autopct='%1.1f%%')
ax.set_title('Distribution of Movies by Rating Category')
plt.show()

# How does the distribution of movie ratings differ by genre?

In [None]:
# Split multi-genre entries ('A|B|C') into one row per (movie, genre) so that a
# movie contributes its rating to every genre it is listed under. Without this,
# the plot's y-axis would only match movies whose 'genre' is exactly one genre.
df_by_genre = (
    df.assign(genre=df['genre'].str.split('|'))
      .explode('genre')
      .reset_index(drop=True)  # explode repeats index labels; give each row a unique one
)

# Get the top 10 genres by movie count
top_genres = df_by_genre['genre'].value_counts().head(10).index

# Keep only the rows whose genre is one of the top 10. Exact matching with isin
# avoids regex/substring false positives and tolerates missing genres.
df_top_genres = df_by_genre[df_by_genre['genre'].isin(top_genres)]

# Create a box plot of the ratings for each genre, most common genre first
sns.set(style='whitegrid', font_scale=1.5)
sns.catplot(x='rating', y='genre', data=df_top_genres, kind='box', height=8, aspect=1.5, order=top_genres)
plt.show()

The box plot above shows the distribution of movie ratings for each genre.

# What are the top 10 highest-grossing movies in the dataset?

In [None]:
# 'box_office' appears to be stored as text (missing values are marked
# 'Not Available'), so it is converted to numbers before ranking; sorting the
# raw strings would order them alphabetically (e.g. '9' ahead of '100').
# NOTE(review): assumes values look like '$1,234,567' or '1234567' — confirm
# against the data; any other format is treated as missing.
box_office_numeric = pd.to_numeric(
    df['box_office'].astype(str).str.replace(r'[$,\s]', '', regex=True),
    errors='coerce',  # 'Not Available' and other non-numeric text become NaN
)

# Drop movies without a usable box office figure and keep the 10 largest
top_grossing = (
    df.assign(box_office=box_office_numeric)
      .dropna(subset=['box_office'])
      .sort_values(by='box_office', ascending=False)
      .head(10)
)

# Create a list of 10 different colors
colors = ['#F44336', '#E91E63', '#9C27B0', '#673AB7', '#3F51B5', '#2196F3', '#00BCD4', '#4CAF50', '#8BC34A', '#FFC107']

# Plot the vertical bar chart (one color per bar, even if fewer than 10 movies remain)
plt.bar(x=top_grossing['name'], height=top_grossing['box_office'], color=colors[:len(top_grossing)])
plt.xticks(rotation=90)
plt.xlabel('Movie Title')
plt.ylabel('Box Office ($)')
plt.title('Top 10 Highest-Grossing Movies')
plt.show()


# Which directors have the most movies in the dataset?

In [None]:
# Number of movies per value of the 'directors' column, most frequent first
director_count = df['directors'].value_counts()

# Show the ten most frequent entries
top_directors = director_count.head(10)
display(top_directors)

In [None]:
# Ten 'directors' values with the most movies, largest group first
movies_per_director = (
    df.groupby('directors')
      .size()
      .sort_values(ascending=False)
      .head(10)
)

# Bar chart of the movie counts
fig, ax = plt.subplots(figsize=(15, 5))
movies_per_director.plot(kind='bar', ax=ax, color=sns.color_palette('Set3'))
ax.set_xlabel("Directors Name")
ax.set_ylabel("Number of Movies")
ax.set_title("Top 10 directors who directed more movies in IMDB top 250 movies")
plt.show()