In [21]:
# Mount our Google Drive
# from google.colab import drive

# drive.mount('/content/drive')

In [22]:
# Importing Necessary Libraries
from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
# Load the movie CSV into `df` and preview the first two records,
# transposed so that every column is shown as its own row.
csv_path = "../../data/movie-data-csv/imdb.csv"
df = pd.read_csv(csv_path)
df.head(2).transpose()

Unnamed: 0,0,1
ID,1,2
Title,Oscar et la dame rose,Cupid
genres,drama,thriller
plot,Listening in to a conversation between his doc...,A brother and sister with a past incestuous re...
date,2009.0,1997.0


### Data Understanding

In [24]:
# List the column labels of the loaded frame
df.columns

Index(['ID', 'Title', 'genres', 'plot', 'date'], dtype='object')

In [25]:
# (number of rows, number of columns)
df.shape

(108414, 5)

In [26]:
# Print per-column dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108414 entries, 0 to 108413
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      108414 non-null  int64  
 1   Title   108414 non-null  object 
 2   genres  108414 non-null  object 
 3   plot    108414 non-null  object 
 4   date    99682 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.1+ MB


In [27]:
# Summarise missing values per column: absolute count and share of rows (in %),
# ordered so the columns with the most missing values come first.
null_mask = df.isnull()
missing_info = (
    null_mask.sum()
    .to_frame(name='Null Count')
    .assign(**{'Null Percentage (%)': (null_mask.mean() * 100).round(2)})
    .sort_values(by='Null Count', ascending=False)
)
missing_info

Unnamed: 0,Null Count,Null Percentage (%)
date,8732,8.05
ID,0,0.0
Title,0,0.0
genres,0,0.0
plot,0,0.0


In [28]:
# Summary statistics of the numeric columns, transposed so each column is a row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,108414.0,27104.000452,15648.28597,1.0,13552.25,27104.0,40655.75,54214.0
date,99682.0,1998.718786,22.928415,1891.0,1995.0,2008.0,2013.0,2022.0


Check the maximum and minimum length of the plot descriptions

In [29]:
# Character count of every plot description
description_lengths = df['plot'].str.len()

# Extremes of the length distribution
min_length, max_length = description_lengths.min(), description_lengths.max()

print(f"Maximum plot description length: {max_length}")
print(f"Minimum plot description length: {min_length}")

Maximum plot description length: 10503
Minimum plot description length: 37


Display the longest and shortest plots

In [30]:
# Row labels of the longest and the shortest plot description
longest_plot_index, shortest_plot_index = (
    description_lengths.idxmax(),
    description_lengths.idxmin(),
)

# Show the full text of the longest plot
print("Longest Plot:")
df.loc[longest_plot_index, 'plot']

Longest Plot:


'Guy Gabaldon died on August 31, 2006 and the world lost someone very special. During the bloody struggle for Saipan in July 1944, U.S. Marine PFC Guy Gabaldon is indeed officially credited with capturing over 1500 Japanese soldiers and civilians - singlehandedly, a record that is untouchable in the annals of American military history. For over sixty years, Guy talked about his exploits on that island, sharing his experience and using his celebrity to inspire new generations who valued bravery and bravado. However, war experience alone does not make a life, and Guy\'s didn\'t stop in 1944. He lived many different lives and most importantly he took it upon himself to help the less fortunate, particularly the wayward teenagers he encountered when he returned to the Mariana Islands in 1980, where he would live for twenty years. Guy Gabaldon grew up in East Los Angeles where he spent more time on the streets than at home. He would get into fights and he was thrown out of school at one poin

In [31]:
# Show the full text of the shortest plot; `shortest_plot_index` is the row
# label found with idxmin() on the description lengths in an earlier cell.
print("Shortest Plot:")
df.loc[shortest_plot_index, 'plot']

Shortest Plot:


'Documental sobre trabajo en el campo.'

Latest and earliest movie release years

In [32]:
# Latest and earliest release years in the dataset. Series.max()/min() skip
# missing values by default, so rows without a date do not affect the result.
latest_movie_year = df['date'].max()
oldest_movie_year = df['date'].min()

# 'date' is a float column (it contains missing values), so format with zero
# decimals to print e.g. 2022 instead of 2022.0.
print(f"Latest release year of a movie: {latest_movie_year:.0f}")
print(f"Oldest release year of a movie: {oldest_movie_year:.0f}")

Latest oldest year of a movie : 2022.0
Oldest realease year a movie: 1891.0


Unique Genres

In [33]:
# Number of distinct values in the 'genres' column
df['genres'].nunique()

27

In [34]:
# The distinct genre labels, in order of first appearance
df['genres'].unique()

array(['drama', 'thriller', 'adult', 'documentary', 'comedy', 'crime',
       'reality-tv', 'horror', 'sport', 'animation', 'action', 'fantasy',
       'short', 'sci-fi', 'music', 'adventure', 'talk-show', 'western',
       'family', 'mystery', 'history', 'news', 'biography', 'romance',
       'game-show', 'musical', 'war'], dtype=object)

### EDA

In [35]:
# Preview the first rows (head() defaults to five)
df.head()

Unnamed: 0,ID,Title,genres,plot,date
0,1,Oscar et la dame rose,drama,Listening in to a conversation between his doc...,2009.0
1,2,Cupid,thriller,A brother and sister with a past incestuous re...,1997.0
2,3,"Young, Wild and Wonderful",adult,As the bus empties the students for their fiel...,1980.0
3,4,The Secret Sin,drama,To help their unemployed father make ends meet...,1915.0
4,5,The Unrecovered,drama,The film's title refers not only to the un-rec...,2007.0


In [36]:
# Genre frequency table: how many movies carry each genre and what share of
# all genre labels that represents.
# (`Counter` comes from the notebook's top import cell.)


def _split_genres(value):
    """Split a comma-separated genre string into a list of stripped names.

    Non-string values (lists produced by a previous run of this cell, or
    missing values) are returned unchanged, which keeps the cell safe to
    re-run on an already-processed frame.
    """
    if isinstance(value, str):
        return [genre.strip() for genre in value.split(',')]
    return value


# Step 1: Split genres (handles multi-genre entries like "drama, thriller")
df['genres'] = df['genres'].apply(_split_genres)

# Step 2: Flatten to one genre label per element. explode() runs in a single
# pass, whereas sum(lists, []) re-copies the growing list on every addition;
# dropna() discards rows that have no genre at all.
all_genres = df['genres'].explode().dropna().tolist()

# Step 3: Count and create DataFrame, most frequent genre first
genre_counts = Counter(all_genres)
genre_df = (
    pd.DataFrame(genre_counts.items(), columns=['genres', 'Count'])
    .sort_values(by='Count', ascending=False)
    .reset_index(drop=True)
)
genre_df['Percentage (%)'] = (genre_df['Count'] / genre_df['Count'].sum()) * 100

genre_df

Unnamed: 0,genres,Count,Percentage (%)
0,drama,27225,25.11207
1,documentary,26192,24.159241
2,comedy,14893,13.737156
3,short,10145,9.357648
4,horror,4408,4.065896
5,thriller,3181,2.934123
6,action,2629,2.424964
7,western,2064,1.903813
8,reality-tv,1767,1.629863
9,family,1567,1.445385
