In [18]:
# Importing Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Dataset source : [link](https://www.kaggle.com/datasets/andrezaza/clapper-massive-rotten-tomatoes-movies-and-reviews)

In [19]:
# Loading the dataset
df = pd.read_csv("../../data/movie-raw-data-csv/rotten_tomatoes_movies.csv", low_memory=False)
df.sample(3).T

Unnamed: 0,76194,58578,43910
id,in_my_room,aatish,if_looks_could_kill_2017
title,In My Room,Aatish,If Looks Could Kill
audienceScore,,62.0,11.0
tomatoMeter,80.0,,
rating,,,
ratingContents,,,
releaseDateTheaters,2019-10-11,,
releaseDateStreaming,2020-05-31,2016-11-30,2017-03-07
runtimeMinutes,119.0,150.0,90.0
genre,Drama,"Action, Crime, Drama","Mystery & thriller, Drama"


In [20]:
# The number of rows and columns in the dataset
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

Number of rows: 143258
Number of columns: 16


In [21]:
# Checking column Dtypes, non-null counts, and size
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143258 entries, 0 to 143257
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    143258 non-null  object 
 1   title                 142891 non-null  object 
 2   audienceScore         73248 non-null   float64
 3   tomatoMeter           33877 non-null   float64
 4   rating                13991 non-null   object 
 5   ratingContents        13991 non-null   object 
 6   releaseDateTheaters   30773 non-null   object 
 7   releaseDateStreaming  79420 non-null   object 
 8   runtimeMinutes        129431 non-null  float64
 9   genre                 132175 non-null  object 
 10  originalLanguage      129400 non-null  object 
 11  director              139041 non-null  object 
 12  writer                90116 non-null   object 
 13  boxOffice             14743 non-null   object 
 14  distributor           23001 non-null   object 
 15  

In [22]:
# Displaying statistics of the dataset (Numerical columns)
df.describe(include=[np.number]).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
audienceScore,73248.0,55.674967,24.553648,0.0,37.0,57.0,76.0,100.0
tomatoMeter,33877.0,65.770346,28.023203,0.0,45.0,73.0,89.0,100.0
runtimeMinutes,129431.0,93.708578,28.129175,1.0,84.0,92.0,103.0,2700.0


In [23]:
# Displaying statistics of the dataset (Categorical columns)
df.describe(include=['object', 'category']).T

Unnamed: 0,count,unique,top,freq
id,143258,142052,second_best_2004,2
title,142891,126403,The Stranger,17
rating,13991,10,R,7734
ratingContents,13991,8353,['Language'],365
releaseDateTheaters,30773,12062,2018-09-14,37
releaseDateStreaming,79420,4726,2017-05-22,1232
genre,132175,2912,Drama,27860
originalLanguage,129400,112,English,85034
director,139041,62206,Unknown Director,3544
writer,90116,67274,Jing Wong,48


In [24]:
# Identify number of unique movies
print(f"Number of unique movies: {df['title'].nunique()}")  

Number of unique movies: 126403


In [25]:
# Identify latest and oldest release dates (Theaters)

# Convert to datetime (invalid values become NaT)
df['releaseDateTheaters'] = pd.to_datetime(df['releaseDateTheaters'], errors='coerce')

# Now safely find oldest and latest
latest_date = df['releaseDateTheaters'].max()
oldest_date = df['releaseDateTheaters'].min()

print(f"Latest release date: {latest_date}")
print(f"Oldest release date: {oldest_date}")


Latest release date: 2032-04-21 00:00:00
Oldest release date: 1902-10-04 00:00:00


In [26]:
# Identify latest and oldest release dates (Steartming)

# Convert to datetime (invalid values become NaT)
df['releaseDateStreaming'] = pd.to_datetime(df['releaseDateStreaming'], errors='coerce')

# Now safely find oldest and latest
latest_date = df['releaseDateStreaming'].max()
oldest_date = df['releaseDateStreaming'].min()

print(f"Latest release date: {latest_date}")
print(f"Oldest release date: {oldest_date}")

Latest release date: 2023-12-22 00:00:00
Oldest release date: 1928-12-18 00:00:00
