<div align="center">
    <img src="../Logo/Ahjin_Logo-removebg-preview.png"
         alt="Ahjin Logo"
         style="width:250px; height:250px; border-radius:50%;">
</div>

In [10]:
# ------- [Import all relevant libraries] -------

# Utilities
import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including library deprecation warnings — consider a narrower filter.
warnings.filterwarnings('ignore')

# Usual Suspects
import numpy as np           # Mathematical operations
import pandas as pd          # Data manipulation

# Visualization
import matplotlib.pyplot as plt
# Apply the 'ggplot' style sheet to all matplotlib figures created afterwards
plt.style.use('ggplot')
import seaborn as sns

# String manipulation
import re

# Mathematical Operations
import math

# Display settings
# max_colwidth=None: show full cell contents (no truncation of long strings)
pd.set_option('display.max_colwidth', None)
from IPython.display import display

In [11]:
# Read the movies table from the raw-data folder and preview it.
# The preview shows the columns movieId, title and genres (pipe-separated).
movies_csv_path = '../Raw Data/movies.csv'
movie_data = pd.read_csv(movies_csv_path)
movie_data

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [12]:
# Read the ratings table from the raw-data folder and preview it.
# The preview shows one row per (userId, movieId) rating with a timestamp.
ratings_csv_path = '../Raw Data/ratings.csv'
ratings_data = pd.read_csv(ratings_csv_path)
ratings_data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [13]:
# Read the links table (movieId -> imdbId / tmdbId) from the raw-data folder
# and preview it.
# NOTE(review): tmdbId is displayed as a float (e.g. 862.0) — presumably the
# column contains missing values; verify before using it as a key.
links_csv_path = '../Raw Data/links.csv'
links_data = pd.read_csv(links_csv_path)
links_data

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [14]:
# Read the tags table from the raw-data folder and preview it.
# The preview shows free-text tags per (userId, movieId) with a timestamp.
tags_csv_path = '../Raw Data/tags.csv'
tags_data = pd.read_csv(tags_csv_path)
tags_data

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [15]:
# Merge ratings with movies to get movie info (title, genres) per rating.
# - how="left" keeps every rating row, even if its movieId has no match.
# - validate="many_to_one" makes pandas raise a MergeError if a movieId
#   appears more than once in movie_data; without it, duplicate keys on the
#   right side would silently multiply rating rows.
df = ratings_data.merge(
    movie_data,
    on="movieId",
    how="left",
    validate="many_to_one",
)
df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,168250,5.0,1494273047,Get Out (2017),Horror
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi


In [16]:
# Drop the user id and the timestamp (movieId is kept).
# errors='ignore' keeps this cell idempotent: because `df` is overwritten,
# re-running the cell would otherwise raise a KeyError for the columns that
# were already removed on the first run.
# NOTE: once userId is gone, ratings of the same movie with the same score
# from different users become identical rows.
df = df.drop(columns=['userId', 'timestamp'], errors='ignore')
df

Unnamed: 0,movieId,rating,title,genres
0,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,3,4.0,Grumpier Old Men (1995),Comedy|Romance
2,6,4.0,Heat (1995),Action|Crime|Thriller
3,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...
100831,166534,4.0,Split (2017),Drama|Horror|Thriller
100832,168248,5.0,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,168250,5.0,Get Out (2017),Horror
100834,168252,5.0,Logan (2017),Action|Sci-Fi


In [17]:
# ---- [Initial Data Exploration (IDE)] ----

# Separator printed before each section: a blank line followed by a rule
section_rule = '\n' + '--' * 46

# Check dataset shape
print(f"The dataset has {df.shape[0]} rows and {df.shape[1]} columns.")

# Check columns
print(section_rule)
print("Columns:")
display(df.columns)

# Check metadata
print(section_rule)
print("Metadata Check:")
# df.info() prints its report itself and returns None, so it must not be
# wrapped in display() — doing so renders a stray "None" under the report.
df.info()

# Descriptive statistics
print(section_rule)
print("Descriptive Statistics For Numeric Variables:")
display(df.describe().T)

# Categorical Variables
print(section_rule)
print("Descriptive Statistics For Categorical Variables:")
display(df.describe(include='object').T)

The dataset has 100836 rows and 4 columns.

--------------------------------------------------------------------------------------------
Columns:


Index(['movieId', 'rating', 'title', 'genres'], dtype='object')


--------------------------------------------------------------------------------------------
Metadata Check:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   movieId  100836 non-null  int64  
 1   rating   100836 non-null  float64
 2   title    100836 non-null  object 
 3   genres   100836 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 3.1+ MB


None


--------------------------------------------------------------------------------------------
Descriptive Statistics For Numeric Variables:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
movieId,100836.0,19435.295718,35530.987199,1.0,1199.0,2991.0,8122.0,193609.0
rating,100836.0,3.501557,1.042529,0.5,3.0,3.5,4.0,5.0



--------------------------------------------------------------------------------------------
Descriptive Statistics For Categorical Variables:


Unnamed: 0,count,unique,top,freq
title,100836,9719,Forrest Gump (1994),329
genres,100836,951,Comedy,7196


In [18]:
# Count fully duplicated rows (this cell only reports them; nothing is removed).
# NOTE(review): userId was dropped from df earlier, so identical
# (movie, rating) rows coming from different users are counted as duplicates
# here — confirm whether they should be treated as such before removing any.
duplicate_row_count = df.duplicated().sum()
print("Duplicates:", duplicate_row_count)

# Check data completeness: number of missing values per column
divider = '\n' + '--' * 20
print(divider)
print("Missingness check:")
null_counts_per_column = df.isna().sum()
display(null_counts_per_column)

Duplicates: 70419

----------------------------------------
Missingness check:


movieId    0
rating     0
title      0
genres     0
dtype: int64