# Exploratory Data Analysis Project Group 8

### Library Importation

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Data Import and Cleaning

The goal of this section is to import the different data sheets and clean them so that they meet our data expectations and are ready for merging.

mbti data sheet

In [None]:
# MBTI data import
df_mbti = pd.read_csv("./data/raw/mbti.csv")

# Drop unrelated columns "stat", "enneagram", "img_url", then drop duplicated rows
df_mbti = df_mbti.drop(columns=["stat", "enneagram", "img_url"]).drop_duplicates()

# Check if there is any NaN value or abnormal value in the "mbti" column
print(df_mbti["mbti"].unique())
print(df_mbti["mbti"].isna().any())

# Drop the rows whose type is "XXXX" (which is not an MBTI type).
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
df_mbti = df_mbti.loc[df_mbti["mbti"] != "XXXX"].copy()

# Title-case the "role" and "movie" columns (first letter of every word capitalized)
df_mbti["role"] = df_mbti["role"].str.title()
df_mbti["movie"] = df_mbti["movie"].str.title()

# Regular expression pattern to match content in parentheses
pattern_parentheses = r'\s*\([^)]*\)'

# Remove every parenthesized part, e.g. "(1994)" or "(Franchise)", together with
# the whitespace in front of it, and save the bare title in "movie_clean"
df_mbti["movie_clean"] = df_mbti["movie"].str.replace(pattern_parentheses, "", regex=True)

# Regular expression pattern for extracting year: '(.*) \((\d{4})\)'
pattern_year = r'(.*) \((\d{4})\)'

# Only the second capture group (the four-digit year) is needed; rows whose
# title carries no "(YYYY)" suffix get NaN in "release_year".
# NOTE(review): str.extract returns text, so the years are strings here, while
# imdb_db's release_year is float64 - convert before joining on this column.
df_mbti["release_year"] = df_mbti["movie"].str.extract(pattern_year)[1]

# Rename columns to have consistency in all data sheets
df_mbti = df_mbti.rename(
    columns={
        'role': 'character',
        'movie': 'movie_name',
        'movie_clean': 'movie_name_clean',
    }
)

df_mbti.head(10)

['ESFP' 'XXXX' 'ESFJ' 'ESTP' 'ESTJ' 'ENFP' 'ENFJ' 'ENTP' 'ENTJ' 'ISFP'
 'ISFJ' 'ISTP' 'ISTJ' 'INFP' 'INFJ' 'INTP' 'INTJ']
False


Unnamed: 0,mbti,character,movie_name,movie_name_clean,release_year
0,ESFP,"Peter Quill ""Star-Lord""",Marvel Cinematic Universe,Marvel Cinematic Universe,
1,ESFP,Anakin Skywalker,Star Wars,Star Wars,
2,ESFP,Ron Weasley,Harry Potter (Franchise),Harry Potter,
3,ESFP,Bellatrix Lestrange,Harry Potter (Franchise),Harry Potter,
4,ESFP,Simba,The Lion King (1994),The Lion King,1994.0
5,ESFP,Donkey,Shrek (Franchise),Shrek,
6,ESFP,"Bob Parr ""Mr. Incredible""",The Incredibles (Franchise),The Incredibles,
7,ESFP,Jack Dawson,Titanic (1997),Titanic,1997.0
8,ESFP,Aladdin,Aladdin (1992),Aladdin,1992.0
9,ESFP,Marty Mcfly,Back To The Future (Franchise),Back To The Future,


imdb_movies

In [None]:
# imdb_movies data import: load the sheet, discard the columns that are not
# needed ("overview", "crew", "orig_title", "status", "orig_lang"), then
# remove duplicated rows
df_imdb_map = (
    pd.read_csv("./data/raw/imdb_movies.csv")
    .drop(columns=["overview", "crew", "orig_title", "status", "orig_lang"])
    .drop_duplicates()
)

# Report whether any movie name is missing
print(df_imdb_map["names"].isna().any())

# Normalize the text columns, parse the release date, and align the column
# names with the other data sheets
df_imdb_map = df_imdb_map.assign(
    # Title-case "names" and "genre"
    names=lambda frame: frame["names"].str.title(),
    genre=lambda frame: frame["genre"].str.title(),
    # Strip surrounding whitespace first so the fixed month/day/year format parses
    date_x=lambda frame: pd.to_datetime(frame["date_x"].str.strip(), format='%m/%d/%Y'),
).rename(
    columns={
        'names': 'movie_name',
        'date_x': 'release_date',
        'budget_x': 'budget',
    }
)

df_imdb_map.head(10)

False


Unnamed: 0,movie_name,release_date,score,genre,budget,revenue,country
0,Creed Iii,2023-03-02,73.0,"Drama, Action",75000000.0,271616700.0,AU
1,Avatar: The Way Of Water,2022-12-15,78.0,"Science Fiction, Adventure, Action",460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,2023-04-05,76.0,"Animation, Adventure, Family, Fantasy, Comedy",100000000.0,724459000.0,AU
3,Mummies,2023-01-05,70.0,"Animation, Comedy, Family, Adventure, Fantasy",12300000.0,34200000.0,AU
4,Supercell,2023-03-17,61.0,Action,77000000.0,340942000.0,US
5,Cocaine Bear,2023-02-23,66.0,"Thriller, Comedy, Crime",35000000.0,80000000.0,AU
6,John Wick: Chapter 4,2023-03-23,80.0,"Action, Thriller, Crime",100000000.0,351349400.0,AU
7,Puss In Boots: The Last Wish,2022-12-26,83.0,"Animation, Family, Fantasy, Adventure, Comedy",90000000.0,483480600.0,AU
8,Attack On Titan,2022-09-30,59.0,"Action, Science Fiction",71000000.0,254946500.0,US
9,The Park,2023-03-02,58.0,"Action, Drama, Horror, Science Fiction, Thriller",119200000.0,488962500.0,US


imdb_db

In [None]:
# imdb_db data import: load the sheet, discard the columns that are not needed,
# then remove duplicated rows
df_db = (
    pd.read_csv("./data/raw/imdb_db.csv")
    .drop(
        columns=[
            "Number of Votes",
            "Time Duration (min)",
            "Director",
            "Actors",
            "Restriction",
            "Description",
            "Serie Name",
            "Serie Date",
        ]
    )
    .drop_duplicates()
)

# Report whether any movie name is missing
print(df_db["Movie Name"].isna().any())

# Title-case "Movie Name" and "Movie Type"; for "Movie Type" additionally strip
# the square brackets and then the single quotes from the text
df_db = df_db.assign(
    **{
        "Movie Name": df_db["Movie Name"].str.title(),
        "Movie Type": (
            df_db["Movie Type"]
            .str.title()
            .str.replace(r'[\[\]]', '', regex=True)
            .str.replace(r"'", '', regex=True)
        ),
    }
)

# Align the column names with the other data sheets
df_db = df_db.rename(
    columns={
        'Movie Name': 'movie_name',
        'Movie Date': 'release_year',
        'Movie Type': 'genre',
        'Movie Revenue (M$)': 'revenue',
        'Score': 'score',
        'Metascore': 'metascore',
    }
)

df_db.info()

False
<class 'pandas.core.frame.DataFrame'>
Int64Index: 130482 entries, 0 to 189895
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   movie_name    130482 non-null  object 
 1   release_year  130477 non-null  float64
 2   genre         130482 non-null  object 
 3   revenue       9082 non-null    float64
 4   score         130482 non-null  float64
 5   metascore     8672 non-null    float64
dtypes: float64(4), object(2)
memory usage: 7.0+ MB
