# Exploratory Data Analysis Project Group 8

### Library Importation

In [194]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Data Import and Clean

This section's goal is to import different data sheet and clean them to make them meet our expectation of data and be ready for merging.

mbti data sheet

In [195]:
# MBTI data import
df_mbti = pd.read_csv("./data/raw/mbti.csv")

# Drop unrelated comlumns "stat", "enneagram", "img_url"
df_mbti = df_mbti.drop(["stat","enneagram","img_url"],axis = 1)

# Drop duplicated rows
df_mbti = df_mbti.drop_duplicates()

# Check if there is any NaN value or abnormal values in mbti columns 
print(df_mbti["mbti"].unique())
print(df_mbti["mbti"].isna().any())

# Drop the rows are "XXXX" (which is not a mbti type)
df_mbti = df_mbti[df_mbti["mbti"]!= "XXXX"]

# Make sure first letters are capitalized in "role" and "movie" columns
df_mbti["role"] = df_mbti["role"].str.title()
df_mbti["movie"] = df_mbti["movie"].str.title()

# Use str.strip() to remove spaces.
df_mbti["movie"] = df_mbti["movie"].str.strip()

# Regular expression pattern for extracting year: '(.*) \((\d{4})\)'
pattern_year = r'(.*) \((\d{4})\)'

# Extract movies with year
# Save the year into "release_year" columns
df_mbti[['movie_clean', 'release_year']] = df_mbti['movie'].str.extract(pattern_year)

# Regular expression pattern to match content in parentheses
pattern_parentheses = r'\s*\([^)]*\)'

# Remove content in parentheses
# Save movie names without parentheses into "movie_clean" columns
df_mbti['movie_clean'] = df_mbti['movie'].str.replace(pattern_parentheses, '', regex=True)

# Convert "release_year" column to datetime
df_mbti['release_year'] = pd.to_datetime(df_mbti['release_year'], format = '%Y').dt.year

# Drop "movie_name" column
df_mbti.drop("movie",axis=1, inplace=True)

# Rename columns to have consistency in all data sheets
df_mbti.rename(columns={'role': 'character','movie_clean':'movie_name'}, inplace=True)

df_mbti.to_csv("./data/cleaned/df_mbti.csv")

df_mbti.info()

['ESFP' 'XXXX' 'ESFJ' 'ESTP' 'ESTJ' 'ENFP' 'ENFJ' 'ENTP' 'ENTJ' 'ISFP'
 'ISFJ' 'ISTP' 'ISTJ' 'INFP' 'INFJ' 'INTP' 'INTJ']
False
<class 'pandas.core.frame.DataFrame'>
Index: 16727 entries, 0 to 18740
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mbti          16727 non-null  object 
 1   character     16727 non-null  object 
 2   movie_name    16727 non-null  object 
 3   release_year  10939 non-null  float64
dtypes: float64(1), object(3)
memory usage: 653.4+ KB


In [196]:
df_mbti.sample(10)

Unnamed: 0,mbti,character,movie_name,release_year
2776,ESTP,Apollo Creed,Rocky,
3396,ESTP,Fred Flintstone ( 2000),The Flintstones,
17280,INTP,Teresa,Maze Runner,
7209,ENFJ,Megan Mccallister,Home Alone,1990.0
9607,ISFP,Henry Hill,Goodfellas,1990.0
7175,ENFJ,Chisolm,The Magnificent Seven,2016.0
16433,INFJ,Ho Yinsen,Marvel Cinematic Universe,
9398,ENTJ,Ivan Tretiak,The Saint,1996.0
2405,ESFJ,Niño,The Dollars Trilogy,
11071,ISFJ,Jasper Hale,Twilight,


imdb_movies

In [197]:
# imdb_movies data import
df_imdb_map = pd.read_csv("./data/raw/imdb_movies.csv")

# Drop unrelated comlumns "stat", "enneagram", "img_url"
df_imdb_map = df_imdb_map.drop(["overview","crew","orig_title","status","orig_lang"],axis = 1)

# Drop duplicated rows
df_imdb_map = df_imdb_map.drop_duplicates()

# Check if there is any NaN value in "names" columns 
print(df_imdb_map["names"].isna().any())

# Make sure first letters are capitalized in "names" and "genre" columns
df_imdb_map["names"] = df_imdb_map["names"].str.title()
df_imdb_map["genre"] = df_imdb_map["genre"].str.title()

# Strip whitespace
df_imdb_map['date_x'] = df_imdb_map['date_x'].str.strip()

# Convert "date_x" column to datetime
df_imdb_map['date_x'] = pd.to_datetime(df_imdb_map['date_x'], format='%m/%d/%Y')

# Create "release_year" column
df_imdb_map['release_year'] = df_imdb_map['date_x'].dt.year.astype('float')

# Rename columns to have consistency in all data sheets
df_imdb_map.rename(columns={'names': 'movie_name', 'date_x': 'release_date','budget_x':'budget'}, inplace=True)

df_imdb_map.to_csv("./data/cleaned/df_imdb_map.csv")

df_imdb_map.info()

False
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10178 entries, 0 to 10177
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   movie_name    10178 non-null  object        
 1   release_date  10178 non-null  datetime64[ns]
 2   score         10178 non-null  float64       
 3   genre         10093 non-null  object        
 4   budget        10178 non-null  float64       
 5   revenue       10178 non-null  float64       
 6   country       10178 non-null  object        
 7   release_year  10178 non-null  float64       
dtypes: datetime64[ns](1), float64(4), object(3)
memory usage: 636.2+ KB


In [198]:
df_imdb_map.sample(10)

Unnamed: 0,movie_name,release_date,score,genre,budget,revenue,country,release_year
4316,Michael Jackson: The Life Of An Icon,2011-11-02,78.0,Documentary,101700000.0,716719914.2,US,2011.0
4441,We Are Your Friends,2015-08-27,67.0,"Drama, Music, Romance, Comedy",2000000.0,10153415.0,AU,2015.0
2453,Monster Family 2,2022-03-03,67.0,"Animation, Family",112800000.0,356126924.8,AU,2022.0
9205,Grounded: Making The Last Of Us,2013-06-14,74.0,Documentary,79500000.0,708023892.4,US,2013.0
9649,The Catcher Was A Spy,2019-12-01,63.0,"Drama, War, Thriller",72044000.0,699828391.8,AU,2019.0
2134,Peppermint,2018-12-13,67.0,"Action, Thriller",25000000.0,51800758.0,AU,2018.0
6606,Happier Than Ever: A Love Letter To Los Angeles,2021-09-03,80.0,"Music, Documentary",124600000.0,721323777.2,AU,2021.0
4039,"I Love You, Beth Cooper",2009-07-10,59.0,"Comedy, Romance",18000000.0,16382538.0,US,2009.0
5703,Tekken: Blood Vengeance,2011-09-03,68.0,"Action, Animation, Science Fiction",172000000.0,505864462.2,JP,2011.0
7833,Sharknado,2013-09-09,39.0,"Horror, Action, Comedy, Tv Movie, Science Fict...",1000000.0,1240261.6,AU,2013.0


imdb_db

In [199]:
#imdb_db data import
df_db = pd.read_csv("./data/raw/imdb_db.csv")

# Drop unrelated comlumns "Number of Votes","Time Duration (min)","Director","Actors","Restriction","Description","Serie Name","Serie Date"
df_db = df_db.drop(["Number of Votes","Time Duration (min)","Director","Actors","Restriction","Description","Serie Name","Serie Date"],axis = 1)

# Drop duplicated rows
df_db = df_db.drop_duplicates()

# Check if there is any NaN value in "Movie Name" columns 
print(df_db["Movie Name"].isna().any())

# Make sure first letters are capitalized in "Movie Name" and "Movie Type" columns
df_db["Movie Name"] = df_db["Movie Name"].str.title()
df_db["Movie Type"] = df_db["Movie Type"].str.title()

# Remove square brackets in "Movie Type" column
df_db['Movie Type'] = df_db['Movie Type'].str.replace(r'[\[\]]', '', regex=True)

# Remove single quotes in "Movie Type" column
df_db['Movie Type'] = df_db['Movie Type'].str.replace(r"'", '', regex=True)

# Rename columns to have consistency in all data sheets
df_db.rename(columns={'Movie Name': 'movie_name',
                            'Movie Date': 'release_year',
                            'Movie Type':'genre',
                            'Movie Revenue (M$)':'revenue',
                            'Score':'score',
                            'Metascore':'metascore'}, inplace=True)

df_db.to_csv("./data/cleaned/df_db.csv")
df_db.info()

False
<class 'pandas.core.frame.DataFrame'>
Index: 130482 entries, 0 to 189895
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   movie_name    130482 non-null  object 
 1   release_year  130477 non-null  float64
 2   genre         130482 non-null  object 
 3   revenue       9082 non-null    float64
 4   score         130482 non-null  float64
 5   metascore     8672 non-null    float64
dtypes: float64(4), object(2)
memory usage: 7.0+ MB


In [200]:
df_db.sample(10)

Unnamed: 0,movie_name,release_year,genre,revenue,score,metascore
167661,Signal To Noise,2014.0,Documentary,,6.9,
75389,Missionary,2013.0,"Drama, Thriller",2120.0,5.3,48.0
22543,Twin Peaks,1990.0,"Crime, Drama, Mystery",,8.6,
123281,La Fureur Dans Le Sang,2002.0,"Crime, Drama, Mystery",,7.1,
14424,Les Bronzés,1978.0,"Comedy, Romance",,6.9,
181825,Mlb Tonight,2009.0,"News, Sport, Talk-Show",,7.6,
98075,Gorillaz: Clint Eastwood,2001.0,"Animation, Short, Fantasy",,8.0,
140806,Zhen Hun,2018.0,"Drama, Fantasy, Mystery",,7.9,
112955,Immigration Game,2017.0,"Action, Drama, Thriller",,3.3,
75300,Deadline Gallipoli,2015.0,"Drama, War",,7.3,


In [201]:
#combined_dataset = pd.merge(df_imdb_map, df_db, on=['movie_name','release_year'], how='outer')
#combined_dataset.info()

### Merge df_db and df_mbti

In [202]:
# Split df_mbti into two datasets: one where release_year is not null and one where it is null
df_mbti_with_year = df_mbti[df_mbti['release_year'].notnull()]
df_mbti_no_year = df_mbti[df_mbti['release_year'].isnull()]

# Drop release_year for merge
df_mbti_no_year.drop('release_year', inplace=True, axis=1)

# Merge the datasets based only on 'movie_name' for those without a release year
merged_no_year = pd.merge(df_mbti_no_year, df_db, on='movie_name', how='inner')

# Drop abnormal rows (there are multiple matches from the imdb_db dataset which we don't know they are correct or not)
merged_no_year.drop_duplicates(subset=['movie_name','character'],keep = 'first',inplace=True)

# Merge the datasets based on 'movie_name_clean' and 'release_year' for those with a release year
merged_with_year = pd.merge(df_mbti_with_year, df_db, on=['movie_name', 'release_year'], how='inner')

# Drop abnormal rows (there are multiple matches from the imdb_db dataset which we don't know they are correct or not)
merged_with_year.drop_duplicates(subset=['movie_name','character','release_year'],keep = 'first',inplace=True)

# Combine the two merged datasets
final_merged_df = pd.concat([merged_with_year, merged_no_year])

# Display the first few rows of the final merged dataset
final_merged_df.sort_values(['movie_name','mbti']).to_csv("./data/cleaned/df_merged.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mbti_no_year.drop('release_year', inplace=True, axis=1)


In [203]:
final_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2811 entries, 0 to 2646
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mbti          2811 non-null   object 
 1   character     2811 non-null   object 
 2   movie_name    2811 non-null   object 
 3   release_year  2811 non-null   float64
 4   genre         2811 non-null   object 
 5   revenue       384 non-null    float64
 6   score         2811 non-null   float64
 7   metascore     483 non-null    float64
dtypes: float64(4), object(4)
memory usage: 197.6+ KB


In [204]:
final_merged_df.sample(10)

Unnamed: 0,mbti,character,movie_name,release_year,genre,revenue,score,metascore
2193,ESTP,"Rebecca ""Becky"" Sharp Crawley",Vanity Fair,1932.0,"Drama, Romance",,5.7,
798,INFP,Tiny Tim Cratchit,A Christmas Carol,1938.0,"Drama, Family, Fantasy",,7.5,
497,ISFJ,"Leonide ""Leo"" Moguy",Django Unchained,2012.0,"Drama, Western",162805434.0,8.4,81.0
189,ISFJ,Tony Rydinger,The Incredibles,2004.0,"Animation, Action, Adventure",,7.4,
919,ESTP,Cadet Karen Thompson,Police Academy,1997.0,"Comedy, Crime",,5.0,
2550,ENTJ,Matthew Harrison Brady,Inherit The Wind,1999.0,Drama,,7.3,
720,ENTJ,"Victor Von Doom ""Doctor Doom""",Fantastic Four,2005.0,"Action, Adventure, Fantasy",,5.5,
927,ISTP,Cadet/Off./Sgt./Capt. Eugene Tackleberry,Police Academy,1997.0,"Comedy, Crime",,5.0,
883,ISTP,Melina Havelock,James Bond,2015.0,"Action, Comedy",,5.4,
1357,ESFP,Freddy Eynsford-Hill,My Fair Lady,2009.0,"Comedy, Romance",,6.9,
