In [7]:
# Condensed workflow: the most common order for cleaning a join key

# Check that the column exists (any missing values? is it the key it is supposed to be?)
# Check uniqueness (what must be unique is unique; where it is not, understand why)
# Check dtype consistency (cast everything to str or int so the join does not misbehave)
# Check the join match rate (how many keys match? how many do not?)
# Run the actual join, then check the row count after the join

In [None]:
import pandas as pd
import numpy as np
import ast

# --- MovieLens 20M tables -------------------------------------------------
# Only the listed columns are loaded from each file.
ratings = pd.read_csv(
    "row_data/MovieLens 20M Dataset/rating.csv",
    usecols=["userId", "movieId", "rating", "timestamp"],
)
movies = pd.read_csv(
    "row_data/MovieLens 20M Dataset/movie.csv",
    usecols=["movieId", "title", "genres"],
)
links = pd.read_csv(
    "row_data/MovieLens 20M Dataset/link.csv",
    usecols=["movieId", "imdbId", "tmdbId"],
)

# --- The Movies Dataset tables --------------------------------------------
# About low_memory=False:
#   It tells pandas not to infer dtypes chunk by chunk in order to save memory;
#   the whole file is read first and each column's dtype is inferred once.
#   Upside:   avoids inconsistent dtype guesses and most DtypeWarning messages.
#   Downside: may use more memory, because the full file is read before deciding.
metadata = pd.read_csv(
    "row_data/The Movies Dataset/movies_metadata.csv",
    usecols=["id", "title", "genres", "overview", "release_date", "runtime", "original_language"],
    low_memory=False,
)

# Both files are loaded in full (no usecols).
# NOTE(review): the author's notes list the columns as id, title, cast, crew (credits)
# and id, keywords (keywords) — not verifiable from this cell, confirm against the files.
credits = pd.read_csv("row_data/The Movies Dataset/credits.csv")
keywords = pd.read_csv("row_data/The Movies Dataset/keywords.csv")

In [32]:
# 1.1 Check the join key for missing values.
# A bare expression that is not the last line of a cell is evaluated and then
# discarded, so the null count is turned into an assertion that actually guards
# the key every time the notebook is re-run.
n_missing_ids = metadata["id"].isna().sum()  # the author observed 0 on this dataset
assert n_missing_ids == 0, f"metadata['id'] has {n_missing_ids} missing values"

# info() shows the same thing at a glance: compare the Non-Null Count of the key
# column with the other columns and with the total number of entries.
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   genres             45466 non-null  object 
 1   id                 45466 non-null  object 
 2   original_language  45455 non-null  object 
 3   overview           44512 non-null  object 
 4   release_date       45379 non-null  object 
 5   runtime            45203 non-null  float64
 6   title              45460 non-null  object 
dtypes: float64(1), object(6)
memory usage: 2.4+ MB


In [36]:
# 1.2.1 Check the join key for duplicates.
# Series.duplicated() returns a boolean mask; `keep` decides which occurrences are flagged:
#   keep=False             -> every row of a duplicated group is True
#   keep="first" (default) -> every occurrence except the first is True
#   keep="last"            -> every occurrence except the last is True
is_dup_id = metadata["id"].duplicated(keep=False)
is_dup_id.sum()  # 59 rows were flagged when the author ran this

# Show a small sample of the flagged rows, ordered by id so duplicates sit next to each other.
dup_ids = metadata.loc[is_dup_id].sort_values("id").head(5)
dup_ids

Unnamed: 0,genres,id,original_language,overview,release_date,runtime,title
676,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",105045,de,"East-Berlin, 1961, shortly after the erection ...",1995-02-16,115.0,The Promise
1465,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",105045,de,"East-Berlin, 1961, shortly after the erection ...",1995-02-16,115.0,The Promise
44821,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",10991,ja,When Molly Hale's sadness of her father's disa...,2000-07-08,93.0,Pokémon: Spell of the Unknown
4114,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",10991,ja,When Molly Hale's sadness of her father's disa...,2000-07-08,93.0,Pokémon: Spell of the Unknown
5710,"[{'id': 18, 'name': 'Drama'}]",109962,en,Two literary women compete for 20 years: one w...,1981-09-23,115.0,Rich and Famous


In [None]:
# 1.2.2 Drop rows with a repeated id, keeping the first row of each group.
unique_ids_df = metadata.drop_duplicates(subset="id", keep="first")

# Row counts before and after (values recorded by the author).
metadata.shape[0]       # 45466
unique_ids_df.shape[0]  # 45436

45436

In [None]:
# 1.3.1 Check the key's data type and surface odd values (the actual cleaning starts here).

# Approach A: flag ids whose string form is not made up purely of digits.
id_as_text = unique_ids_df["id"].astype(str)
mask = ~id_as_text.str.isdigit()
rr = unique_ids_df.loc[mask, "id"]

# Approach B (preferred): state the expected type and coerce; anything that
# cannot be parsed becomes NaN, which marks it as a bad value.
parsed_ids = pd.to_numeric(unique_ids_df["id"], errors="coerce")
id_numeric = parsed_ids.isna()  # note: True means the id is NOT numeric
bad_ids = unique_ids_df.loc[id_numeric, "id"]

bad_ids

19730    1997-08-20
29503    2012-09-29
35587    2014-01-01
Name: id, dtype: object

In [None]:
# 1.3.2 Remove the rows with unusable ids, then cast the key to int.
is_bad_id = unique_ids_df["id"].isin(bad_ids)

# assign() returns a new frame, so unique_ids_df itself is left untouched.
subset = unique_ids_df.loc[~is_bad_id].assign(id=lambda frame: frame["id"].astype(int))

subset

Unnamed: 0,genres,id,original_language,overview,release_date,runtime,title
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,81.0,Toy Story
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,1995-12-15,104.0,Jumanji
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,1995-12-22,101.0,Grumpier Old Men
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,127.0,Waiting to Exhale
4,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Just when George Banks has recovered from his ...,1995-02-10,106.0,Father of the Bride Part II
...,...,...,...,...,...,...,...
45461,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",439050,fa,Rising and falling between a man and woman.,,90.0,Subdue
45462,"[{'id': 18, 'name': 'Drama'}]",111109,tl,An artist struggles to finish his work while a...,2011-11-17,360.0,Century of Birthing
45463,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",67758,en,"When one of her hits goes wrong, a professiona...",2003-08-01,90.0,Betrayal
45464,[],227506,en,"In a small town live two brothers, one a minis...",1917-10-21,87.0,Satan Triumphant
