In [1]:
import pandas as pd

In [2]:
# Load full dataset 
origin_df = pd.read_csv("data/wiki_movie_plots_deduped.csv") 

In [3]:
origin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34886 entries, 0 to 34885
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      34886 non-null  int64 
 1   Title             34886 non-null  object
 2   Origin/Ethnicity  34886 non-null  object
 3   Director          34886 non-null  object
 4   Cast              33464 non-null  object
 5   Genre             34886 non-null  object
 6   Wiki Page         34886 non-null  object
 7   Plot              34886 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.1+ MB


In [4]:
# Missing values
missing_plots = origin_df["Plot"].isna().sum()
print(f"Missing 'Plot' values: {missing_plots}")

Missing 'Plot' values: 0


In [5]:
# Duplicates
duplicate_titles = origin_df.duplicated(subset="Title").sum()
print(f"Duplicate titles: {duplicate_titles}")

Duplicate titles: 2454


In [6]:
duplicates = origin_df[origin_df.duplicated(subset="Title", keep=False)]

if len(duplicates) > 0:
    print(f"Found {len(duplicates)} duplicate records (by Title):\n")
    print(duplicates.sort_values("Title"))
else:
    print("No duplicate titles found.")

Found 4525 duplicate records (by Title):

       Release Year        Title Origin/Ethnicity                Director  \
17813          2009        $9.99       Australian         Tatia Rosenthal   
17796          2008        $9.99       Australian         Tatia Rosenthal   
34228          2014           10          Russian                 Unknown   
9556           1979           10         American           Blake Edwards   
17168          2017  100 Streets         American            Jim O'Hanlon   
...             ...          ...              ...                     ...   
24272          1940      Zindagi        Bollywood             P. C. Barua   
24586          1964      Zindagi        Bollywood                 Unknown   
24989          1976      Zindagi        Bollywood                 Unknown   
17177          2017         iBoy         American            Adam Randall   
21657          2017         iBoy          British  Director: Adam Randall   

                                 

In [7]:
df_no_dupes = origin_df.drop_duplicates(subset="Title", keep="first")

In [8]:
# Whitespace issues
df_no_dupes["Title_stripped"] = df_no_dupes["Title"].astype(str).str.strip()
df_no_dupes["Plot_stripped"] = df_no_dupes["Plot"].astype(str).str.strip()
whitespace_issues = (
    (df_no_dupes["Title"] != df_no_dupes["Title_stripped"]).sum()
    + (df_no_dupes["Plot"] != df_no_dupes["Plot_stripped"]).sum()
)
print(f"Rows with leading/trailing whitespace: {whitespace_issues}")

Rows with leading/trailing whitespace: 438


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_dupes["Title_stripped"] = df_no_dupes["Title"].astype(str).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_dupes["Plot_stripped"] = df_no_dupes["Plot"].astype(str).str.strip()


In [9]:
# Keep only the required columns
df = df_no_dupes[["Title", "Plot"]].dropna().reset_index(drop=True)

In [10]:
for col in df.select_dtypes(include="object").columns:
    df[col] = (
        df[col]
        .astype(str)
        .str.replace(r"\s+", " ", regex=True)  
        .str.strip()                           
    )

In [11]:
# random sample 
subset = df.sample(500, random_state=42) 

In [12]:
# Duplicates
duplicate_titles = subset.duplicated(subset="Title").sum()
print(f"Duplicate titles: {duplicate_titles}")

Duplicate titles: 0


In [13]:
# Save to data folder 
subset.to_csv("data/movies_data_subset_clean.csv", index=False) 