# Dataset Preprocessing

### Multi-label processing

The `listed_in` column holds the categories each title belongs to, separated by ",".


So for each sample, the label should be a binary vector indicating whether each possible category exists or not.

In [2]:
import pandas as pd

In [2]:
# Load the raw Netflix catalogue and show which columns it provides.
file_path = "netflix_titles.csv"
original_data = pd.read_csv(file_path)

column_names = list(original_data.columns)
print(column_names)

['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description']


In [3]:
# Columns to keep from the raw table, mapped to the names used in the rest of
# the notebook. An explicit old -> new mapping (instead of a positional list
# assigned to `.columns`) keeps the rename correct even if the selection order
# is edited later.
column_renames = {
    'show_id': 'id',
    'type': 'type',
    'title': 'title',
    'director': 'director',
    'cast': 'cast',
    'country': 'country',
    'date_added': 'date added to netflix',
    'release_year': 'release year',
    'rating': 'rating',
    'duration': 'duration',
    'listed_in': 'category',
    'description': 'description',
}
selected_columns = list(column_renames)

# Take an explicit copy: later cells add and overwrite columns on
# `filtered_data`, and doing that on a plain selection of `original_data`
# triggers pandas' SettingWithCopyWarning.
filtered_data = original_data[selected_columns].copy()
filtered_data = filtered_data.rename(columns=column_renames)

print(filtered_data.head())

   id     type                  title         director  \
0  s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1  s2  TV Show          Blood & Water              NaN   
2  s3  TV Show              Ganglands  Julien Leclercq   
3  s4  TV Show  Jailbirds New Orleans              NaN   
4  s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

  date added to netflix  release year rating   duration  \
0    September 25, 2021          2020  PG-13     90 min   
1    September 24, 2021          2021  TV-MA  2 Seasons   
2    September 24, 2021          2021  TV-MA   

In [4]:
# Turn each comma-separated 'category' string into a list of labels with
# surrounding whitespace removed. Kept in its own Series so the source column
# is not overwritten with a different type.
category_lists = filtered_data['category'].apply(
    lambda x: [label.strip() for label in x.split(',')]
)

# Every distinct label that occurs anywhere in the dataset.
all_labels = set(label for labels in category_lists for label in labels)

# Print all unique labels
print("All label：", all_labels)

# Iterating a set of strings yields a different order on every kernel restart
# (string hashing is randomized), which would shuffle the label columns of the
# saved CSV from run to run. Sorting makes the column layout reproducible.
label_order = sorted(all_labels)

# One binary column per label: 1 if the title carries the label, else 0.
# Built in a single DataFrame rather than inserting columns one at a time.
label_matrix = pd.DataFrame(
    {
        label: [1 if label in labels else 0 for labels in category_lists]
        for label in label_order
    },
    index=filtered_data.index,
)

# Replace the original 'category' column with the binary label columns.
filtered_data = pd.concat(
    [filtered_data.drop(columns='category'), label_matrix], axis=1
)

All label： {'Thrillers', 'TV Sci-Fi & Fantasy', 'TV Thrillers', 'Movies', 'Dramas', 'Sci-Fi & Fantasy', 'Romantic TV Shows', 'TV Comedies', 'Spanish-Language TV Shows', 'Docuseries', 'Classic Movies', "Kids' TV", 'Children & Family Movies', 'British TV Shows', 'TV Horror', 'Independent Movies', 'Anime Features', 'Anime Series', 'International TV Shows', 'Crime TV Shows', 'Teen TV Shows', 'Classic & Cult TV', 'Science & Nature TV', 'TV Shows', 'Sports Movies', 'Horror Movies', 'TV Action & Adventure', 'Music & Musicals', 'International Movies', 'Romantic Movies', 'Korean TV Shows', 'Reality TV', 'Stand-Up Comedy & Talk Shows', 'Cult Movies', 'TV Mysteries', 'Action & Adventure', 'TV Dramas', 'Documentaries', 'Faith & Spirituality', 'Comedies', 'LGBTQ Movies', 'Stand-Up Comedy'}


In [4]:
# Persist the multi-label dataset so that text pre-processing and dataset
# segmentation can start from it.
output_path = "preprocessed_netflix_titles.csv"
filtered_data.to_csv(path_or_buf=output_path, index=False)

### Data cleaning

In [3]:
# Reload the multi-label dataset from disk.
df = pd.read_csv("preprocessed_netflix_titles.csv")

# Count the missing values column by column.
null_counts = df.isna().sum()

print("Number of nulls in each column:")
print(null_counts)

Number of nulls in each column:
id                                 0
type                               0
title                              0
director                        2634
cast                             825
country                          831
date added to netflix             10
release year                       0
rating                             4
duration                           3
description                        0
Thrillers                          0
TV Sci-Fi & Fantasy                0
TV Thrillers                       0
Movies                             0
Dramas                             0
Sci-Fi & Fantasy                   0
Romantic TV Shows                  0
TV Comedies                        0
Spanish-Language TV Shows          0
Docuseries                         0
Classic Movies                     0
Kids' TV                           0
Children & Family Movies           0
British TV Shows                   0
TV Horror                          0
Indepe

In [4]:
# Replace every missing value with the placeholder string 'Unknown'.
df = df.fillna('Unknown')

In [5]:
#  Get the unique value in the column "rating"
ratings = df["rating"].unique()

print(" The column 'rating' includes the following:")
print(ratings)

 The column 'rating' includes the following:
['PG-13' 'TV-MA' 'PG' 'TV-14' 'TV-PG' 'TV-Y' 'TV-Y7' 'R' 'TV-G' 'G'
 'NC-17' '74 min' '84 min' '66 min' 'NR' 'Unknown' 'TV-Y7-FV' 'UR']


In [6]:
# A few rows carry a runtime ('74 min', '84 min', '66 min') in the 'rating'
# column instead of an actual rating.
misplaced_durations = ['74 min', '84 min', '66 min']
is_misplaced = df['rating'].isin(misplaced_durations)

# NOTE(review): the null counts report exactly 3 missing 'duration' values,
# presumably these same rows -- confirm. Rather than throwing the runtime
# away, move it into 'duration' wherever that field is missing (NaN, or the
# 'Unknown' placeholder written by the earlier fillna step).
duration_missing = df['duration'].isna() | df['duration'].eq('Unknown')
recoverable = is_misplaced & duration_missing
df.loc[recoverable, 'duration'] = df.loc[recoverable, 'rating']

# The rating itself is still unknown for these rows.
df.loc[is_misplaced, 'rating'] = 'Unknown'

print(df['rating'].unique())

['PG-13' 'TV-MA' 'PG' 'TV-14' 'TV-PG' 'TV-Y' 'TV-Y7' 'R' 'TV-G' 'G'
 'NC-17' 'Unknown' 'NR' 'TV-Y7-FV' 'UR']


In [7]:
# Write the cleaned dataset back to disk without the index column.
df.to_csv(path_or_buf="preprocessed_netflix_titles.csv", index=False)