# Loading Data

In [11]:
import pandas as pd

# Load a preview of the training data and inspect its structure.
# NOTE(review): nrows=5 reads only the first five rows, so any later cell that
# reuses `train` works on this five-row sample — confirm that is intended
# before relying on statistics or files derived from it.
train = pd.read_csv(r"Data/train.csv", nrows=5)

print(train.columns.tolist())
print(train.shape)
print(train.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
train.info()


['id', 'Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment', 'Listening_Time_minutes']
(5, 12)
   id     Podcast_Name Episode_Title  Episode_Length_minutes       Genre  \
0   0  Mystery Matters    Episode 98                     NaN  True Crime   
1   1    Joke Junction    Episode 26                  119.80      Comedy   
2   2   Study Sessions    Episode 16                   73.90   Education   
3   3   Digital Digest    Episode 45                   67.17  Technology   
4   4      Mind & Body    Episode 86                  110.51      Health   

   Host_Popularity_percentage Publication_Day Publication_Time  \
0                       74.81        Thursday            Night   
1                       66.95        Saturday        Afternoon   
2                       69.97         Tuesday          Evening   
3                       57.22   

In [12]:
# Load a preview (first five rows) of the test data and inspect its structure.
test = pd.read_csv(r"Data/test.csv", nrows=5)

print(test.columns.tolist())
print(test.shape)
print(test.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
test.info()

['id', 'Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment']
(5, 11)
       id         Podcast_Name Episode_Title  Episode_Length_minutes  \
0  750000  Educational Nuggets    Episode 73                   78.96   
1  750001          Sound Waves    Episode 23                   27.87   
2  750002        Joke Junction    Episode 11                   69.10   
3  750003        Comedy Corner    Episode 73                  115.39   
4  750004         Life Lessons    Episode 50                   72.32   

       Genre  Host_Popularity_percentage Publication_Day Publication_Time  \
0  Education                       38.11        Saturday          Evening   
1      Music                       71.29          Sunday          Morning   
2     Comedy                       67.89          Friday          Evening   
3     Comedy                       23.

# Cleaning Data
## Things to do:
- Compare the train and test columns, and exclude the label column before filling missing values
- Find duplicates and remove them
- Find missing values and fill them in
- Align the train and test data so they have the same columns
- Save the cleaned data

In [17]:
# Column treated as the label by the cleaning steps
label_col = 'Listening_Time_minutes'

# Gather each frame's column names as sets so they can be diffed
train_cols, test_cols = set(train.columns), set(test.columns)

# Report the columns that exist in train but not in test
print("Columns only in train:", train_cols.difference(test_cols))

Columns only in train: {'Listening_Time_minutes'}


In [23]:
# Clean the data: drop duplicate training rows, fill missing feature values
# using statistics computed from train only, align the columns, and save.

# Drop exact duplicate rows from train (test rows are left untouched).
# NOTE(review): every column, including `id`, takes part in the comparison,
# so rows only count as duplicates when their ids match too — confirm.
train = train.drop_duplicates()

# Every column except the label is a candidate for imputation
feature_cols = [col for col in train.columns if col != label_col]

# Fill missing values column by column. The fill value always comes from
# train and is reused for test so both frames are imputed consistently.
for col in feature_cols:
    if pd.api.types.is_numeric_dtype(train[col]):
        # Numeric column: use the median of the training values
        fill_value = train[col].median()
    else:
        # Non-numeric column: use the most frequent training value.
        # mode() is empty when the column holds no non-missing values.
        modes = train[col].mode()
        fill_value = modes.iloc[0] if not modes.empty else None

    # Nothing usable to fill with (column is entirely missing in train)
    if pd.isna(fill_value):
        continue

    # fillna only touches NaN entries; existing values are kept
    train[col] = train[col].fillna(fill_value)
    # A feature absent from test is left for the align step below to create
    if col in test.columns:
        test[col] = test[col].fillna(fill_value)

# Align train/test data on train's columns.
# NOTE(review): join="left" keeps every train column, so the label column is
# added to test and filled with 0 — confirm a placeholder label is wanted in
# the saved test file.
train, test = train.align(test, join="left", axis=1, fill_value=0)

# Save the cleaned data
train.to_csv("Data/clean_train.csv", index=False)
test.to_csv("Data/clean_test.csv", index=False)

print('Cleaned data saved successfully: \nData/clean_train.csv\nData/clean_test.csv\n')



# Read back the first five rows of each saved file to confirm the write worked
clean_train, clean_test = (
    pd.read_csv(csv_path, nrows=5)
    for csv_path in ("Data/clean_train.csv", "Data/clean_test.csv")
)

# Show the cleaned training preview: column names first, then the leading rows
print(clean_train.columns.tolist(), clean_train.head(), sep="\n")


Cleaned data saved successfully: 
Data/clean_train.csv
Data/clean_test.csv

['id', 'Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment', 'Listening_Time_minutes']
   id     Podcast_Name Episode_Title  Episode_Length_minutes       Genre  \
0   0  Mystery Matters    Episode 98                  92.205  True Crime   
1   1    Joke Junction    Episode 26                 119.800      Comedy   
2   2   Study Sessions    Episode 16                  73.900   Education   
3   3   Digital Digest    Episode 45                  67.170  Technology   
4   4      Mind & Body    Episode 86                 110.510      Health   

   Host_Popularity_percentage Publication_Day Publication_Time  \
0                       74.81        Thursday            Night   
1                       66.95        Saturday        Afternoon   
2                       69.97 

In [21]:
# Show the cleaned test preview: column names first, then the leading rows
print(clean_test.columns.tolist(), clean_test.head(), sep="\n")

['id', 'Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment', 'Listening_Time_minutes']
       id         Podcast_Name Episode_Title  Episode_Length_minutes  \
0  750000  Educational Nuggets    Episode 73                   78.96   
1  750001          Sound Waves    Episode 23                   27.87   
2  750002        Joke Junction    Episode 11                   69.10   
3  750003        Comedy Corner    Episode 73                  115.39   
4  750004         Life Lessons    Episode 50                   72.32   

       Genre  Host_Popularity_percentage Publication_Day Publication_Time  \
0  Education                       38.11        Saturday          Evening   
1      Music                       71.29          Sunday          Morning   
2     Comedy                       67.89          Friday          Evening   
3     Comedy        