In [1]:
# import libraries
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Let wide frames render up to 100 columns before pandas truncates the display.
pd.set_option('display.max_columns', 100)

# Name of the target column.
TARGET = 'Listening_Time_minutes'

# Categorical feature columns. 'Episode_Num' is not a raw column: it is
# derived from 'Episode_Title' by feature_eng().
CATS = [
    'Podcast_Name',
    'Episode_Num',
    'Genre',
    'Publication_Day',
    'Publication_Time',
    'Episode_Sentiment',
]

# Numeric feature columns (these are the ones median-imputed further down).
NUMS = [
    'Episode_Length_minutes',
    'Host_Popularity_percentage',
    'Guest_Popularity_percentage',
    'Number_of_Ads',
]

# Run label; not referenced by the visible cells -- presumably tags the
# out-of-fold predictions of a downstream model (TODO confirm).
oof_pred_name = '2+3+4_interact_xgb'

In [2]:
import warnings
warnings.simplefilter('ignore')

In [3]:
# train/test use their `id` column as the index; the original dataset is read
# with the default positional index.
train = pd.read_csv('../data/raw/train.csv', index_col='id')
test = pd.read_csv('../data/raw/test.csv', index_col='id')
original = pd.read_csv('../data/raw/podcast_dataset.csv')

# One print call, one line per frame (same text as three separate prints).
print(
    f"Train shape: {train.shape}",
    f"Test  shape: {test.shape}",
    f"Orig  shape: {original.shape}",
    sep="\n",
)
train.head(3)

Train shape: (750000, 11)
Test  shape: (250000, 10)
Orig  shape: (52500, 11)


Unnamed: 0_level_0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531


In [4]:
# Keep only the rows of the original dataset that have a target value, then
# remove exact duplicate rows.
original_clean = original.loc[original[TARGET].notna()].drop_duplicates()

# Stack the cleaned rows under train and renumber the index from 0, which
# discards train's `id` index.
# NOTE: `train` is rebuilt from itself, so running this cell twice appends the
# original rows twice.
train = pd.concat([train, original_clean], ignore_index=True)

In [5]:
# Summary statistics of the numeric columns after appending the original data.
# The displayed maxima exceed 100 for both *_percentage columns and
# Number_of_Ads peaks at a non-integer 103.91; the differing counts also show
# missing values. None of the visible cells clips or removes these rows.
train.describe()

Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes
count,703281.0,794868.0,644364.0,794867.0,794868.0
mean,64.413254,59.877089,52.102374,1.357405,45.443291
std,32.981496,22.888959,28.481602,1.149726,27.138465
min,0.0,1.3,0.0,0.0,0.0
25%,35.67,39.45,28.13,0.0,23.18481
50%,63.77,60.06,53.36,1.0,43.39171
75%,94.01,79.56,76.5,2.0,64.81429
max,325.24,119.46,119.91,103.91,119.97


Add Features

In [6]:
def feature_eng(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the basic derived features added.

    Added columns:

    * ``Episode_Num`` -- the trailing number of ``Episode_Title`` kept as a
      string (``'Episode 98'`` -> ``'98'``). Missing titles, and titles that
      do not end in a number, become NaN.
    * ``is_weekend`` -- 1 when ``Publication_Day`` is ``'Saturday'`` or
      ``'Sunday'``, otherwise 0.

    ``Episode_Title`` is removed from the result. The input frame is never
    modified. If ``Episode_Title`` is already gone (for example because the
    cell is re-run on an already transformed frame), an existing
    ``Episode_Num`` is left untouched instead of raising ``KeyError``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``Publication_Day`` and, on the first call,
        ``Episode_Title``.

    Returns
    -------
    pd.DataFrame
        New frame with the derived columns appended.
    """
    df = df.copy()

    if 'Episode_Title' in df.columns:
        # Extract the number by pattern rather than by character position, so
        # the result does not depend on the title prefix being exactly eight
        # characters long.
        df['Episode_Num'] = df['Episode_Title'].str.extract(r'(\d+)\s*$', expand=False)
        df = df.drop(columns=['Episode_Title'])

    df['is_weekend'] = df['Publication_Day'].isin(['Saturday', 'Sunday']).astype(int)

    return df

# Apply the same feature engineering to both frames (train first, then test).
train, test = feature_eng(train), feature_eng(test)

In [7]:
# Copies of the episode length rounded to 0, 1 and 2 decimals, named
# ELm_r0 .. ELm_r2. This cell precedes the imputation cell, so rows with a
# missing length stay NaN in these columns.
ELM = [f'ELm_r{k}' for k in range(3)]
for k, col_name in enumerate(ELM):
    train[col_name] = train['Episode_Length_minutes'].round(k)
    test[col_name] = test['Episode_Length_minutes'].round(k)

In [8]:
# List the columns to confirm that Episode_Num, is_weekend and the ELm_r*
# columns were added and that Episode_Title is gone.
train.columns

Index(['Podcast_Name', 'Episode_Length_minutes', 'Genre',
       'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time',
       'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment',
       'Listening_Time_minutes', 'Episode_Num', 'is_weekend', 'ELm_r0',
       'ELm_r1', 'ELm_r2'],
      dtype='object')

In [9]:
# Groups of columns whose values are concatenated into one categorical key
# per group. The key column is named by joining the member names with '_'.
selected_comb = [
    # 2-interaction
    ['Episode_Length_minutes', 'Host_Popularity_percentage'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage'],
    ['Episode_Length_minutes', 'Number_of_Ads'],
    ['Episode_Num', 'Host_Popularity_percentage'],
    ['Episode_Num', 'Guest_Popularity_percentage'],
    ['Episode_Num', 'Number_of_Ads'],
    ['Host_Popularity_percentage', 'Guest_Popularity_percentage'],
    ['Host_Popularity_percentage', 'Number_of_Ads'],
    ['Host_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Podcast_Name'],
    ['Episode_Num', 'Podcast_Name'],
    ['Guest_Popularity_percentage', 'Podcast_Name'],
    ['ELm_r1', 'Episode_Num'],
    ['ELm_r1', 'Host_Popularity_percentage'],
    ['ELm_r1', 'Guest_Popularity_percentage'],
    ['ELm_r2', 'Episode_Num'],
    ['ELm_r2', 'Episode_Sentiment'],
    ['ELm_r2', 'Publication_Day'],

    # 3-interaction
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage'],
    ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage'],
    ['Episode_Length_minutes', 'Episode_Num', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Episode_Num', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Episode_Num', 'Publication_Day'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Publication_Day'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Publication_Time'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Publication_Day'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Publication_Time'],
    ['Episode_Length_minutes', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Number_of_Ads', 'Publication_Day'],
    ['Episode_Length_minutes', 'Episode_Sentiment', 'Publication_Time'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Guest_Popularity_percentage'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Publication_Day'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Publication_Time'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Genre'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Publication_Day'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Publication_Time'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Genre'],
    ['Episode_Num', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Episode_Sentiment'],
    ['Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Publication_Day'],
    ['Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Publication_Time'],
    ['Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Day'],

    ['Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Guest_Popularity_percentage', 'Number_of_Ads', 'Genre'],
    ['ELm_r1', 'Number_of_Ads', 'Episode_Sentiment'],
    ['ELm_r2', 'Number_of_Ads', 'Podcast_Name'],

    # 4-interaction
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Guest_Popularity_percentage'],
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Publication_Day'],
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Publication_Time'],
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Genre'],
    ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage', 'Publication_Day'],
    ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage', 'Publication_Time'],
    ['Episode_Length_minutes', 'Episode_Num', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Episode_Num', 'Number_of_Ads', 'Publication_Day'],
    ['Episode_Length_minutes', 'Episode_Num', 'Number_of_Ads', 'Publication_Time'],
    ['Episode_Length_minutes', 'Episode_Num', 'Publication_Day', 'Publication_Time'],
    ['Episode_Length_minutes', 'Episode_Num', 'Publication_Day', 'Genre'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Publication_Day'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Publication_Time'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Day'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Publication_Day', 'Genre'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Publication_Day'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Publication_Time'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Genre'],
    ['Episode_Length_minutes', 'Episode_Num', 'Publication_Time', 'Podcast_Name'],

    ['Episode_Num', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Day'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Time'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment', 'Publication_Day'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment', 'Publication_Time'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment', 'Genre'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Publication_Time', 'Genre'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Genre'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment', 'Podcast_Name'],
    ['Host_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment', 'Podcast_Name'],
    ['Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Day', 'Podcast_Name'],
    ['Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Time', 'Podcast_Name'],
]


def _add_interaction_features(df, combos, sep='_'):
    """Return ``df`` with one categorical key column appended per combination.

    For every combination the member columns are cast to ``str`` and joined
    with ``sep``; the resulting column is named ``sep.join(combination)`` and
    stored with ``category`` dtype. Missing values are rendered as text by
    ``astype(str)`` (e.g. ``'nan'``), so a missing component stays part of
    the key rather than blanking it.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding every column referenced by ``combos``.
    combos : list of list of str
        Column groups to combine; each group needs at least two columns.
    sep : str, default '_'
        Separator used both inside the key values and in the new column name.

    Returns
    -------
    pd.DataFrame
        A new frame: ``df`` followed by the key columns in ``combos`` order.

    Raises
    ------
    ValueError
        If a combination has fewer than two columns.
    """
    key_columns = {}
    for comb in combos:
        if len(comb) < 2:
            raise ValueError(f"an interaction needs at least two columns, got {comb!r}")
        pieces = [df[col].astype(str) for col in comb]
        key = pieces[0]
        for piece in pieces[1:]:
            key = key + sep + piece
        name = sep.join(comb)
        # Convert right away so only one plain-string key column is alive at a time.
        key_columns[name] = key.astype('category').rename(name)

    # Columns left by an earlier run of the cell are replaced, not duplicated.
    stale = [name for name in key_columns if name in df.columns]
    # A single concat instead of one insert per key avoids fragmenting the frame.
    return pd.concat([df.drop(columns=stale), *key_columns.values()], axis=1)


# Names of the generated key columns, in creation order (duplicates collapsed).
encoded_columns = list(dict.fromkeys('_'.join(comb) for comb in selected_comb))

# train and test are converted independently, so each frame gets its own
# category set (codes are not shared between the two frames).
train = _add_interaction_features(train, selected_comb)
test = _add_interaction_features(test, selected_comb)

In [10]:
# Shape check after adding the interaction keys: the 15 columns listed by the
# earlier train.columns cell plus one column per entry of selected_comb.
train.shape

(794868, 113)

In [14]:
# reference: https://www.kaggle.com/code/masayakawamata/imputation-strategies/
train[NUMS] = train[NUMS].fillna(train[NUMS].median())
test[NUMS] = test[NUMS].fillna(train[NUMS].median())

In [15]:
# Feature list: raw numeric columns, then categorical columns, then the
# interaction keys. is_weekend and the ELm_r* columns are not part of it.
FEATURES = [*NUMS, *CATS, *encoded_columns]

print(
    f"Train Shape: {train.shape}",
    f"Test  Shape: {test.shape}",
    sep="\n",
)
train.head(3)

Train Shape: (794868, 113)
Test  Shape: (250000, 112)


Unnamed: 0,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,Episode_Num,is_weekend,ELm_r0,ELm_r1,ELm_r2,Episode_Length_minutes_Host_Popularity_percentage,Episode_Length_minutes_Guest_Popularity_percentage,Episode_Length_minutes_Number_of_Ads,Episode_Num_Host_Popularity_percentage,Episode_Num_Guest_Popularity_percentage,Episode_Num_Number_of_Ads,Host_Popularity_percentage_Guest_Popularity_percentage,Host_Popularity_percentage_Number_of_Ads,Host_Popularity_percentage_Episode_Sentiment,Episode_Length_minutes_Podcast_Name,Episode_Num_Podcast_Name,Guest_Popularity_percentage_Podcast_Name,ELm_r1_Episode_Num,ELm_r1_Host_Popularity_percentage,ELm_r1_Guest_Popularity_percentage,ELm_r2_Episode_Num,ELm_r2_Episode_Sentiment,ELm_r2_Publication_Day,Episode_Length_minutes_Episode_Num_Host_Popularity_percentage,Episode_Length_minutes_Episode_Num_Guest_Popularity_percentage,Episode_Length_minutes_Episode_Num_Number_of_Ads,Episode_Length_minutes_Episode_Num_Episode_Sentiment,Episode_Length_minutes_Episode_Num_Publication_Day,Episode_Length_minutes_Host_Popularity_percentage_Guest_Popularity_percentage,Episode_Length_minutes_Host_Popularity_percentage_Number_of_Ads,Episode_Length_minutes_Host_Popularity_percentage_Episode_Sentiment,Episode_Length_minutes_Host_Popularity_percentage_Publication_Day,Episode_Length_minutes_Host_Popularity_percentage_Publication_Time,Episode_Length_minutes_Guest_Popularity_percentage_Number_of_Ads,Episode_Length_minutes_Guest_Popularity_percentage_Publication_Day,Episode_Length_minutes_Guest_Popularity_percentage_Publication_Time,Episode_Length_minutes_Number_of_Ads_Episode_Sentiment,Episode_Length_minutes_Number_of_Ads_Publication_Day,Episode_Length_minutes_Episode_Sentiment_Publication_Time,Episode_Num_Host_Popularity_percentage_Guest_Popularity_percentage,...,Host_Popularity_percentage_Guest_Popularity_percentage_Public
ation_Day,Host_Popularity_percentage_Guest_Popularity_percentage_Publication_Time,Host_Popularity_percentage_Number_of_Ads_Publication_Day,Guest_Popularity_percentage_Number_of_Ads_Episode_Sentiment,Guest_Popularity_percentage_Number_of_Ads_Genre,ELm_r1_Number_of_Ads_Episode_Sentiment,ELm_r2_Number_of_Ads_Podcast_Name,Episode_Length_minutes_Episode_Num_Host_Popularity_percentage_Guest_Popularity_percentage,Episode_Length_minutes_Episode_Num_Host_Popularity_percentage_Number_of_Ads,Episode_Length_minutes_Episode_Num_Host_Popularity_percentage_Episode_Sentiment,Episode_Length_minutes_Episode_Num_Host_Popularity_percentage_Publication_Day,Episode_Length_minutes_Episode_Num_Host_Popularity_percentage_Publication_Time,Episode_Length_minutes_Episode_Num_Host_Popularity_percentage_Genre,Episode_Length_minutes_Episode_Num_Guest_Popularity_percentage_Number_of_Ads,Episode_Length_minutes_Episode_Num_Guest_Popularity_percentage_Episode_Sentiment,Episode_Length_minutes_Episode_Num_Guest_Popularity_percentage_Publication_Day,Episode_Length_minutes_Episode_Num_Guest_Popularity_percentage_Publication_Time,Episode_Length_minutes_Episode_Num_Number_of_Ads_Episode_Sentiment,Episode_Length_minutes_Episode_Num_Number_of_Ads_Publication_Day,Episode_Length_minutes_Episode_Num_Number_of_Ads_Publication_Time,Episode_Length_minutes_Episode_Num_Publication_Day_Publication_Time,Episode_Length_minutes_Episode_Num_Publication_Day_Genre,Episode_Length_minutes_Host_Popularity_percentage_Guest_Popularity_percentage_Number_of_Ads,Episode_Length_minutes_Host_Popularity_percentage_Guest_Popularity_percentage_Episode_Sentiment,Episode_Length_minutes_Host_Popularity_percentage_Guest_Popularity_percentage_Publication_Day,Episode_Length_minutes_Host_Popularity_percentage_Guest_Popularity_percentage_Publication_Time,Episode_Length_minutes_Host_Popularity_percentage_Number_of_Ads_Episode_Sentiment,Episode_Length_minutes_Host_Popularity_percentage_Number_of_Ads_Publication_Day,Episode_Length_minutes_Host_Po
pularity_percentage_Publication_Day_Publication_Time,Episode_Length_minutes_Host_Popularity_percentage_Publication_Day_Genre,Episode_Length_minutes_Guest_Popularity_percentage_Number_of_Ads_Episode_Sentiment,Episode_Length_minutes_Guest_Popularity_percentage_Number_of_Ads_Publication_Day,Episode_Length_minutes_Guest_Popularity_percentage_Number_of_Ads_Publication_Time,Episode_Length_minutes_Guest_Popularity_percentage_Number_of_Ads_Genre,Episode_Length_minutes_Episode_Num_Publication_Time_Podcast_Name,Episode_Num_Host_Popularity_percentage_Guest_Popularity_percentage_Number_of_Ads,Episode_Num_Host_Popularity_percentage_Guest_Popularity_percentage_Episode_Sentiment,Episode_Num_Host_Popularity_percentage_Number_of_Ads_Publication_Day,Episode_Num_Host_Popularity_percentage_Number_of_Ads_Publication_Time,Episode_Num_Host_Popularity_percentage_Episode_Sentiment_Publication_Day,Episode_Num_Host_Popularity_percentage_Episode_Sentiment_Publication_Time,Episode_Num_Host_Popularity_percentage_Episode_Sentiment_Genre,Episode_Num_Host_Popularity_percentage_Publication_Day_Publication_Time,Episode_Num_Host_Popularity_percentage_Publication_Time_Genre,Episode_Num_Guest_Popularity_percentage_Number_of_Ads_Episode_Sentiment,Episode_Num_Guest_Popularity_percentage_Number_of_Ads_Genre,Episode_Num_Host_Popularity_percentage_Episode_Sentiment_Podcast_Name,Host_Popularity_percentage_Number_of_Ads_Episode_Sentiment_Podcast_Name,Host_Popularity_percentage_Number_of_Ads_Publication_Day_Podcast_Name,Host_Popularity_percentage_Number_of_Ads_Publication_Time_Podcast_Name
0,Mystery Matters,63.77,True Crime,74.81,Thursday,Night,53.36,0.0,Positive,31.41998,98,0,,,,nan_74.81,nan_nan,nan_0.0,98_74.81,98_nan,98_0.0,74.81_nan,74.81_0.0,74.81_Positive,nan_Mystery Matters,98_Mystery Matters,nan_Mystery Matters,nan_98,nan_74.81,nan_nan,nan_98,nan_Positive,nan_Thursday,nan_98_74.81,nan_98_nan,nan_98_0.0,nan_98_Positive,nan_98_Thursday,nan_74.81_nan,nan_74.81_0.0,nan_74.81_Positive,nan_74.81_Thursday,nan_74.81_Night,nan_nan_0.0,nan_nan_Thursday,nan_nan_Night,nan_0.0_Positive,nan_0.0_Thursday,nan_Positive_Night,98_74.81_nan,...,74.81_nan_Thursday,74.81_nan_Night,74.81_0.0_Thursday,nan_0.0_Positive,nan_0.0_True Crime,nan_0.0_Positive,nan_0.0_Mystery Matters,nan_98_74.81_nan,nan_98_74.81_0.0,nan_98_74.81_Positive,nan_98_74.81_Thursday,nan_98_74.81_Night,nan_98_74.81_True Crime,nan_98_nan_0.0,nan_98_nan_Positive,nan_98_nan_Thursday,nan_98_nan_Night,nan_98_0.0_Positive,nan_98_0.0_Thursday,nan_98_0.0_Night,nan_98_Thursday_Night,nan_98_Thursday_True Crime,nan_74.81_nan_0.0,nan_74.81_nan_Positive,nan_74.81_nan_Thursday,nan_74.81_nan_Night,nan_74.81_0.0_Positive,nan_74.81_0.0_Thursday,nan_74.81_Thursday_Night,nan_74.81_Thursday_True Crime,nan_nan_0.0_Positive,nan_nan_0.0_Thursday,nan_nan_0.0_Night,nan_nan_0.0_True Crime,nan_98_Night_Mystery Matters,98_74.81_nan_0.0,98_74.81_nan_Positive,98_74.81_0.0_Thursday,98_74.81_0.0_Night,98_74.81_Positive_Thursday,98_74.81_Positive_Night,98_74.81_Positive_True Crime,98_74.81_Thursday_Night,98_74.81_Night_True Crime,98_nan_0.0_Positive,98_nan_0.0_True Crime,98_74.81_Positive_Mystery Matters,74.81_0.0_Positive_Mystery Matters,74.81_0.0_Thursday_Mystery Matters,74.81_0.0_Night_Mystery Matters
1,Joke Junction,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241,26,1,120.0,119.8,119.8,119.8_66.95,119.8_75.95,119.8_2.0,26_66.95,26_75.95,26_2.0,66.95_75.95,66.95_2.0,66.95_Negative,119.8_Joke Junction,26_Joke Junction,75.95_Joke Junction,119.8_26,119.8_66.95,119.8_75.95,119.8_26,119.8_Negative,119.8_Saturday,119.8_26_66.95,119.8_26_75.95,119.8_26_2.0,119.8_26_Negative,119.8_26_Saturday,119.8_66.95_75.95,119.8_66.95_2.0,119.8_66.95_Negative,119.8_66.95_Saturday,119.8_66.95_Afternoon,119.8_75.95_2.0,119.8_75.95_Saturday,119.8_75.95_Afternoon,119.8_2.0_Negative,119.8_2.0_Saturday,119.8_Negative_Afternoon,26_66.95_75.95,...,66.95_75.95_Saturday,66.95_75.95_Afternoon,66.95_2.0_Saturday,75.95_2.0_Negative,75.95_2.0_Comedy,119.8_2.0_Negative,119.8_2.0_Joke Junction,119.8_26_66.95_75.95,119.8_26_66.95_2.0,119.8_26_66.95_Negative,119.8_26_66.95_Saturday,119.8_26_66.95_Afternoon,119.8_26_66.95_Comedy,119.8_26_75.95_2.0,119.8_26_75.95_Negative,119.8_26_75.95_Saturday,119.8_26_75.95_Afternoon,119.8_26_2.0_Negative,119.8_26_2.0_Saturday,119.8_26_2.0_Afternoon,119.8_26_Saturday_Afternoon,119.8_26_Saturday_Comedy,119.8_66.95_75.95_2.0,119.8_66.95_75.95_Negative,119.8_66.95_75.95_Saturday,119.8_66.95_75.95_Afternoon,119.8_66.95_2.0_Negative,119.8_66.95_2.0_Saturday,119.8_66.95_Saturday_Afternoon,119.8_66.95_Saturday_Comedy,119.8_75.95_2.0_Negative,119.8_75.95_2.0_Saturday,119.8_75.95_2.0_Afternoon,119.8_75.95_2.0_Comedy,119.8_26_Afternoon_Joke Junction,26_66.95_75.95_2.0,26_66.95_75.95_Negative,26_66.95_2.0_Saturday,26_66.95_2.0_Afternoon,26_66.95_Negative_Saturday,26_66.95_Negative_Afternoon,26_66.95_Negative_Comedy,26_66.95_Saturday_Afternoon,26_66.95_Afternoon_Comedy,26_75.95_2.0_Negative,26_75.95_2.0_Comedy,26_66.95_Negative_Joke Junction,66.95_2.0_Negative_Joke Junction,66.95_2.0_Saturday_Joke Junction,66.95_2.0_Afternoon_Joke Junction
2,Study Sessions,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531,16,0,74.0,73.9,73.9,73.9_69.97,73.9_8.97,73.9_0.0,16_69.97,16_8.97,16_0.0,69.97_8.97,69.97_0.0,69.97_Negative,73.9_Study Sessions,16_Study Sessions,8.97_Study Sessions,73.9_16,73.9_69.97,73.9_8.97,73.9_16,73.9_Negative,73.9_Tuesday,73.9_16_69.97,73.9_16_8.97,73.9_16_0.0,73.9_16_Negative,73.9_16_Tuesday,73.9_69.97_8.97,73.9_69.97_0.0,73.9_69.97_Negative,73.9_69.97_Tuesday,73.9_69.97_Evening,73.9_8.97_0.0,73.9_8.97_Tuesday,73.9_8.97_Evening,73.9_0.0_Negative,73.9_0.0_Tuesday,73.9_Negative_Evening,16_69.97_8.97,...,69.97_8.97_Tuesday,69.97_8.97_Evening,69.97_0.0_Tuesday,8.97_0.0_Negative,8.97_0.0_Education,73.9_0.0_Negative,73.9_0.0_Study Sessions,73.9_16_69.97_8.97,73.9_16_69.97_0.0,73.9_16_69.97_Negative,73.9_16_69.97_Tuesday,73.9_16_69.97_Evening,73.9_16_69.97_Education,73.9_16_8.97_0.0,73.9_16_8.97_Negative,73.9_16_8.97_Tuesday,73.9_16_8.97_Evening,73.9_16_0.0_Negative,73.9_16_0.0_Tuesday,73.9_16_0.0_Evening,73.9_16_Tuesday_Evening,73.9_16_Tuesday_Education,73.9_69.97_8.97_0.0,73.9_69.97_8.97_Negative,73.9_69.97_8.97_Tuesday,73.9_69.97_8.97_Evening,73.9_69.97_0.0_Negative,73.9_69.97_0.0_Tuesday,73.9_69.97_Tuesday_Evening,73.9_69.97_Tuesday_Education,73.9_8.97_0.0_Negative,73.9_8.97_0.0_Tuesday,73.9_8.97_0.0_Evening,73.9_8.97_0.0_Education,73.9_16_Evening_Study Sessions,16_69.97_8.97_0.0,16_69.97_8.97_Negative,16_69.97_0.0_Tuesday,16_69.97_0.0_Evening,16_69.97_Negative_Tuesday,16_69.97_Negative_Evening,16_69.97_Negative_Education,16_69.97_Tuesday_Evening,16_69.97_Evening_Education,16_8.97_0.0_Negative,16_8.97_0.0_Education,16_69.97_Negative_Study Sessions,69.97_0.0_Negative_Study Sessions,69.97_0.0_Tuesday_Study Sessions,69.97_0.0_Evening_Study Sessions


In [16]:
# Write the engineered frames to ../data/interim as CSV.
# index=False leaves the index out of the file. For train that is only the
# positional index created by the concat with ignore_index=True, but test was
# loaded with index_col='id', so its ids are NOT written to test_exp1.csv.
# NOTE(review): CSV is plain text, so the category dtype of the interaction
# columns is not recorded in these files -- confirm the reader re-casts them.
train.to_csv('../data/interim/train_exp1.csv', index=False)
test.to_csv('../data/interim/test_exp1.csv', index=False)