# Setup the Final Data for the Official Launch
## Kaggle Pog Champs Series E01

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import os

%matplotlib inline
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [10]:
df = pd.read_csv("youtube-trending-video-dataset/US_youtube_trending_data.csv")
df["trending_date"] = pd.to_datetime(df["trending_date"]).dt.date
df["publishedAt"] = pd.to_datetime(df["publishedAt"])
df["id"] = df["video_id"] + "_" + df["trending_date"].astype("str")
df = df.drop_duplicates(subset=["id"]).reset_index(drop=True).copy()

## Remove Zero View Count Videos

In [11]:
df = df.loc[df["view_count"] != 0].reset_index(drop=True).copy()

## Add the Video Durations

In [13]:
dur = pd.read_csv("durations.csv")
dur = dur.rename(columns={"id": "video_id"})
print(df.shape, dur.shape)

dur = dur.drop_duplicates()
df = df.merge(
    dur[["video_id", "duration_seconds"]], on="video_id", how="left", validate="m:1",
)

(98075, 17) (17791, 4)


# Add `HasThumbnail` Data

In [20]:
videos_with_tn = [c.strip(".jpg") for c in os.listdir("out/")]
df["has_thumbnail"] = df["video_id"].isin(videos_with_tn)
df["has_thumbnail"].value_counts()

True     86010
False    12065
Name: has_thumbnail, dtype: int64

# Make Target Column

In [22]:
# Likes to view count is the target
df["target"] = df["likes"] / df["view_count"]

# Split Train/Test on Cutoff Date

In [27]:
cutoff_date = "2021-12-01"
df["isTest"] = df["trending_date"] >= pd.to_datetime(cutoff_date).date()

train_df = df.query("isTest == False").copy()
test_df = df.query("isTest == True").copy()

train_df = train_df.drop(["isTest"], axis=1).reset_index(drop=True).copy()
solution_df = test_df.drop(["isTest"], axis=1).reset_index(drop=True).copy()
test_df = (
    test_df.drop(["view_count", "likes", "dislikes", "comment_count", "target"], axis=1)
    .reset_index(drop=True)
    .copy()
)

# Save of Train and Test Data

In [28]:
train_df.to_parquet("train.parquet")
test_df.to_parquet("test.parquet")

# Save off The Ground Truth Solution

In [30]:
solution_df[["id", "target"]].to_csv("solution.csv", index=False)
train_df[["id", "target"]].head().to_csv("sample_submission.csv", index=False)

# Average Solution

In [36]:
average_solution = solution_df[["id", "target"]].copy()
average_solution["target"] = int(np.round(train_df["target"].mean()))
average_solution.to_csv("avg_solution.csv", index=False)