## Data Pre-Processing

In [1]:
import os
import ast
import isodate
import pandas as pd
import numpy as np

from dateutil import parser

#### Read the channels and videos files into dataframes

In [2]:
# The raw CSV exports are read from data/raw under the parent of the current
# working directory (`cwd` holds that parent directory, not the working directory itself).
cwd = os.path.dirname(os.getcwd())
path = cwd + "/data/raw"
channels_df, videos_df = (
    pd.read_csv(path + csv_name)
    for csv_name in ("/channels_data.csv", "/videos_data.csv")
)

In [3]:
# Preview the first rows of the channel-level data.
channels_df.head()

Unnamed: 0.1,Unnamed: 0,ChannelName,ChannelDescription,PublishedDate,TotalSubscribers,TotalViews,TotalVideos,playlistID
0,0,Chloe Ting,Subscribe to my channel and find weekly workou...,2011-08-17T04:29:09Z,24600000,2965643413,405,UUCgLoMYIyP0U56dEhEL1wXQ
1,1,blogilates,"Hey guys! My name is Cassey Ho, I am a certifi...",2009-06-13T09:05:48Z,8580000,2723093080,1177,UUIJwWYOfsCfz6PjxbONYXSg
2,2,Rebecca-Louise,"Hey, \n\nWelcome to #TEAMBURN 🙌🏻 \n\nI am so e...",2012-09-22T18:04:00Z,719000,116976058,1239,UUi0AqmA_3DGPFCu5qY0LLSg
3,3,emi wong,welcome to my channel!\nhope my videos can hel...,2014-11-02T14:43:34Z,6060000,810394833,490,UUvGEK5_U-kLgO6-AMDPeTUQ
4,4,MadFit,"This is a place where I post REAL TIME, AT HOM...",2018-03-02T01:46:06Z,7950000,932630071,710,UUpQ34afVgk8cRQBjSJ1xuJQ


In [4]:
# Preview the first rows of the video-level data.
videos_df.head()

Unnamed: 0.1,Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition
0,0,5GLA8MrlDnM,Chloe Ting,A day in my life living in Korea,Short vlog from a day out and about while in S...,"['dayinmylife', 'korea', 'seoul', 'vlog', 'chl...",2023-06-05T14:51:22Z,163593,6344.0,,714.0,PT12M37S,hd
1,1,IOJ7Fxa8e2Y,Chloe Ting,GROW YOUR BOOTY with these exercises,See the full video here: https://youtu.be/4zuY...,"['glute workout', 'booty workout', 'gym workou...",2023-05-24T15:32:44Z,119223,4016.0,,58.0,PT23S,hd
2,2,ljNgkSctkXg,Chloe Ting,INTENSE Full Body Workout - 30 Min No Equipment,This is a 30 min full body intense workout fro...,"['workout', 'home workout', 'full body workout...",2023-05-17T14:00:27Z,392463,11738.0,,735.0,PT31M14S,hd
3,3,0rL2496zybs,Chloe Ting,10 Min Core & Upper Body | No Equip Home Workout!,This is episode 4 of the 2023 Summer Shred Cha...,"['core', 'abs', 'upper body', 'upper body work...",2023-05-15T14:00:01Z,244965,8028.0,,282.0,PT10M52S,hd
4,4,PEX2uefaUAY,Chloe Ting,Perky Booty & Leg Workout | 20 min Glute Workout,This is episode 3 of the 2023 Summer Shred Cha...,"['tiny waist', 'waist', 'booty', 'booty workou...",2023-05-09T14:00:15Z,470075,13519.0,,428.0,PT20M39S,hd


#### Drop the redundant index column and check for duplicate rows

In [5]:
# Remove the 'Unnamed: 0' column, which only repeats the row number (see the
# previews above). The frames are rebound instead of mutated in place, and
# errors='ignore' keeps the cell re-runnable: a second run is a no-op rather
# than a KeyError for the already-removed column.
channels_df = channels_df.drop(columns='Unnamed: 0', errors='ignore')
videos_df = videos_df.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# One flag per frame (channels, videos): True if any row is an exact duplicate of an earlier row.
tuple(frame.duplicated().any() for frame in (channels_df, videos_df))

(False, False)

#### Missing data

In [7]:
# Per column: does the channel data contain at least one missing value?
channels_df.isna().any()

ChannelName           False
ChannelDescription    False
PublishedDate         False
TotalSubscribers      False
TotalViews            False
TotalVideos           False
playlistID            False
dtype: bool

In [8]:
# Per column: does the video data contain at least one missing value?
videos_df.isna().any()

video_id          False
channelTitle      False
title             False
description        True
tags               True
publishedAt       False
viewCount         False
likeCount          True
favouriteCount     True
commentCount       True
duration          False
definition        False
dtype: bool

In [9]:
# Percentage of missing values, restricted to the columns that contain any.
# The columns are selected from the data instead of being listed by hand, so
# the cell keeps working when it is re-run after `favouriteCount` has been
# dropped (a hard-coded list would raise a KeyError for the missing column).
missingval_columns = videos_df.loc[:, videos_df.isnull().any()]
missingval_columns.isnull().sum() / missingval_columns.shape[0] * 100.00

description         4.200845
tags               12.552821
likeCount           0.919712
favouriteCount    100.000000
commentCount        0.248571
dtype: float64

In [10]:
# `favouriteCount` is missing for 100% of the videos, so it carries no information.
# Rebinding with errors='ignore' makes the cell safe to re-run (no KeyError once
# the column is already gone).
videos_df = videos_df.drop(columns='favouriteCount', errors='ignore')

In [11]:
# List the distinct values of the `definition` column.
videos_df['definition'].unique()

array(['hd', 'sd'], dtype=object)

In [12]:
# Share of videos (in percent) for each `definition` value.
videos_df['definition'].value_counts() / len(videos_df) * 100

hd    99.552573
sd     0.447427
Name: definition, dtype: float64

In [13]:
# Over 99% of the videos are 'hd', so `definition` is nearly constant and is removed.
# Rebinding with errors='ignore' makes the cell safe to re-run (no KeyError once
# the column is already gone).
videos_df = videos_df.drop(columns='definition', errors='ignore')

In [14]:
# Sort the raw publish timestamps to see the earliest and latest upload in the data.
# The values are still strings at this point, so the sort is lexical.
videos_df['publishedAt'].sort_values()

1581    2009-10-06T04:47:37Z
1580    2009-11-04T18:05:19Z
1579    2009-11-10T05:18:29Z
1578    2009-11-18T06:49:44Z
1577    2009-11-30T08:13:39Z
                ...         
2821    2023-06-07T10:33:33Z
1583    2023-06-07T12:00:24Z
3313    2023-06-07T14:00:19Z
3312    2023-06-08T14:00:09Z
1582    2023-06-08T15:00:08Z
Name: publishedAt, Length: 4023, dtype: object

#### Checking data types

In [15]:
# Column dtypes of the channel data.
channels_df.dtypes

ChannelName           object
ChannelDescription    object
PublishedDate         object
TotalSubscribers       int64
TotalViews             int64
TotalVideos            int64
playlistID            object
dtype: object

In [16]:
# Column dtypes of the video data.
videos_df.dtypes

video_id         object
channelTitle     object
title            object
description      object
tags             object
publishedAt      object
viewCount         int64
likeCount       float64
commentCount    float64
duration         object
dtype: object

## Feature Engineering

In [17]:
# Number of characters in each video title.
videos_df['titleLength'] = videos_df['title'].map(len)

In [18]:
# Convert the ISO-8601 `duration` strings (e.g. "PT12M37S") to a length in seconds.
# `.dt.total_seconds()` replaces `.astype('timedelta64[s]')`: the astype form only
# produced plain numbers on pandas < 2.0; on pandas >= 2.0 it keeps a timedelta
# dtype, which breaks numeric comparisons such as `durationSecs <= 60` later on.
videos_df['durationSecs'] = pd.to_timedelta(
    videos_df['duration'].map(isodate.parse_duration)
).dt.total_seconds()

In [19]:
def _published_part(fmt):
    """Render every parsed publish timestamp with the strftime pattern `fmt`."""
    return videos_df['publishedDatetime'].map(lambda stamp: stamp.strftime(fmt))

# Parse the raw `publishedAt` strings into datetime objects first.
videos_df['publishedDatetime'] = videos_df['publishedAt'].map(parser.parse)

# Calendar features: four-digit year and abbreviated month name.
videos_df['publishedYear'] = _published_part("%Y")
videos_df['publishedMonth'] = _published_part("%b")

# Weekly/daily features: abbreviated weekday, hour (00-23) and AM/PM marker.
# NOTE(review): 'pushblishDayName' looks like a typo for 'publishedDayName'; the
# name is kept because renaming the column would break anything that reads it.
videos_df['pushblishDayName'] = _published_part("%a")
videos_df['publishedHour'] = _published_part("%H")
videos_df['publishedToD'] = _published_part("%p")


In [20]:
# Parse the stringified tag lists and count the tags per video.
def _parse_tags(raw):
    """Turn one raw `tags` cell into a list of tags, or None when there are none.

    Strings are evaluated with `ast.literal_eval`. Values that are already a
    list/tuple (the cell has been run before) pass through unchanged, so the
    cell can be re-run. Anything else (e.g. NaN for a missing value) becomes None.
    """
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    if isinstance(raw, (list, tuple)):
        return raw
    return None

# Missing values are handled inside `_parse_tags` rather than with
# `Series.replace(np.nan, None)`: on older pandas releases a `None` value is
# treated as "no value given" and the NaNs are forward-filled with the previous
# video's tags instead of being replaced by None.
videos_df['tags'] = videos_df['tags'].apply(_parse_tags)
videos_df['tagsCount'] = videos_df['tags'].apply(
    lambda tags: len(tags) if isinstance(tags, (list, tuple)) else 0
)

In [21]:
# Flag YouTube Shorts: any video whose duration is at most 60 seconds.
videos_df['youtubeShorts'] = videos_df['durationSecs'].le(60)

#### Natural Language Processing of text (title column) to enrich the data

In [34]:
# Keep only the videos not flagged as Shorts; copy so new columns do not touch videos_df.
workout_videos_df = videos_df.loc[~videos_df['youtubeShorts']].copy()

In [24]:
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
import spacy
# Load the "en_core_web_sm" pipeline; `nlp` is what the extract_* functions call on each title.
nlp = spacy.load("en_core_web_sm")

In [25]:
# Create the EntityRuler and insert it before the pipeline's "ner" component.
# When the cell is re-run the existing ruler is re-used: adding a second
# "entity_ruler" to the same pipeline raises an error.
if "entity_ruler" in nlp.pipe_names:
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe("entity_ruler", before="ner")

In [38]:
# Entity labels and token patterns for the EntityRuler.
#
# Every "pattern" is a LIST with one dict per token to match, in order. A bare
# dict is not a valid token pattern, and repeating the "LOWER" key inside one
# dict keeps only the last value, so multi-word entities such as "no jumping"
# need one dict per word.
#
# REGEX is searched anywhere inside the token, so the body-part expressions are
# anchored with ^...$: otherwise "arms?" also hits "warm", "ab" hits "about"
# and "butt" hits "butterfly". WORKOUT_TYPE and STRENGTH_TRAINING are left as
# substring matches so inflected forms ("stretching", "strengthen") still hit.
patterns = [
    {"label": "WORKOUT_TIME", "pattern": [
        {"TEXT": {"REGEX": r"^(\d+)"}},
        {"LOWER": {"REGEX": r"^(min|mins|minute|minutes|hour|hours|hr|hrs)$"}},
    ]},
    {"label": "FULL_BODY", "pattern": [
        {"LOWER": {"REGEX": r"^(full|total|whole)$"}},
        {"LOWER": "body"},
    ]},
    {"label": "UPPER_BODY", "pattern": [{"LOWER": "upper"}, {"LOWER": "body"}]},
    {"label": "LOWER_BODY", "pattern": [{"LOWER": "lower"}, {"LOWER": "body"}]},
    {"label": "CHEST_BACK", "pattern": [
        {"LOWER": {"REGEX": r"^(back|chest)$"}},
        {"LOWER": {"REGEX": r"^(and|&)$"}},
        {"LOWER": {"REGEX": r"^(back|chest)$"}},
    ]},
    {"label": "ABS", "pattern": [{"LOWER": {"REGEX": r"^(core|abs?|planks?)$"}}]},
    {"label": "ARMS", "pattern": [{"LOWER": {"REGEX": r"^arms?$"}}]},
    {"label": "LEGS", "pattern": [{"LOWER": {"REGEX": r"^(thighs?|legs?)$"}}]},
    {"label": "GLUTES", "pattern": [{"LOWER": {"REGEX": r"^(booty|glutes?|butt)$"}}]},
    {"label": "WORKOUT_TYPE", "pattern": [
        {"LOWER": {"REGEX": r"(hiit|cardio|pilates|yoga|dance|tabata|barre|stretch)"}},
    ]},
    {"label": "STANDING", "pattern": [{"LOWER": "standing"}]},
    # "equip" is included because titles abbreviate it (e.g. "No Equip Home Workout!").
    {"label": "NO_EQUIPMENT", "pattern": [
        {"LOWER": "no"},
        {"LOWER": {"REGEX": r"^(equip|equipment|weight|weights)$"}},
    ]},
    {"label": "NO_JUMPING", "pattern": [{"LOWER": "no"}, {"LOWER": "jumping"}]},
    {"label": "LOW_IMPACT", "pattern": [{"LOWER": "low"}, {"LOWER": "impact"}]},
    {"label": "STRENGTH_TRAINING", "pattern": [
        {"LOWER": {"REGEX": r"(strength|sculpt|sculpting|tone|toning|toned)"}},
    ]},
]

In [39]:
# Register the title patterns with the EntityRuler.
# NOTE(review): re-running this cell presumably registers the same patterns again
# on top of the earlier ones — confirm, and restart the kernel after editing `patterns`.
ruler.add_patterns(patterns)

In [28]:
# Title feature extractors.
#
# Each function runs the spaCy pipeline (`nlp`) on a title and reports the first
# entity carrying the label(s) it is interested in. The matching itself is done
# by the pipeline's entities (doc.ents); no regular expression is applied here.
# NOTE(review): every extractor parses the title again, so applying all eight
# to a column runs the pipeline eight times per title.

def _first_entity(title, labels, attr):
    """Return `attr` of the first entity in `title` whose label is in `labels`.

    Parameters:
        title: the video title to analyse.
        labels: collection of entity labels to accept.
        attr: name of the entity attribute to return ("text" or "label_").

    Returns:
        The attribute value of the first matching entity, or None if the title
        has no entity with one of the given labels.
    """
    for ent in nlp(title).ents:
        if ent.label_ in labels:
            return getattr(ent, attr)
    return None


def extract_workout_time(title):
    """Return the text of the first WORKOUT_TIME entity (e.g. "30 Min"), or None."""
    return _first_entity(title, ("WORKOUT_TIME",), "text")


def extract_body_part(title):
    """Return the label of the first body-part entity in the title, or None.

    Accepted labels: FULL_BODY, UPPER_BODY, LOWER_BODY, CHEST_BACK, ARMS, ABS,
    LEGS and GLUTES.
    """
    body_parts = ["FULL_BODY", "UPPER_BODY", "LOWER_BODY", "CHEST_BACK", "ARMS", "ABS", "LEGS", "GLUTES"]
    return _first_entity(title, body_parts, "label_")


def extract_workout_type(title):
    """Return the text of the first WORKOUT_TYPE entity in the title, or None."""
    return _first_entity(title, ("WORKOUT_TYPE",), "text")


def extract_standing_workout(title):
    """Return "STANDING" if the title contains a STANDING entity, else None."""
    return _first_entity(title, ("STANDING",), "label_")


def extract_no_jumping_workout(title):
    """Return "NO_JUMPING" if the title contains a NO_JUMPING entity, else None."""
    return _first_entity(title, ("NO_JUMPING",), "label_")


def extract_no_equipment_workout(title):
    """Return "NO_EQUIPMENT" if the title contains a NO_EQUIPMENT entity, else None."""
    return _first_entity(title, ("NO_EQUIPMENT",), "label_")


def extract_low_impact_workout(title):
    """Return "LOW_IMPACT" if the title contains a LOW_IMPACT entity, else None."""
    return _first_entity(title, ("LOW_IMPACT",), "label_")


def extract_strength_training_workout(title):
    """Return "STRENGTH_TRAINING" if the title contains such an entity, else None."""
    return _first_entity(title, ("STRENGTH_TRAINING",), "label_")


In [36]:
# Enrich the long-form videos with features mined from each title.
# Every extractor returns None when the title holds no matching entity.

# Stated workout length, as written in the title
workout_videos_df['workoutTime'] = workout_videos_df['title'].map(extract_workout_time)

# Targeted body part (entity label)
workout_videos_df['bodyPart'] = workout_videos_df['title'].map(extract_body_part)

# Kind of workout, as written in the title
workout_videos_df['workoutType'] = workout_videos_df['title'].map(extract_workout_type)

# Marker columns: the entity label when the title mentions the property, else None
workout_videos_df['standing'] = workout_videos_df['title'].map(extract_standing_workout)
workout_videos_df['noEquipment'] = workout_videos_df['title'].map(extract_no_equipment_workout)
workout_videos_df['noJumping'] = workout_videos_df['title'].map(extract_no_jumping_workout)
workout_videos_df['lowImpact'] = workout_videos_df['title'].map(extract_low_impact_workout)
workout_videos_df['strengthTraining'] = workout_videos_df['title'].map(extract_strength_training_workout)


In [42]:
# Preview the first rows of the enriched long-form videos.
workout_videos_df.head()

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,commentCount,duration,...,publishedDatetime,publishedYear,publishedMonth,pushblishDayName,publishedHour,publishedToD,tagsCount,youtubeShorts,workoutTime,bodyPart
0,5GLA8MrlDnM,Chloe Ting,A day in my life living in Korea,Short vlog from a day out and about while in S...,"[dayinmylife, korea, seoul, vlog, chloeting, c...",2023-06-05T14:51:22Z,163593,6344.0,714.0,PT12M37S,...,2023-06-05 14:51:22+00:00,2023,Jun,Mon,14,PM,13,False,,
2,ljNgkSctkXg,Chloe Ting,INTENSE Full Body Workout - 30 Min No Equipment,This is a 30 min full body intense workout fro...,"[workout, home workout, full body workout, ful...",2023-05-17T14:00:27Z,392463,11738.0,735.0,PT31M14S,...,2023-05-17 14:00:27+00:00,2023,May,Wed,14,PM,27,False,30 Min,"(Full Body, FULL_BODY)"
3,0rL2496zybs,Chloe Ting,10 Min Core & Upper Body | No Equip Home Workout!,This is episode 4 of the 2023 Summer Shred Cha...,"[core, abs, upper body, upper body workout, co...",2023-05-15T14:00:01Z,244965,8028.0,282.0,PT10M52S,...,2023-05-15 14:00:01+00:00,2023,May,Mon,14,PM,27,False,10 Min,"(Core, ABS)"
4,PEX2uefaUAY,Chloe Ting,Perky Booty & Leg Workout | 20 min Glute Workout,This is episode 3 of the 2023 Summer Shred Cha...,"[tiny waist, waist, booty, booty workout, butt...",2023-05-09T14:00:15Z,470075,13519.0,428.0,PT20M39S,...,2023-05-09 14:00:15+00:00,2023,May,Tue,14,PM,29,False,20 min,"(Booty, GLUTES)"
5,fWP_huSuAHE,Chloe Ting,Get Defined ABS for the Summer - 10 Min Ab Wor...,This is episode 2 of the 2023 Summer Shred Cha...,"[abs, abs workout, abs exercise, lower abs wor...",2023-05-08T14:00:21Z,901876,28696.0,1069.0,PT11M2S,...,2023-05-08 14:00:21+00:00,2023,May,Mon,14,PM,32,False,10 Min,"(ABS, ABS)"
