# Data Pre-Processing

First, we are going to clean the data by:
- checking for duplicate rows
- checking for missing values
- checking the percentage of missing values of each column
- checking for any redundant (uninformative) columns
- checking the column data types


Second, we are going to perform feature engineering to enrich the data by:
- using the published date column to derive new features such as year, month, day of the week, and time of day
- using natural language processing (via spaCy) to derive useful features from the video title


## 1. Data Cleaning

In [1]:
import os
import ast
import pandas as pd
import numpy as np
import spacy         # Natural language processing
import isodate       # Date transformation and manipulation

from dateutil import parser

#### Read the channels and videos files into dataframes

In [2]:
# Project root: the PARENT of the current working directory
# (assumes the notebook is run from a subfolder of the project — TODO confirm)
cwd = os.path.dirname(os.getcwd())
# Filepath of the raw subfolder inside the data folder
path = os.path.join(cwd, "data", "raw")

# Read in the csv files from the raw data folder
channels_df = pd.read_csv(os.path.join(path, "fitness_channels_2023_06_28.csv"))
videos_df = pd.read_csv(os.path.join(path, "fitness_videos_2023_06_28.csv"))

In [3]:
# Preview the first five channel records
channels_df.head(n=5)

Unnamed: 0,ChannelName,ChannelDescription,PublishedDate,TotalSubscribers,TotalViews,TotalVideos,playlistID
0,Chloe Ting,Subscribe to my channel and find weekly workou...,2011-08-17T04:29:09Z,24700000,2980737335,407,UUCgLoMYIyP0U56dEhEL1wXQ
1,blogilates,"Hey guys! My name is Cassey Ho, I am a certifi...",2009-06-13T09:05:48Z,8690000,2820126375,1183,UUIJwWYOfsCfz6PjxbONYXSg
2,MadFit,"This is a place where I post REAL TIME, AT HOM...",2018-03-02T01:46:06Z,8000000,943060836,723,UUpQ34afVgk8cRQBjSJ1xuJQ
3,Rebecca-Louise,"Hey, \n\nWelcome to #TEAMBURN 🙌🏻 \n\nI am so e...",2012-09-22T18:04:00Z,720000,117668198,1257,UUi0AqmA_3DGPFCu5qY0LLSg
4,emi wong,welcome to my channel!\nhope my videos can hel...,2014-11-02T14:43:34Z,6100000,819791658,499,UUvGEK5_U-kLgO6-AMDPeTUQ


In [4]:
# Preview the first five video records
videos_df.head(n=5)

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition
0,e7zzES8PeG4,Chloe Ting,Shocking Before After Transformation Results! ...,Check out these amazing before and after trans...,"['Abs', 'Abs results', 'Abs workout results', ...",2023-06-28T14:00:23Z,36033,2677.0,,163.0,PT9M22S,hd
1,AZ1ihabY6bI,Chloe Ting,when you're having a bad day,Cute samoyed doggies in Seoul!!,"['samoyed', 'seoul', 'day in my life', 'doggie...",2023-06-19T14:55:36Z,87568,4739.0,,77.0,PT18S,hd
2,5GLA8MrlDnM,Chloe Ting,A day in my life living in Korea,Short vlog from a day out and about while in S...,"['dayinmylife', 'korea', 'seoul', 'vlog', 'chl...",2023-06-05T14:51:22Z,317017,9618.0,,708.0,PT12M37S,hd
3,IOJ7Fxa8e2Y,Chloe Ting,GROW YOUR BOOTY with these exercises,See the full video here: https://youtu.be/4zuY...,"['glute workout', 'booty workout', 'gym workou...",2023-05-24T15:32:44Z,171898,5484.0,,57.0,PT23S,hd
4,ljNgkSctkXg,Chloe Ting,INTENSE Full Body Workout - 30 Min No Equipment,This is a 30 min full body intense workout fro...,"['workout', 'home workout', 'full body workout...",2023-05-17T14:00:27Z,620735,17223.0,,863.0,PT31M14S,hd


#### Check for duplicate rows

In [5]:
# Does either dataset contain fully duplicated rows? (channels first, then videos)
tuple(frame.duplicated().any() for frame in (channels_df, videos_df))

(False, False)

#### Check for missing data

In [6]:
# Per-column flag: does the channels data contain any missing value?
channels_df.isna().any()

ChannelName           False
ChannelDescription    False
PublishedDate         False
TotalSubscribers      False
TotalViews            False
TotalVideos           False
playlistID            False
dtype: bool

In [7]:
# Per-column flag: does the videos data contain any missing value?
videos_df.isna().any()

video_id          False
channelTitle      False
title             False
description        True
tags               True
publishedAt       False
viewCount         False
likeCount          True
favouriteCount     True
commentCount       True
duration          False
definition        False
dtype: bool

#### Check for percentage of missing values

In [8]:
# Percentage of missing entries for each videos column that contains missing values
missingval_columns = videos_df[['description', 'tags', 'likeCount', 'favouriteCount', 'commentCount']]
missingval_columns.isna().sum().div(missingval_columns.shape[0]).mul(100.00)

description         4.396954
tags               12.773274
likeCount           1.105380
favouriteCount    100.000000
commentCount        0.245640
dtype: float64

In [9]:
# Drop the favouriteCount column since it only contains missing values.
# Reassigning (rather than inplace=True) with errors='ignore' keeps the cell
# re-runnable: a second execution does not raise KeyError once the column is gone.
videos_df = videos_df.drop(columns='favouriteCount', errors='ignore')

#### Check for redundant columns

In [10]:
# List the distinct values present in the definition column
pd.unique(videos_df['definition'])

array(['hd', 'sd'], dtype=object)

In [11]:
# Share (in percent) of rows taken by each distinct value of the definition column
videos_df['definition'].value_counts().div(len(videos_df['definition'])).mul(100)

hd    99.557848
sd     0.442152
Name: definition, dtype: float64

In [12]:
# Drop the definition column for being uninformative (almost every video is 'hd').
# Reassigning (rather than inplace=True) with errors='ignore' keeps the cell
# re-runnable: a second execution does not raise KeyError once the column is gone.
videos_df = videos_df.drop(columns='definition', errors='ignore')

In [13]:
# Eyeball the earliest and latest publish timestamps to spot erroneous dates
videos_df['publishedAt'].sort_values(ascending=True)

1589    2009-10-06T04:47:37Z
1588    2009-11-04T18:05:19Z
1587    2009-11-10T05:18:29Z
1586    2009-11-18T06:49:44Z
1585    2009-11-30T08:13:39Z
                ...         
1591    2023-06-27T15:39:25Z
2315    2023-06-27T16:08:44Z
1590    2023-06-28T14:00:06Z
0       2023-06-28T14:00:23Z
2314    2023-06-28T14:35:24Z
Name: publishedAt, Length: 4071, dtype: object

#### Checking the column data types

In [14]:
# Show the data type pandas assigned to each column of the channels data
channels_df.dtypes

ChannelName           object
ChannelDescription    object
PublishedDate         object
TotalSubscribers       int64
TotalViews             int64
TotalVideos            int64
playlistID            object
dtype: object

In [15]:
# Show the data type pandas assigned to each column of the videos data
videos_df.dtypes

video_id         object
channelTitle     object
title            object
description      object
tags             object
publishedAt      object
viewCount         int64
likeCount       float64
commentCount    float64
duration         object
dtype: object

## 2. Feature Engineering

In [16]:
# Parse each channel's publish timestamp, then keep its calendar year as an integer
channels_df['publishedDatetime'] = channels_df['PublishedDate'].map(parser.parse)
channels_df['publishedYear'] = channels_df['publishedDatetime'].map(lambda stamp: stamp.year)

In [17]:
# Number of characters in each video title
videos_df['titleLength'] = videos_df['title'].map(len)

In [18]:
# Convert the ISO-8601 duration strings (e.g. "PT9M22S") to a number of seconds.
# isodate parses each string; .dt.total_seconds() then yields float seconds on every
# pandas version. Avoid .astype('timedelta64[s]') here: it only produces numbers on
# pandas < 2.0 — on 2.x the column stays a timedelta dtype, which breaks the later
# numeric `<= 60` comparison used to flag YouTube Shorts.
parsed_durations = pd.to_timedelta(videos_df['duration'].apply(isodate.parse_duration))
videos_df['durationSecs'] = parsed_durations.dt.total_seconds()

In [19]:
# Parse the publish timestamps (a single pass over the column)
videos_df['publishedDatetime'] = videos_df['publishedAt'].apply(parser.parse)
published_dt = videos_df['publishedDatetime']

# Create publish year (int) and month of the year (abbreviated name, "%b") columns
videos_df['publishedYear'] = published_dt.apply(lambda x: x.year)
videos_df['publishedMonth'] = published_dt.apply(lambda x: x.strftime("%b"))

# Create publish day of the week ("%a"), hour ("%H", zero-padded 24h string)
# and time of day ("%p", i.e. AM/PM) columns.
# NOTE(review): 'pushblishDayName' looks like a typo for 'publishedDayName'. The name is
# kept as-is because it ends up in the exported CSV — rename it together with any consumers.
videos_df['pushblishDayName'] = published_dt.apply(lambda x: x.strftime("%a"))
videos_df['publishedHour'] = published_dt.apply(lambda x: x.strftime("%H"))
videos_df['publishedToD'] = published_dt.apply(lambda x: x.strftime("%p"))

In [20]:
def _parse_tags(raw_tags):
    """Turn one raw `tags` cell into a list of tags, or None when there are no tags.

    Strings are decoded with ast.literal_eval (the CSV stores the list as its Python
    repr); values that are already lists are passed through so the cell can be re-run;
    anything else (NaN/None for missing tags) becomes None.
    """
    if isinstance(raw_tags, str):
        return ast.literal_eval(raw_tags)
    if isinstance(raw_tags, list):
        return raw_tags
    return None


# Create the number of tags column.
# Missing values are handled inside _parse_tags rather than with .replace(np.nan, None):
# depending on the pandas version, an explicit value=None can be interpreted as
# "no value given" and forward-fill gaps with the previous row's tags instead of None.
videos_df['tags'] = videos_df['tags'].apply(_parse_tags)
videos_df['tagsCount'] = videos_df['tags'].apply(lambda tags: len(tags) if isinstance(tags, list) else 0)

In [21]:
# Flag YouTube Shorts: videos whose duration is at most 60 seconds
videos_df['youtubeShorts'] = videos_df['durationSecs'].le(60)

#### Natural Language Processing of text (title column) to enrich the data

The goal is to use spaCy's natural language processing tools to derive new, useful features from the video title. We do this by combining regular expressions with spaCy's entity ruler.
The features we plan to derive are: workout time, workout type, and the different workout 'classifications'.


In [22]:
# Keep only the videos that are not YouTube Shorts, as an independent copy
workout_videos_df = videos_df.loc[~videos_df['youtubeShorts']].copy()

In [23]:
# Load spaCy's small English model
nlp = spacy.load("en_core_web_sm")

# Create the EntityRuler and add it to the pipeline, positioned before the "ner" component
ruler = nlp.add_pipe("entity_ruler", before="ner")

In [24]:
# List of entities and their token patterns for spaCy's EntityRuler.
#
# Each pattern is a list holding ONE dict per token. A multi-word phrase therefore needs
# one dict per word: repeating the "LOWER" key inside a single dict silently keeps only
# the last value (e.g. "no" + "jumping" would collapse to just "jumping").
#
# spaCy evaluates REGEX with search (substring) semantics, so the alternations below are
# anchored; otherwise "arms?" also hits "warm", "ab" hits "about", "and" hits "band", ...
# WORKOUT_TYPE and STRENGTH_TRAINING are anchored at the start only, so inflected forms
# such as "stretching" or "strengthening" are still recognised.
patterns = [
    # A token starting with digits followed by a time unit, e.g. "30 min"
    {"label": "WORKOUT_TIME", "pattern": [{"TEXT": {"REGEX": r"^(\d+)"}},
                                          {"LOWER": {"REGEX": r"^(min|mins|minute|minutes|hour|hours|hr|hrs)$"}}
                                         ]},
    {"label": "FULL_BODY", "pattern": [{"LOWER": {"REGEX": r"^(full|total|whole)$"}}, {"LOWER": "body"}]},
    {"label": "UPPER_BODY", "pattern": [{"LOWER": "upper"}, {"LOWER": "body"}]},
    {"label": "LOWER_BODY", "pattern": [{"LOWER": "lower"}, {"LOWER": "body"}]},
    # The connector is matched on LOWER (not ORTH) so upper-case "AND" is recognised too
    {"label": "CHEST_BACK", "pattern": [{"LOWER": {"REGEX": r"^(back|chest)$"}},
                                        {"LOWER": {"REGEX": r"^(and|&)$"}},
                                        {"LOWER": {"REGEX": r"^(back|chest)$"}}]},
    {"label": "ABS", "pattern": [{"LOWER": {"REGEX": r"^(core|ab|abs|plank)$"}}]},
    {"label": "ARMS", "pattern": [{"LOWER": {"REGEX": r"^arms?$"}}]},
    {"label": "LEGS", "pattern": [{"LOWER": {"REGEX": r"^(thigh|thighs|leg|legs)$"}}]},
    {"label": "GLUTES", "pattern": [{"LOWER": {"REGEX": r"^(booty|glute|glutes|butt)$"}}]},
    {"label": "WORKOUT_TYPE", "pattern": [{"LOWER": {"REGEX": r"^(hiit|cardio|pilates|yoga|dance|tabata|barre|stretch)"}}]},
    {"label": "STANDING", "pattern": [{"LOWER": "standing"}]},
    {"label": "NO_EQUIPMENT", "pattern": [{"LOWER": "no"},
                                          {"LOWER": {"REGEX": r"^(equip|equipment|equipments|weight|weights)$"}}]},
    {"label": "NO_JUMPING", "pattern": [{"LOWER": "no"}, {"LOWER": "jumping"}]},
    {"label": "LOW_IMPACT", "pattern": [{"LOWER": "low"}, {"LOWER": "impact"}]},
    {"label": "STRENGTH_TRAINING", "pattern": [{"LOWER": {"REGEX": r"^(strength|sculpt|sculpting|tone|toning|toned)"}}]}
]

In [25]:
# Register the custom entity patterns with the EntityRuler
ruler.add_patterns(patterns)

In [26]:
def extract_ent_text(title, label):
    """Return the text of the first entity in `title` tagged with `label`.

    The title is processed with the module-level `nlp` pipeline and its entities are
    scanned in `doc.ents` order. Returns None when no entity carries `label`.
    """
    matching_texts = (entity.text for entity in nlp(title).ents if entity.label_ == label)
    return next(matching_texts, None)

def extract_body_part(title):
    """Return the label of the first body-part entity found in `title`, or None.

    The title is processed with the module-level `nlp` pipeline; the first entity (in
    `doc.ents` order) whose label is one of the body-part labels below wins.
    """
    body_part_labels = ("FULL_BODY", "UPPER_BODY", "LOWER_BODY", "CHEST_BACK",
                        "ARMS", "ABS", "LEGS", "GLUTES")
    for entity in nlp(title).ents:
        if entity.label_ in body_part_labels:
            return entity.label_
    return None

def extract_ent_label(title, label):
    """Return the entity label when `title` contains an entity tagged `label`, else None.

    Acts as a presence flag: the title is processed with the module-level `nlp`
    pipeline and the label of the first matching entity is returned.
    """
    matching_labels = (entity.label_ for entity in nlp(title).ents if entity.label_ == label)
    return next(matching_labels, None)


In [27]:
workout_titles = workout_videos_df['title']

# Create workout time and workout type columns from the text of the matched entities
workout_videos_df['workoutTime'] = workout_titles.apply(extract_ent_text, args=("WORKOUT_TIME",))
workout_videos_df['workoutType'] = workout_titles.apply(extract_ent_text, args=("WORKOUT_TYPE",))

# Apply the function to create a body part column
workout_videos_df['bodyPart'] = workout_titles.apply(extract_body_part)

# Create new columns holding the entity label when it is present in the title (else None)
labels_to_extract = ["STANDING", "NO_EQUIPMENT", "NO_JUMPING", "LOW_IMPACT", "STRENGTH_TRAINING"]
feature_columns = ['standingWorkout', 'noEquipment', 'noJumping', 'lowImpact', 'strengthTraining']

# Build the feature frame directly on the videos' index. Assigning Series into the
# columns of a zero-row DataFrame through .iloc relies on version-dependent alignment
# behaviour and can leave the frame empty.
workout_features_df = pd.DataFrame(
    {
        column: workout_titles.apply(extract_ent_label, args=(label,))
        for column, label in zip(feature_columns, labels_to_extract)
    },
    index=workout_videos_df.index,
)

# Add the new columns to the dataframe of workout videos. Any feature columns left over
# from a previous run are dropped first so re-running the cell does not duplicate them.
workout_videos_df = pd.concat(
    [workout_videos_df.drop(columns=feature_columns, errors='ignore'), workout_features_df],
    axis=1,
)

### Save the processed data

In [28]:
# Project root: the PARENT of the current working directory
# (assumes the notebook is run from a subfolder of the project — TODO confirm)
cwd = os.path.dirname(os.getcwd())

# Create file path of where the processed data will be stored; the folder is created
# when missing so that to_csv does not fail on a fresh checkout
path = os.path.join(cwd, "data", "processed")
os.makedirs(path, exist_ok=True)

# Save dataframes as csv files (without the index column)
channels_df.to_csv(os.path.join(path, "fitness_channels_processed_2023_06_28.csv"), index=False)
workout_videos_df.to_csv(os.path.join(path, "fitness_videos_processed_2023_06_28.csv"), index=False)