# Data Cleaning and Preprocessing

In [52]:
!pip install TextBlob



In [53]:
import pandas as pd
import numpy as np
import isodate
from isodate import parse_duration
from textblob import TextBlob

In [54]:
video_df = pd.read_csv("dataFolder/raw/redPillAnalytics.csv")

In [55]:
video_df.isnull().any()

video_id          False
channelTitle      False
title             False
description        True
tags               True
publishedAt       False
viewCount         False
likeCount          True
favouriteCount     True
commentCount       True
duration          False
definition        False
caption           False
dtype: bool

In [56]:
video_df.dtypes

video_id           object
channelTitle       object
title              object
description        object
tags               object
publishedAt        object
viewCount           int64
likeCount         float64
favouriteCount    float64
commentCount      float64
duration           object
definition         object
caption              bool
dtype: object

In [57]:
video_df.isnull().sum()
video_df.dropna()
video_df.isin([0, np.nan]).sum()

video_id              0
channelTitle          0
title                 0
description         394
tags               6314
publishedAt           0
viewCount             3
likeCount            23
favouriteCount    13675
commentCount        150
duration              0
definition            0
caption           13246
dtype: int64

In [58]:
video_df.isnull().sum()

video_id              0
channelTitle          0
title                 0
description         394
tags               6314
publishedAt           0
viewCount             0
likeCount            16
favouriteCount    13675
commentCount          9
duration              0
definition            0
caption               0
dtype: int64

In [59]:
video_df['description'] = video_df['description'].fillna("")
video_df['tags'] = video_df['tags'].apply(lambda x: x if isinstance(x, list) else [])
video_df['likeCount'] = video_df['likeCount'].fillna(0)
video_df['favouriteCount'] = video_df['favouriteCount'].fillna(0)
video_df['commentCount'] = video_df['commentCount'].fillna(0)

print(video_df.isnull().any())

video_id          False
channelTitle      False
title             False
description       False
tags              False
publishedAt       False
viewCount         False
likeCount         False
favouriteCount    False
commentCount      False
duration          False
definition        False
caption           False
dtype: bool


In [60]:
numeric_cols = ['viewCount', 'likeCount', 'favouriteCount', 'commentCount']
video_df[numeric_cols] = video_df[numeric_cols].apply(pd.to_numeric, errors = 'coerce', axis = 1)

In [61]:
video_df['publishedAt'] = pd.to_datetime(video_df['publishedAt'])
video_df['publishDayName'] = video_df['publishedAt'].dt.strftime("%A")
video_df['publishedAt_timestamp'] = video_df['publishedAt'].astype(int) / 10**9
video_df['tagCount'] = video_df['tags'].apply(lambda x: len(x) if isinstance(x, list) else 0)
video_df['durationSecs'] = video_df['duration'].apply(lambda x: isodate.parse_duration(x))
video_df['durationSecs'] = video_df['durationSecs'].astype('timedelta64[s]')
video_df['titleLength'] = video_df['title'].apply(lambda x: len(x))

In [62]:
video_df

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,publishDayName,publishedAt_timestamp,tagCount,durationSecs,titleLength
0,F5eSaabAAmk,Benjamin Seda,How to ACTUALLY Get a Girlfriend in 2025 (Full...,👉🏼 Get 1-3+ dates per week in 30 days (coachin...,[],2025-03-06 15:27:49+00:00,5034.0,254.0,0.0,27.0,PT15M4S,hd,False,Thursday,1.741275e+09,0,0 days 00:15:04,53
1,xJ6b8CV-pQ0,Benjamin Seda,How to Find A 10/10 Girlfriend,👫 My 3 step formula to approach & attract wome...,[],2025-03-03 15:01:24+00:00,3346.0,330.0,0.0,22.0,PT59S,hd,False,Monday,1.741014e+09,0,0 days 00:00:59,30
2,kPhrei5S88U,Benjamin Seda,The Mistake 99% of Men Make That Keep Them Single,👫 My 3 step formula to approach & attract wome...,[],2025-03-01 14:45:07+00:00,2690.0,222.0,0.0,19.0,PT36S,hd,False,Saturday,1.740840e+09,0,0 days 00:00:36,49
3,4ZnwTwLcAeM,Benjamin Seda,How to Always Get That 2nd Date,👫 My 3 step formula to approach & attract wome...,[],2025-02-27 14:15:00+00:00,4060.0,413.0,0.0,9.0,PT46S,hd,False,Thursday,1.740666e+09,0,0 days 00:00:46,31
4,VW9-SBs6yIg,Benjamin Seda,The Donald Trump Method for Tinder (STEAL THIS),👫 My 3 step formula to approach & attract wome...,[],2025-02-26 13:45:03+00:00,6818.0,316.0,0.0,30.0,PT32S,hd,False,Wednesday,1.740578e+09,0,0 days 00:00:32,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13670,_X4P6L622-8,The Corbett Report (Unofficial),How Do I Defend Voluntarism? - Questions For C...,I have no affiliation with James Corbett or Th...,[],2022-05-25 13:22:06+00:00,393.0,15.0,0.0,2.0,PT28M13S,hd,False,Wednesday,1.653485e+09,0,0 days 00:28:13,52
13671,tkmZ4c2AOVY,The Corbett Report (Unofficial),The 5G Dragnet,I have no affiliation with James Corbett or Th...,[],2022-05-24 17:28:15+00:00,1226.0,61.0,0.0,2.0,PT25M44S,sd,False,Tuesday,1.653413e+09,0,0 days 00:25:44,14
13672,mr7itEUIVew,The Corbett Report (Unofficial),False Flags: The Secret History of Al Qaeda — ...,I have no affiliation with James Corbett or Th...,[],2022-05-24 17:01:10+00:00,5531.0,179.0,0.0,16.0,PT1H16M19S,sd,False,Tuesday,1.653412e+09,0,0 days 01:16:19,66
13673,ochRNyIDTE8,The Corbett Report (Unofficial),Episode 409 - False Flags: The Secret History ...,I have no affiliation with James Corbett or Th...,[],2022-05-24 17:01:06+00:00,6132.0,212.0,0.0,16.0,PT1H59M39S,sd,False,Tuesday,1.653412e+09,0,0 days 01:59:39,72


In [63]:
channel_stats = video_df.groupby('channelTitle').agg({
    'viewCount': 'sum',
    'likeCount': 'sum',
    'commentCount': 'sum'
}).reset_index()

In [64]:
channel_stats['engagementRate'] = (channel_stats['likeCount'] + channel_stats['commentCount']) / channel_stats['viewCount']

In [65]:
video_df.groupby('channelTitle')['durationSecs'].mean()

channelTitle
Alhpamales                        0 days 00:00:24
Benjamin Seda                     0 days 00:03:15
Better Bachelor                   0 days 00:22:59
Coach Corey Wayne                 0 days 00:10:59
FreshandFit                       0 days 01:01:46
Jordan B Peterson                 0 days 00:49:00
Paul Joseph Watson | Перевод      0 days 00:01:53
The Corbett Report (Unofficial)   0 days 00:38:00
The Distributist                  0 days 01:29:52
Name: durationSecs, dtype: timedelta64[s]

In [67]:
video_df['view_per_like'] = video_df['viewCount'] / video_df['likeCount']
video_df['comment_duration_interaction'] = video_df['commentCount'] * video_df['durationSecs']
video_df['view_per_like'] = video_df['view_per_like'].fillna(0)
video_df['popularity_score'] = video_df['viewCount'] + video_df['likeCount'] * 10 + video_df['commentCount'] * 20
video_df['commentRatio'] = video_df['commentCount'] / video_df['viewCount']
video_df['likeRatio'] = video_df['likeCount'] / video_df['viewCount']

In [68]:
video_df['title_sentiment'] = video_df['title'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [69]:
video_df['commentCount'].mean()

595.6180621572212

In [70]:
video_df[['viewCount', 'commentCount']].corr()

Unnamed: 0,viewCount,commentCount
viewCount,1.0,0.618975
commentCount,0.618975,1.0


In [71]:
video_df

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,...,publishedAt_timestamp,tagCount,durationSecs,titleLength,view_per_like,comment_duration_interaction,popularity_score,commentRatio,likeRatio,title_sentiment
0,F5eSaabAAmk,Benjamin Seda,How to ACTUALLY Get a Girlfriend in 2025 (Full...,👉🏼 Get 1-3+ dates per week in 30 days (coachin...,[],2025-03-06 15:27:49+00:00,5034.0,254.0,0.0,27.0,...,1.741275e+09,0,0 days 00:15:04,53,19.818898,0 days 06:46:48,8114.0,0.005364,0.050457,0.175000
1,xJ6b8CV-pQ0,Benjamin Seda,How to Find A 10/10 Girlfriend,👫 My 3 step formula to approach & attract wome...,[],2025-03-03 15:01:24+00:00,3346.0,330.0,0.0,22.0,...,1.741014e+09,0,0 days 00:00:59,30,10.139394,0 days 00:21:38,7086.0,0.006575,0.098625,0.000000
2,kPhrei5S88U,Benjamin Seda,The Mistake 99% of Men Make That Keep Them Single,👫 My 3 step formula to approach & attract wome...,[],2025-03-01 14:45:07+00:00,2690.0,222.0,0.0,19.0,...,1.740840e+09,0,0 days 00:00:36,49,12.117117,0 days 00:11:24,5290.0,0.007063,0.082528,-0.071429
3,4ZnwTwLcAeM,Benjamin Seda,How to Always Get That 2nd Date,👫 My 3 step formula to approach & attract wome...,[],2025-02-27 14:15:00+00:00,4060.0,413.0,0.0,9.0,...,1.740666e+09,0,0 days 00:00:46,31,9.830508,0 days 00:06:54,8370.0,0.002217,0.101724,0.000000
4,VW9-SBs6yIg,Benjamin Seda,The Donald Trump Method for Tinder (STEAL THIS),👫 My 3 step formula to approach & attract wome...,[],2025-02-26 13:45:03+00:00,6818.0,316.0,0.0,30.0,...,1.740578e+09,0,0 days 00:00:32,47,21.575949,0 days 00:16:00,10578.0,0.004400,0.046348,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13670,_X4P6L622-8,The Corbett Report (Unofficial),How Do I Defend Voluntarism? - Questions For C...,I have no affiliation with James Corbett or Th...,[],2022-05-25 13:22:06+00:00,393.0,15.0,0.0,2.0,...,1.653485e+09,0,0 days 00:28:13,52,26.200000,0 days 00:56:26,583.0,0.005089,0.038168,0.000000
13671,tkmZ4c2AOVY,The Corbett Report (Unofficial),The 5G Dragnet,I have no affiliation with James Corbett or Th...,[],2022-05-24 17:28:15+00:00,1226.0,61.0,0.0,2.0,...,1.653413e+09,0,0 days 00:25:44,14,20.098361,0 days 00:51:28,1876.0,0.001631,0.049755,0.000000
13672,mr7itEUIVew,The Corbett Report (Unofficial),False Flags: The Secret History of Al Qaeda — ...,I have no affiliation with James Corbett or Th...,[],2022-05-24 17:01:10+00:00,5531.0,179.0,0.0,16.0,...,1.653412e+09,0,0 days 01:16:19,66,30.899441,0 days 20:21:04,7641.0,0.002893,0.032363,-0.400000
13673,ochRNyIDTE8,The Corbett Report (Unofficial),Episode 409 - False Flags: The Secret History ...,I have no affiliation with James Corbett or Th...,[],2022-05-24 17:01:06+00:00,6132.0,212.0,0.0,16.0,...,1.653412e+09,0,0 days 01:59:39,72,28.924528,1 days 07:54:24,8572.0,0.002609,0.034573,-0.400000


In [72]:
channel_stats

Unnamed: 0,channelTitle,viewCount,likeCount,commentCount,engagementRate
0,Alhpamales,239.0,10.0,0.0,0.041841
1,Benjamin Seda,202773449.0,6456129.0,356756.0,0.033599
2,Better Bachelor,231797826.0,12936057.0,2997777.0,0.06874
3,Coach Corey Wayne,251196988.0,4876107.0,396843.0,0.020991
4,FreshandFit,221150653.0,9730118.0,819214.0,0.047702
5,Jordan B Peterson,950219566.0,31148819.0,3469047.0,0.036431
6,Paul Joseph Watson | Перевод,2178.0,44.0,0.0,0.020202
7,The Corbett Report (Unofficial),1124728.0,84132.0,12672.0,0.086069
8,The Distributist,6873745.0,248340.0,92768.0,0.049625


In [73]:
top_10_views = video_df.nlargest(10, 'viewCount')[['title', 'viewCount', 'channelTitle']]
print(top_10_views)

                                                   title   viewCount  \
9910           JBP X @MattRifeComedy.  Today at 5pm EST.  18502980.0   
10451  Lecture: Biblical Series I: Introduction to th...  13706650.0   
1463   THIS is How A Girl Wants You to TEXT HER | How...  11807649.0   
1473   7 Ways To INSTANTLY Look MORE ATTRACTIVE | How...   8246244.0   
9984                             COVID-19 Cause of Death   7952713.0   
10053  Talking to Muslims About Christ | Mohammed Hij...   7756906.0   
9973                            What Are Women Good For?   7656196.0   
10072         Africa is Not Poor Because of Colonization   7541337.0   
10389  Documentary: A Glitch in the Matrix (David Ful...   6605578.0   
90             If a Girl is Looking at You, Approach Her   6594217.0   

            channelTitle  
9910   Jordan B Peterson  
10451  Jordan B Peterson  
1463       Benjamin Seda  
1473       Benjamin Seda  
9984   Jordan B Peterson  
10053  Jordan B Peterson  
9973   Jordan B Pe

In [74]:
top_10_likes = video_df.nlargest(10, 'likeCount')[['title', 'likeCount', 'channelTitle']]
print(top_10_likes)

                                                   title  likeCount  \
9910           JBP X @MattRifeComedy.  Today at 5pm EST.   611913.0   
10290                                        Return Home   543720.0   
9955   The Fight Against Worldwide Child Slavery & th...   318009.0   
10110                               Article: Twitter Ban   315518.0   
9973                            What Are Women Good For?   305820.0   
9984                             COVID-19 Cause of Death   299559.0   
10087    Language Is Used as a Group Protection Strategy   232440.0   
114                 If A Girl is Looking at You, Do This   231402.0   
10451  Lecture: Biblical Series I: Introduction to th...   227120.0   
10072         Africa is Not Poor Because of Colonization   220386.0   

            channelTitle  
9910   Jordan B Peterson  
10290  Jordan B Peterson  
9955   Jordan B Peterson  
10110  Jordan B Peterson  
9973   Jordan B Peterson  
9984   Jordan B Peterson  
10087  Jordan B Peterson  
11

In [75]:
video_df.to_csv("dataFolder/processed/cleanedDataFrame.csv", index=False)