# Data Cleaning and Preprocessing

In [1]:
!pip install TextBlob

[0m

In [2]:
import pandas as pd
import numpy as np
import isodate
import ast
from isodate import parse_duration
from textblob import TextBlob

In [3]:
video_df = pd.read_csv("dataFolder/raw/redPillAnalytics.csv")

In [4]:
df1 = pd.read_csv("dataFolder/raw/commentsBatchOne_df.csv")
df2 = pd.read_csv("dataFolder/raw/commentsBatchTwo_df.csv")
comments_df = pd.concat([df1, df2], ignore_index=True)

In [5]:
df_filtered = video_df[['video_id', 'channelTitle']]

In [6]:
comments_df = df_filtered.merge(comments_df, on='video_id', how='inner').copy()
comments_df

Unnamed: 0,video_id,channelTitle,comment,published_at
0,F5eSaabAAmk,Benjamin Seda,big boobs lmao,2025-03-10T00:36:43Z
1,F5eSaabAAmk,Benjamin Seda,"This will work for a specific type of woman, o...",2025-03-09T23:01:56Z
2,F5eSaabAAmk,Benjamin Seda,Can you do a video on what to do if you enco...,2025-03-09T07:13:54Z
3,F5eSaabAAmk,Benjamin Seda,God of the Dates 🤍,2025-03-08T14:57:50Z
4,F5eSaabAAmk,Benjamin Seda,"About cold approaches, it's just not true. I d...",2025-03-07T19:06:08Z
...,...,...,...,...
668718,Jyjqw_HwXVg,The Corbett Report (Unofficial),"Make everyone you know, or are even slightly r...",2022-05-25T12:19:52Z
668719,Jyjqw_HwXVg,The Corbett Report (Unofficial),The Green Scheme is Green Death!,2022-05-25T12:15:15Z
668720,Jyjqw_HwXVg,The Corbett Report (Unofficial),"a real timeless classic, perfect choice",2022-05-23T18:30:31Z
668721,Jyjqw_HwXVg,The Corbett Report (Unofficial),This is great stuff,2022-05-23T16:13:58Z


In [7]:
comments_df.isnull().any()

video_id        False
channelTitle    False
comment          True
published_at    False
dtype: bool

In [8]:
comments_df.dtypes

video_id        object
channelTitle    object
comment         object
published_at    object
dtype: object

In [9]:
comments_df.isnull().sum()
comments_df.dropna()
comments_df.isin([0, np.nan]).sum()

video_id         0
channelTitle     0
comment         51
published_at     0
dtype: int64

In [10]:
comments_df = comments_df.dropna(subset=['comment'])
print(comments_df.isnull().sum())  # Verify no nulls in 'comment'

video_id        0
channelTitle    0
comment         0
published_at    0
dtype: int64


In [11]:
video_df.isnull().any()

video_id          False
channelTitle      False
title             False
description        True
tags               True
publishedAt       False
viewCount         False
likeCount          True
favouriteCount     True
commentCount       True
duration          False
definition        False
caption           False
dtype: bool

In [12]:
video_df.dtypes

video_id           object
channelTitle       object
title              object
description        object
tags               object
publishedAt        object
viewCount           int64
likeCount         float64
favouriteCount    float64
commentCount      float64
duration           object
definition         object
caption              bool
dtype: object

In [13]:
video_df.isnull().sum()
video_df.dropna()
video_df.isin([0, np.nan]).sum()

video_id              0
channelTitle          0
title                 0
description         394
tags               6314
publishedAt           0
viewCount             3
likeCount            23
favouriteCount    13675
commentCount        150
duration              0
definition            0
caption           13246
dtype: int64

In [14]:
video_df.isnull().sum()

video_id              0
channelTitle          0
title                 0
description         394
tags               6314
publishedAt           0
viewCount             0
likeCount            16
favouriteCount    13675
commentCount          9
duration              0
definition            0
caption               0
dtype: int64

In [15]:
video_df['description'] = video_df['description'].fillna("")

# Convert string representations of tags to lists and fill missing values with empty lists
def safe_literal_eval(x):
    try:
        return ast.literal_eval(x) if isinstance(x, str) else []
    except (ValueError, SyntaxError):
        return []

video_df['tags'] = video_df['tags'].apply(safe_literal_eval)

# Fill missing likeCount, favouriteCount, and commentCount with 0
video_df['likeCount'] = video_df['likeCount'].fillna(0)
video_df['favouriteCount'] = video_df['favouriteCount'].fillna(0)
video_df['commentCount'] = video_df['commentCount'].fillna(0)

# Verify no null values remain
print(video_df.isnull().sum())

video_id          0
channelTitle      0
title             0
description       0
tags              0
publishedAt       0
viewCount         0
likeCount         0
favouriteCount    0
commentCount      0
duration          0
definition        0
caption           0
dtype: int64


In [16]:
print(video_df['tags'].head())

0    [how to flirt with a girl, dates, how to get a...
1    [how to flirt with a girl, dates, how to get a...
2    [how to flirt with a girl, dates, how to get a...
3    [how to flirt with a girl, dates, how to get a...
4    [how to flirt with a girl, dates, how to get a...
Name: tags, dtype: object


In [17]:
numeric_cols = ['viewCount', 'likeCount', 'favouriteCount', 'commentCount']
video_df[numeric_cols] = video_df[numeric_cols].apply(pd.to_numeric, errors = 'coerce', axis = 1)

In [18]:
# Convert publishedAt to datetime
video_df['publishedAt'] = pd.to_datetime(video_df['publishedAt']).dt.tz_localize(None)

# Extract day of the week
video_df['publishDayName'] = video_df['publishedAt'].dt.strftime("%A")

# Convert publishedAt to Unix timestamp
video_df['publishedAt_timestamp'] = video_df['publishedAt'].astype('int64') / 10**9

# Count tags
video_df['tagCount'] = video_df['tags'].apply(lambda x: len(x) if isinstance(x, list) else [])

# Convert duration to seconds
video_df['durationSecs'] = video_df['duration'].apply(lambda x: isodate.parse_duration(x).total_seconds())

video_df['durationSecs'] = video_df['duration'].apply(
    lambda x: isodate.parse_duration(x).total_seconds() if pd.notnull(x) else 0
)

# Calculate title length
video_df['titleLength'] = video_df['title'].apply(lambda x: len(x))

In [19]:
video_df

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,publishDayName,publishedAt_timestamp,tagCount,durationSecs,titleLength
0,F5eSaabAAmk,Benjamin Seda,How to ACTUALLY Get a Girlfriend in 2025 (Full...,👉🏼 Get 1-3+ dates per week in 30 days (coachin...,"[how to flirt with a girl, dates, how to get a...",2025-03-06 15:27:49,5034.0,254.0,0.0,27.0,PT15M4S,hd,False,Thursday,1.741275e+09,21,904.0,53
1,xJ6b8CV-pQ0,Benjamin Seda,How to Find A 10/10 Girlfriend,👫 My 3 step formula to approach & attract wome...,"[how to flirt with a girl, dates, how to get a...",2025-03-03 15:01:24,3346.0,330.0,0.0,22.0,PT59S,hd,False,Monday,1.741014e+09,13,59.0,30
2,kPhrei5S88U,Benjamin Seda,The Mistake 99% of Men Make That Keep Them Single,👫 My 3 step formula to approach & attract wome...,"[how to flirt with a girl, dates, how to get a...",2025-03-01 14:45:07,2690.0,222.0,0.0,19.0,PT36S,hd,False,Saturday,1.740840e+09,13,36.0,49
3,4ZnwTwLcAeM,Benjamin Seda,How to Always Get That 2nd Date,👫 My 3 step formula to approach & attract wome...,"[how to flirt with a girl, dates, how to get a...",2025-02-27 14:15:00,4060.0,413.0,0.0,9.0,PT46S,hd,False,Thursday,1.740666e+09,13,46.0,31
4,VW9-SBs6yIg,Benjamin Seda,The Donald Trump Method for Tinder (STEAL THIS),👫 My 3 step formula to approach & attract wome...,"[how to flirt with a girl, dates, how to get a...",2025-02-26 13:45:03,6818.0,316.0,0.0,30.0,PT32S,hd,False,Wednesday,1.740578e+09,13,32.0,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13670,_X4P6L622-8,The Corbett Report (Unofficial),How Do I Defend Voluntarism? - Questions For C...,I have no affiliation with James Corbett or Th...,[],2022-05-25 13:22:06,393.0,15.0,0.0,2.0,PT28M13S,hd,False,Wednesday,1.653485e+09,0,1693.0,52
13671,tkmZ4c2AOVY,The Corbett Report (Unofficial),The 5G Dragnet,I have no affiliation with James Corbett or Th...,[],2022-05-24 17:28:15,1226.0,61.0,0.0,2.0,PT25M44S,sd,False,Tuesday,1.653413e+09,0,1544.0,14
13672,mr7itEUIVew,The Corbett Report (Unofficial),False Flags: The Secret History of Al Qaeda — ...,I have no affiliation with James Corbett or Th...,[],2022-05-24 17:01:10,5531.0,179.0,0.0,16.0,PT1H16M19S,sd,False,Tuesday,1.653412e+09,0,4579.0,66
13673,ochRNyIDTE8,The Corbett Report (Unofficial),Episode 409 - False Flags: The Secret History ...,I have no affiliation with James Corbett or Th...,[],2022-05-24 17:01:06,6132.0,212.0,0.0,16.0,PT1H59M39S,sd,False,Tuesday,1.653412e+09,0,7179.0,72


In [20]:
channel_stats = video_df.groupby('channelTitle').agg({
    'viewCount': 'sum',
    'likeCount': 'sum',
    'commentCount': 'sum'
}).reset_index()

In [21]:
channel_stats['engagementRate'] = (channel_stats['likeCount'] + channel_stats['commentCount']) / channel_stats['viewCount']

In [22]:
channel_stats

Unnamed: 0,channelTitle,viewCount,likeCount,commentCount,engagementRate
0,Alhpamales,239.0,10.0,0.0,0.041841
1,Benjamin Seda,202773449.0,6456129.0,356756.0,0.033599
2,Better Bachelor,231797826.0,12936057.0,2997777.0,0.06874
3,Coach Corey Wayne,251196988.0,4876107.0,396843.0,0.020991
4,FreshandFit,221150653.0,9730118.0,819214.0,0.047702
5,Jordan B Peterson,950219566.0,31148819.0,3469047.0,0.036431
6,Paul Joseph Watson | Перевод,2178.0,44.0,0.0,0.020202
7,The Corbett Report (Unofficial),1124728.0,84132.0,12672.0,0.086069
8,The Distributist,6873745.0,248340.0,92768.0,0.049625


## Breakdown of Channels

| Channel                        | View Count   | Engagement Rate | Relevance to Red Pill                              |
|--------------------------------|--------------|-----------------|---------------------------------------------------|
| Benjamin Seda                  | 202,773,449  | 3.36%           | Likely relevant (focus on dating, masculinity, and self-improvement) |
| Better Bachelor                | 231,797,826  | 6.87%           | Highly relevant (focus on men’s rights, dating, and anti-feminism) |
| Coach Corey Wayne              | 251,196,988  | 2.10%           | Relevant (focus on dating advice and relationships) |
| FreshandFit                    | 221,150,653  | 4.77%           | Highly relevant (focus on dating, gender dynamics, and masculinity) |
| Jordan B Peterson              | 950,219,566  | 3.64%           | Partially relevant (focus on psychology, self-improvement, and gender roles) |
| The Corbett Report (Unofficial)| 1,124,728    | 8.61%           | Less relevant (focus on conspiracy theories, not directly Red Pill) |
| The Distributist               | 6,873,745    | 4.96%           | Less relevant (focus on traditionalism and economics, not directly Red Pill) |
| Alhpamales                     | 239          | 4.18%           | Likely irrelevant (very low view count, unclear relevance) |
| Paul Joseph Watson | Перевод     | 2,178          | 2.02%           | Likely irrelevant (low view count, unclear relevance) |

## Recommendations

### Channels to Keep (Highly Relevant to Red Pill):
- **Better Bachelor**
- **FreshandFit**
- **Benjamin Seda**
- **Coach Corey Wayne**
- **Jordan B Peterson** (if you consider his content relevant to Red Pill themes)

### Channels to Exclude (Less Relevant or Low Engagement):
- **The Corbett Report (Unofficial)** (focuses on conspiracy theories, not Red Pill)
- **The Distributist** (focuses on traditionalism/economics, not Red Pill)
- **Alhpamales** and **Paul Joseph Watson | Перевод** (very low engagement, unclear relevance)

In [23]:
excluded_channels = [
    "The Corbett Report (Unofficial)", 
    "The Distributist", 
    "Alhpamales", 
    "Paul Joseph Watson | Перевод"
]

video_df = video_df[~video_df["channelTitle"].isin(excluded_channels)].copy()

In [19]:
comments_df = comments_df[~comments_df["channelTitle"].isin(excluded_channels)].copy()

In [24]:
video_df.groupby('channelTitle')['durationSecs'].mean()

channelTitle
Benjamin Seda         195.985205
Better Bachelor      1379.782349
Coach Corey Wayne     659.063435
FreshandFit          3706.578228
Jordan B Peterson    2940.362724
Name: durationSecs, dtype: float64

In [25]:
# Calculate view_per_like (handle division by zero)
video_df['view_per_like'] = np.where(
    video_df['likeCount'] != 0,  # Condition: likeCount is not zero
    video_df['viewCount'] / video_df['likeCount'],  # True: Perform division
    0  # False: Set to 0 if likeCount is zero
)

# Calculate comment_duration_interaction
video_df['comment_duration_interaction'] = video_df['commentCount'] * video_df['durationSecs']

# Calculate popularity_score
video_df['popularity_score'] = (
    video_df['viewCount'] + 
    video_df['likeCount'] * 10 + 
    video_df['commentCount'] * 20
)

# Calculate commentRatio (handle division by zero)
video_df['commentRatio'] = np.where(
    video_df['viewCount'] != 0,  # Condition: viewCount is not zero
    video_df['commentCount'] / video_df['viewCount'],  # True: Perform division
    0  # False: Set to 0 if viewCount is zero
)

# Calculate likeRatio (handle division by zero)
video_df['likeRatio'] = np.where(
    video_df['viewCount'] != 0,  # Condition: viewCount is not zero
    video_df['likeCount'] / video_df['viewCount'],  # True: Perform division
    0  # False: Set to 0 if viewCount is zero
)

# Fill any remaining NaN values with 0 (if needed)
video_df = video_df.fillna(0)

In [26]:
video_df['title_sentiment'] = video_df['title'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [27]:
video_df['commentCount'].mean()

623.0344854308742

In [28]:
video_df[['viewCount', 'commentCount']].corr()

Unnamed: 0,viewCount,commentCount
viewCount,1.0,0.617668
commentCount,0.617668,1.0


In [29]:
video_df

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,...,publishedAt_timestamp,tagCount,durationSecs,titleLength,view_per_like,comment_duration_interaction,popularity_score,commentRatio,likeRatio,title_sentiment
0,F5eSaabAAmk,Benjamin Seda,How to ACTUALLY Get a Girlfriend in 2025 (Full...,👉🏼 Get 1-3+ dates per week in 30 days (coachin...,"[how to flirt with a girl, dates, how to get a...",2025-03-06 15:27:49,5034.0,254.0,0.0,27.0,...,1.741275e+09,21,904.0,53,19.818898,24408.0,8114.0,0.005364,0.050457,0.175000
1,xJ6b8CV-pQ0,Benjamin Seda,How to Find A 10/10 Girlfriend,👫 My 3 step formula to approach & attract wome...,"[how to flirt with a girl, dates, how to get a...",2025-03-03 15:01:24,3346.0,330.0,0.0,22.0,...,1.741014e+09,13,59.0,30,10.139394,1298.0,7086.0,0.006575,0.098625,0.000000
2,kPhrei5S88U,Benjamin Seda,The Mistake 99% of Men Make That Keep Them Single,👫 My 3 step formula to approach & attract wome...,"[how to flirt with a girl, dates, how to get a...",2025-03-01 14:45:07,2690.0,222.0,0.0,19.0,...,1.740840e+09,13,36.0,49,12.117117,684.0,5290.0,0.007063,0.082528,-0.071429
3,4ZnwTwLcAeM,Benjamin Seda,How to Always Get That 2nd Date,👫 My 3 step formula to approach & attract wome...,"[how to flirt with a girl, dates, how to get a...",2025-02-27 14:15:00,4060.0,413.0,0.0,9.0,...,1.740666e+09,13,46.0,31,9.830508,414.0,8370.0,0.002217,0.101724,0.000000
4,VW9-SBs6yIg,Benjamin Seda,The Donald Trump Method for Tinder (STEAL THIS),👫 My 3 step formula to approach & attract wome...,"[how to flirt with a girl, dates, how to get a...",2025-02-26 13:45:03,6818.0,316.0,0.0,30.0,...,1.740578e+09,13,32.0,47,21.575949,960.0,10578.0,0.004400,0.046348,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12917,KO7Z0HdxIek,FreshandFit,The Most OPTIMAL Rep Range For STRENGTH,Many people struggle to determine the best rep...,"[repetition range, how to lift weights, how to...",2020-06-25 00:36:00,9633.0,601.0,0.0,21.0,...,1.593045e+09,23,258.0,39,16.028286,5418.0,16063.0,0.002180,0.062390,0.500000
12918,3LXo6A-JnV4,FreshandFit,What's better for fat loss? Low carb or high c...,Today we answer the age old question. Are high...,"[fatloss, lowcarb, weight loss, keto, evidence...",2020-06-20 14:45:11,9209.0,490.0,0.0,26.0,...,1.592664e+09,23,193.0,50,18.793878,5018.0,14629.0,0.002823,0.053209,0.220000
12919,e9Gdl-szTg4,FreshandFit,Are Fitness/Calorie Tracking Apps Accurate? Th...,Are these popular apps bringing you closer to ...,[redpill fitness hypergamy #gainz],2020-06-13 15:00:11,4787.0,242.0,0.0,10.0,...,1.592060e+09,1,271.0,66,19.780992,2710.0,7407.0,0.002089,0.050554,-0.163889
12920,RHlPDYsuBYs,FreshandFit,IS FASTING SUPERIOR? What the science says...,Does fasting build more muscle or help burn mo...,"[fasting, fitness, aesthetic]",2020-05-30 15:00:27,23952.0,1353.0,0.0,64.0,...,1.590851e+09,3,426.0,45,17.702882,27264.0,38762.0,0.002672,0.056488,0.700000


In [30]:
top_10_views = video_df.nlargest(10, 'viewCount')[['title', 'viewCount', 'channelTitle']]
print(top_10_views)

                                                   title   viewCount  \
9910           JBP X @MattRifeComedy.  Today at 5pm EST.  18502980.0   
10451  Lecture: Biblical Series I: Introduction to th...  13706650.0   
1463   THIS is How A Girl Wants You to TEXT HER | How...  11807649.0   
1473   7 Ways To INSTANTLY Look MORE ATTRACTIVE | How...   8246244.0   
9984                             COVID-19 Cause of Death   7952713.0   
10053  Talking to Muslims About Christ | Mohammed Hij...   7756906.0   
9973                            What Are Women Good For?   7656196.0   
10072         Africa is Not Poor Because of Colonization   7541337.0   
10389  Documentary: A Glitch in the Matrix (David Ful...   6605578.0   
90             If a Girl is Looking at You, Approach Her   6594217.0   

            channelTitle  
9910   Jordan B Peterson  
10451  Jordan B Peterson  
1463       Benjamin Seda  
1473       Benjamin Seda  
9984   Jordan B Peterson  
10053  Jordan B Peterson  
9973   Jordan B Pe

In [31]:
top_10_likes = video_df.nlargest(10, 'likeCount')[['title', 'likeCount', 'channelTitle']]
print(top_10_likes)

                                                   title  likeCount  \
9910           JBP X @MattRifeComedy.  Today at 5pm EST.   611913.0   
10290                                        Return Home   543720.0   
9955   The Fight Against Worldwide Child Slavery & th...   318009.0   
10110                               Article: Twitter Ban   315518.0   
9973                            What Are Women Good For?   305820.0   
9984                             COVID-19 Cause of Death   299559.0   
10087    Language Is Used as a Group Protection Strategy   232440.0   
114                 If A Girl is Looking at You, Do This   231402.0   
10451  Lecture: Biblical Series I: Introduction to th...   227120.0   
10072         Africa is Not Poor Because of Colonization   220386.0   

            channelTitle  
9910   Jordan B Peterson  
10290  Jordan B Peterson  
9955   Jordan B Peterson  
10110  Jordan B Peterson  
9973   Jordan B Peterson  
9984   Jordan B Peterson  
10087  Jordan B Peterson  
11

In [32]:
video_df.to_csv("dataFolder/processed/cleanedDataFrame.csv", index=False)

In [21]:
comments_df.to_csv("dataFolder/processed/cleanedComments.csv", index=False)