# Imports

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [58]:
pd.set_option('display.float_format', lambda x: '%.2f' % x) 

In [59]:
videos_stats = pd.read_csv('videos_stats.csv')
videos_stats

Unnamed: 0,Title,Video ID,Published At,Keyword,Likes,Comments,Views
0,Apple Pay Is Killing the Physical Wallet After...,wAZZ-UWGVHI,23/08/2022,tech,3407.00,672.00,135612.00
1,The most EXPENSIVE thing I own.,b3x28s61q3c,24/08/2022,tech,76779.00,4306.00,1758063.00
2,My New House Gaming Setup is SICK!,4mgePWWCAmA,23/08/2022,tech,63825.00,3338.00,1564007.00
3,Petrol Vs Liquid Nitrogen | Freezing Experimen...,kXiYSI7H2b0,23/08/2022,tech,71566.00,1426.00,922918.00
4,Best Back to School Tech 2022!,ErMwWXQxHp0,08/08/2022,tech,96513.00,5155.00,1855644.00
...,...,...,...,...,...,...,...
1876,Should You Learn Machine Learning?,AO6urf07KjE,14/06/2021,machine learning,10259.00,416.00,386360.00
1877,Todos podemos aprender Machine learning,7ClLKBUvmRk,08/10/2017,machine learning,2981.00,72.00,431421.00
1878,"Andrew Ng: Deep Learning, Education, and Real-...",0jspaMLxBig,20/02/2020,machine learning,5198.00,443.00,226152.00
1879,What is Machine Learning?,f_uwKZIAeM0,11/01/2017,machine learning,,,


In [60]:
comments = pd.read_csv('comments.csv')
comments

Unnamed: 0,Video ID,Comment,Likes,Sentiment
0,wAZZ-UWGVHI,Let's not forget that Apple Pay in 2014 requir...,95,1
1,wAZZ-UWGVHI,Here in NZ 50% of retailers don’t even have co...,19,0
2,wAZZ-UWGVHI,I will forever acknowledge this channel with t...,161,2
3,wAZZ-UWGVHI,Whenever I go to a place that doesn’t take App...,8,0
4,wAZZ-UWGVHI,"Apple Pay is so convenient, secure, and easy t...",34,2
...,...,...,...,...
18404,cyLWtMSry58,I really like the point about engineering tool...,0,2
18405,cyLWtMSry58,I’ve just started exploring this field. And th...,20,2
18406,cyLWtMSry58,Excelente video con una pregunta filosófica pr...,1,1
18407,cyLWtMSry58,"Hey Daniel, just discovered your channel a cou...",35,2


# Cleaning the data

looking for cells with invalid values

In [61]:
videos_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Title         1881 non-null   object 
 1   Video ID      1881 non-null   object 
 2   Published At  1881 non-null   object 
 3   Keyword       1881 non-null   object 
 4   Likes         1879 non-null   float64
 5   Comments      1879 non-null   float64
 6   Views         1879 non-null   float64
dtypes: float64(3), object(4)
memory usage: 103.0+ KB


In [62]:
videos_stats.isna().sum()

Title           0
Video ID        0
Published At    0
Keyword         0
Likes           2
Comments        2
Views           2
dtype: int64

In [63]:
videos_stats = videos_stats.dropna()

Checking if there is invalid values for the likes and the comments

In [64]:
videos_stats[(videos_stats['Likes']<=0) | ( videos_stats['Comments']<=0) ] 

Unnamed: 0,Title,Video ID,Published At,Keyword,Likes,Comments,Views
243,How To Build A Business That Works | Brian Tra...,MN7yfV4UuCI,09/02/2021,business,-1.0,1144.0,676300.0
266,ตลาดถุงมือยางทรุด! ฉุดราคาร่วงลงทุนหด | BUSINE...,cG37cEi1nPc,23/08/2022,business,-1.0,13.0,12332.0
583,DON&#39;T GOOGLE THIS STUFF,NAV2laKrDv4,28/08/2017,google,-1.0,18676.0,6515548.0
1016,15 Kilo Zayıf Gösteren Kız Makyajı |Arkadaşlar...,#NAME?,23/08/2022,mukbang,-1.0,335.0,27451.0
1393,Mathematics and Chemistry : MathChemistry.com ...,V_vguZj_7FE,15/04/2013,mathchemistry,0.0,1.0,25.0
1677,FULL Marvel Studios Panel from Hall H | San Di...,rf-NqJQJHBU,24/07/2022,marvel,-1.0,821.0,553892.0
1701,SURVIVOR 🦀🦂 Best Action Movies 2022 🦀🦂 Latest ...,__7MkaWFObQ,20/08/2022,movies,105.0,-1.0,28219.0
1709,BEST Auditions Of Songs From Movies | Amazing ...,ZcBBGC8_mfU,25/07/2022,movies,20138.0,-1.0,5351960.0
1710,The Magnificent Seven - Full Movie In English ...,1MZY_0tYmrs,29/12/2021,movies,-1.0,180.0,2389222.0
1711,TOP SECRET MISSION ⭐🌟 Best Action Movies 2022 ...,PIFa7wJ9c0c,18/07/2022,movies,643.0,0.0,24012.0


There is 11 entries with wrong values in either likes and comments 
we can fix it by first checking if those videos are actuall existing videos by searching or validating using the video Id which is the link of the video but, they are few so it's unecceary, we will drop them

In [65]:
videos_stats = videos_stats[(videos_stats['Likes']>0) & ( videos_stats['Comments']>0) ]

Checking the datatypes

In [93]:
videos_stats.dtypes

Title                   object
Video ID                object
Published At    datetime64[ns]
Keyword                 object
Likes                    int64
Comments                 int64
Views                    int64
dtype: object

In [67]:
videos_stats['Published At'] = pd.to_datetime(videos_stats ['Published At'], format='%d/%m/%Y', errors='coerce')

changing the data type of  views, likes and comments to int

In [91]:
# transforming columns from float to int64
videos_stats['Views'] = videos_stats['Views'].astype('int64')
videos_stats['Likes'] = videos_stats['Likes'].astype('int64')
videos_stats['Comments'] = videos_stats['Comments'].astype('int64')

In [92]:
videos_stats.describe()

Unnamed: 0,Likes,Comments,Views
count,1868.0,1868.0,1868.0
mean,171051.27,7898.25,11672135.57
std,798466.37,37986.47,108761106.89
min,1.0,1.0,63.0
25%,2705.25,201.0,85156.25
50%,15011.0,820.0,591947.0
75%,61091.75,3411.5,2814055.25
max,16445558.0,732818.0,4034122271.0


In [94]:
# Check for empty or whitespace-only titles
empty_titles = videos_stats['Title'].str.strip() == ''

# Check for non-string titles
non_string_titles = videos_stats['Title'].apply(lambda x: not isinstance(x, str))


In [95]:
videos_stats[empty_titles ]

Unnamed: 0,Title,Video ID,Published At,Keyword,Likes,Comments,Views


In [96]:
videos_stats[non_string_titles]

Unnamed: 0,Title,Video ID,Published At,Keyword,Likes,Comments,Views


Also we will find in some titles text like "& #39;" which  HTML entity representing the apostrophe character

The ID length should be 11 characters,so if we checked for enteries that doesn't met this critaria, we will find some with missing video Id, but they are still valid videos so we will keep it

In [97]:
invalid_length_video_ids = videos_stats['Video ID'].apply(lambda x: len(x) != 11)
videos_stats[invalid_length_video_ids]

Unnamed: 0,Title,Video ID,Published At,Keyword,Likes,Comments,Views
36,My Everyday Tech: 2022!,#NAME?,2022-01-07,tech,134194,6102,4098623
44,Sushi Chef Answers Sushi Questions From Twitte...,#NAME?,2022-05-26,tech,77411,2098,2474364
45,Futurum Research&#39;s Daniel Newman breaks do...,#NAME?,2022-08-24,tech,24,45,3461
48,How Biden&#39;s student loan forgiveness progr...,#NAME?,2022-08-24,news,1029,2347,97434
71,Deadly car bomb detonates outside Moscow,#NAME?,2022-08-22,news,6379,4853,808787
105,🎧 FOLLOW ME ON LOCO | ACHANAK BAYANAK GAMING,#NAME?,2022-08-24,gaming,1525,36,14877
119,New PS4 &amp; PS5 Games This Week,#NAME?,2022-08-23,gaming,789,77,19223
157,All Sports Golf Battle 2 | Dude Perfect,#NAME?,2017-12-04,sports,867074,43024,106014469
238,DON&#39;T USE DOMAIN.COM Before Watch THIS VID...,#NAME?,2022-08-01,business,114,2,2270
493,Champions Chess Tour: FTX Crypto Cup | Day 2 |...,#NAME?,2022-08-16,chess,1234,44,92760


In [98]:
# saving the video stats to  a new csv file called cleaned_video_stats.csv
videos_stats.to_csv('cleaned_video_stats.csv', index=False)


# Exploratory Data Analysis (EDA)

## Summary statistics

In [99]:
videos_stats.describe()

Unnamed: 0,Likes,Comments,Views
count,1868.0,1868.0,1868.0
mean,171051.27,7898.25,11672135.57
std,798466.37,37986.47,108761106.89
min,1.0,1.0,63.0
25%,2705.25,201.0,85156.25
50%,15011.0,820.0,591947.0
75%,61091.75,3411.5,2814055.25
max,16445558.0,732818.0,4034122271.0


The dataset provides the following insights from the summary statistics:

The dataset contains 1,868 videos.

Likes: The number of likes ranges from 1 to over 16 million, with a mean of approximately 171,000 likes, with a high standard deviation indicating a wide range of view counts.

Comments: The number of comments ranges from 1 to over 732,000, with a mean of approximately 7,898 comments.

Views: The number of views ranges from 63 to over 4 billion, with a mean of approximately 11.67 million views.

The median values for views, likes, and comments are significantly lower than the mean, suggesting that a few videos with extremely high engagement skew the data.

In [120]:
# Group by keyword and calculate mean engagement metrics
keyword_engagement = videos_stats.groupby('Keyword').mean()[['Views', 'Likes', 'Comments']].reset_index()

keyword_engagement.sort_values(by='Views', ascending=False)


  keyword_engagement = videos_stats.groupby('Keyword').mean()[['Views', 'Likes', 'Comments']].reset_index()


Unnamed: 0,Keyword,Views,Likes,Comments
17,google,105566093.91,481875.86,22736.43
0,animals,94723960.92,760775.79,21871.82
28,mrbeast,66764003.98,2105914.36,95944.48
3,bed,53893228.61,473682.18,15180.95
30,music,29364893.26,314188.11,12288.93
18,history,15047130.12,273410.66,20190.02
10,cubes,15038739.34,303061.2,6833.08
29,mukbang,11151984.2,144620.45,7613.11
1,apple,10746930.45,118078.9,8618.86
36,sports,8601204.73,90518.88,3828.02
