In [57]:
import pandas as pd;

In [69]:
# Create a single dataframe with the concatenation of all input csv files, adding a column called country
countries = ['CA', 'DE', 'FR', 'GB', 'IN', 'JP', 'KR', 'MX', 'RU', 'US']
dfs = []

for country in countries:
    df = pd.read_csv(f'trendingYT/{country}videos.csv.zst', compression='zstd', encoding='utf-8', encoding_errors='ignore')
    df['country'] = country
    dfs.append(df)

df_all = pd.concat(dfs, ignore_index=True)

In [71]:
df_all.shape

(375942, 17)

In [72]:
# Extract all videos that have no tag.
df_no_tags = df_all[df_all['tags'].isna() | (df_all['tags'] == '[none]') | (df_all['tags'] == '')]
print(f"Videos with no tags: {len(df_no_tags)}")
# df_no_tags.to_csv('videos_no_tags.csv', index=False)

Videos with no tags: 37698


In [73]:
# For each channel, determine the total number of views
channel_views = df_all.groupby('channel_title')['views'].sum().sort_values(ascending=False)
channel_views

channel_title
ChildishGambinoVEVO     11016766510
Marvel Entertainment    10430605449
NickyJamTV               9479859505
Ozuna                    8623329509
ibighit                  8205572221
                           ...     
NavylittleMonster               365
Videostendencias                302
No Comment TV                   284
Sport Life                      163
Alexander Redking               153
Name: views, Length: 37824, dtype: int64

In [74]:
# Save all rows with disabled comments and disabled ratings, or that have video_error_or_removed in a new dataframe called excluded, and remove those rows from the original dataframe.
excluded = df_all[
    (df_all['comments_disabled'] == True) & (df_all['ratings_disabled'] == True) |
    (df_all['video_error_or_removed'] == True)
]

print (excluded.shape)

df_all = df_all[~df_all.index.isin(excluded.index)]
df_all.shape

(2620, 17)


(373322, 17)

In [75]:
# Add a like_ratio column storing the ratio between the number of likes and of dislikes
df_all.loc[:,'like_ratio'] = df_all['likes'] / df_all['dislikes'].replace(0, pd.NA)
df_all

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all.loc[:,'like_ratio'] = df_all['likes'] / df_all['dislikes'].replace(0, pd.NA)


Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,country,like_ratio
0,n1WpP7iowLc,17.14.11,Eminem - Walk On Water (Audio) ft. Beyonc√©,EminemVEVO,10,2017-11-10T17:00:03.000Z,"Eminem|""Walk""|""On""|""Water""|""Aftermath/Shady/In...",17158579,787425,43420,125882,https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg,False,False,False,Eminem's new track Walk on Water ft. Beyonc√© i...,CA,18.135076
1,0dBIkQ4Mz1M,17.14.11,PLUSH - Bad Unboxing Fan Mail,iDubbbzTV,23,2017-11-13T17:00:00.000Z,"plush|""bad unboxing""|""unboxing""|""fan mail""|""id...",1014651,127794,1688,13030,https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg,False,False,False,STill got a lot of packages. Probably will las...,CA,75.707346
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146035,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ‚ñ∂ \n\nSUBSCRIBE ‚ñ∫ http...,CA,27.3525
3,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095828,132239,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...,CA,66.485168
4,2Vv-BfVoq4g,17.14.11,Ed Sheeran - Perfect (Official Music Video),Ed Sheeran,10,2017-11-09T11:04:14.000Z,"edsheeran|""ed sheeran""|""acoustic""|""live""|""cove...",33523622,1634130,21082,85067,https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg,False,False,False,üéß: https://ad.gt/yt-perfect\nüí∞: https://atlant...,CA,77.513044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375937,BZt0qjTWNhw,18.14.06,The Cat Who Caught the Laser,AaronsAnimals,15,2018-05-18T13:00:04.000Z,"aarons animals|""aarons""|""animals""|""cat""|""cats""...",1685609,38160,1385,2657,https://i.ytimg.com/vi/BZt0qjTWNhw/default.jpg,False,False,False,The Cat Who Caught the Laser - Aaron's Animals,US,27.552347
375938,1h7KV2sjUWY,18.14.06,True Facts : Ant Mutualism,zefrank1,22,2018-05-18T01:00:06.000Z,[none],1064798,60008,382,3936,https://i.ytimg.com/vi/1h7KV2sjUWY/default.jpg,False,False,False,,US,157.089005
375939,D6Oy4LfoqsU,18.14.06,I GAVE SAFIYA NYGAARD A PERFECT HAIR MAKEOVER ...,Brad Mondo,24,2018-05-18T17:34:22.000Z,I gave safiya nygaard a perfect hair makeover ...,1066451,48068,1032,3992,https://i.ytimg.com/vi/D6Oy4LfoqsU/default.jpg,False,False,False,I had so much fun transforming Safiyas hair in...,US,46.577519
375940,oV0zkMe1K8s,18.14.06,How Black Panther Should Have Ended,How It Should Have Ended,1,2018-05-17T17:00:04.000Z,"Black Panther|""HISHE""|""Marvel""|""Infinity War""|...",5660813,192957,2846,13088,https://i.ytimg.com/vi/oV0zkMe1K8s/default.jpg,False,False,False,How Black Panther Should Have EndedWatch More ...,US,67.799368


In [76]:
# Cluster the publish time into 10-minute intervals (e.g. from 02:20 to 02:30)
df_all['publish_time'] = pd.to_datetime(df_all['publish_time'])

floor_times = df_all['publish_time'].dt.floor('10min')
ceil_times = floor_times + pd.Timedelta(minutes=10)

df_all['interval'] = floor_times.dt.strftime('%H:%M') + ' to ' + ceil_times.dt.strftime('%H:%M')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all['publish_time'] = pd.to_datetime(df_all['publish_time'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all['interval'] = floor_times.dt.strftime('%H:%M') + ' to ' + ceil_times.dt.strftime('%H:%M')


In [77]:
df_all

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,country,like_ratio,interval
0,n1WpP7iowLc,17.14.11,Eminem - Walk On Water (Audio) ft. Beyonc√©,EminemVEVO,10,2017-11-10 17:00:03+00:00,"Eminem|""Walk""|""On""|""Water""|""Aftermath/Shady/In...",17158579,787425,43420,125882,https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg,False,False,False,Eminem's new track Walk on Water ft. Beyonc√© i...,CA,18.135076,17:00 to 17:10
1,0dBIkQ4Mz1M,17.14.11,PLUSH - Bad Unboxing Fan Mail,iDubbbzTV,23,2017-11-13 17:00:00+00:00,"plush|""bad unboxing""|""unboxing""|""fan mail""|""id...",1014651,127794,1688,13030,https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg,False,False,False,STill got a lot of packages. Probably will las...,CA,75.707346,17:00 to 17:10
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12 19:05:24+00:00,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146035,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ‚ñ∂ \n\nSUBSCRIBE ‚ñ∫ http...,CA,27.3525,19:00 to 19:10
3,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12 18:01:41+00:00,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095828,132239,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...,CA,66.485168,18:00 to 18:10
4,2Vv-BfVoq4g,17.14.11,Ed Sheeran - Perfect (Official Music Video),Ed Sheeran,10,2017-11-09 11:04:14+00:00,"edsheeran|""ed sheeran""|""acoustic""|""live""|""cove...",33523622,1634130,21082,85067,https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg,False,False,False,üéß: https://ad.gt/yt-perfect\nüí∞: https://atlant...,CA,77.513044,11:00 to 11:10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375937,BZt0qjTWNhw,18.14.06,The Cat Who Caught the Laser,AaronsAnimals,15,2018-05-18 13:00:04+00:00,"aarons animals|""aarons""|""animals""|""cat""|""cats""...",1685609,38160,1385,2657,https://i.ytimg.com/vi/BZt0qjTWNhw/default.jpg,False,False,False,The Cat Who Caught the Laser - Aaron's Animals,US,27.552347,13:00 to 13:10
375938,1h7KV2sjUWY,18.14.06,True Facts : Ant Mutualism,zefrank1,22,2018-05-18 01:00:06+00:00,[none],1064798,60008,382,3936,https://i.ytimg.com/vi/1h7KV2sjUWY/default.jpg,False,False,False,,US,157.089005,01:00 to 01:10
375939,D6Oy4LfoqsU,18.14.06,I GAVE SAFIYA NYGAARD A PERFECT HAIR MAKEOVER ...,Brad Mondo,24,2018-05-18 17:34:22+00:00,I gave safiya nygaard a perfect hair makeover ...,1066451,48068,1032,3992,https://i.ytimg.com/vi/D6Oy4LfoqsU/default.jpg,False,False,False,I had so much fun transforming Safiyas hair in...,US,46.577519,17:30 to 17:40
375940,oV0zkMe1K8s,18.14.06,How Black Panther Should Have Ended,How It Should Have Ended,1,2018-05-17 17:00:04+00:00,"Black Panther|""HISHE""|""Marvel""|""Infinity War""|...",5660813,192957,2846,13088,https://i.ytimg.com/vi/oV0zkMe1K8s/default.jpg,False,False,False,How Black Panther Should Have EndedWatch More ...,US,67.799368,17:00 to 17:10


In [78]:
# For each interval, determine the number of videos, average number of likes and of dislikes.

df_all.groupby('interval').agg({
    'video_id': 'count',
    'likes': 'mean',
    'dislikes': 'mean'
}).rename(columns={'video_id': 'num videos', 'likes': 'avg likes', 'dislikes': 'avg dislikes'})

Unnamed: 0_level_0,num videos,avg likes,avg dislikes
interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00:00 to 00:10,2897,61288.115637,3808.149465
00:10 to 00:20,1509,22748.138502,1449.836315
00:20 to 00:30,1241,21378.280419,1072.344883
00:30 to 00:40,1614,36853.560719,955.890954
00:40 to 00:50,1269,42198.623325,1909.301812
...,...,...,...
23:10 to 23:20,1554,22783.840412,1110.168597
23:20 to 23:30,1438,30696.510431,1177.020862
23:30 to 23:40,1666,18159.173469,874.184874
23:40 to 23:50,1466,45736.581855,9341.549795


In [96]:
#  For each tag, determine the number of videos

# tags_dict = {}
# for tags_str in df_all['tags'].dropna():
#     if tags_str and tags_str != '[none]':
#         # Split by pipe character
#         tags = [tag.strip().strip('"') for tag in str(tags_str).split('|')]
#         for tag in tags:
#             if tag in tags_dict:
#                 tags_dict[tag] += 1
#             else:
#                 tags_dict[tag] = 1


# the above solution is fine but it is slow so here is a more pandas way of doing it

tags_list = []
for tags_str in df_all['tags'].dropna():
    if tags_str and tags_str != '[none]':
        # Split by pipe character
        tags = [tag.strip().strip('"') for tag in str(tags_str).split('|')]
        tags_list.extend(tags)

tag_counts = pd.Series(tags_list).value_counts()
tag_counts

funny                    15039
comedy                   12351
2018                     11383
news                      6363
music                     5909
                         ...  
‡§∏‡§Ç‡§ï‡§≤‡•ç‡§™ ‡§∏‡•á ‡§∏‡§ø‡§¶‡•ç‡§ß‡§ø             1
Sankalp Se Siddhi            1
ICICI Fraud                  1
‡§Ü‡§à‡§∏‡•Ä‡§Ü‡§à‡§∏‡•Ä‡§Ü‡§à ‡§´‡§∞‡•ç‡§ú‡•Ä‡§µ‡§æ‡§°‡§º‡§æ        1
langford                     1
Name: count, Length: 848605, dtype: int64

In [102]:
# Find the tags with the largest number of videos
tag_counts.sort_values(ascending=False).head(1)

funny    15039
Name: count, dtype: int64

In [None]:
# For each (tag, country) pair, compute average ratio likes/dislikes