In [1]:
import pandas as pd
import json
import os

After cleaning the channel dataset, we now use its `isTrending` flag to split the channel videos and merge the trending ones into the trending dataset.

In [2]:
# Locations of the two pre-processed JSON dumps (absolute paths on the author's machine).
trending_file_path = 'C:/Users/TKN/Downloads/New-Youtube-Scraper-v3/data/yt_processed_data/processed_US_trending_data.json'
channel_videos_file_path = 'C:/Users/TKN/Downloads/New-Youtube-Scraper-v3/data/yt_processed_data/processed_channel_videos.json'


def _read_json(path):
    """Return the decoded contents of the UTF-8 JSON file at `path`."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


trending_videos = _read_json(trending_file_path)
channel_videos = _read_json(channel_videos_file_path)

# orient="index": each top-level JSON key becomes a row label and its value
# supplies that row's columns.
trending_df = pd.DataFrame.from_dict(trending_videos, orient="index")
channel_df = pd.DataFrame.from_dict(channel_videos, orient="index")

In [3]:
# Show column dtypes and non-null counts of the channel-video frame before it is split.
channel_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14744 entries, V0CniCFbxLs to 9Nx849WhPFc
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   fetchedDate      14744 non-null  object 
 1   publishedAt      14744 non-null  object 
 2   elapsedDays      14744 non-null  float64
 3   title            14744 non-null  object 
 4   description      14744 non-null  object 
 5   channelTitle     14744 non-null  object 
 6   tags             11772 non-null  object 
 7   category         14744 non-null  object 
 8   duration         14744 non-null  object 
 9   licensedContent  14744 non-null  bool   
 10  viewCount        14744 non-null  int64  
 11  avgDailyViews    14744 non-null  float64
 12  likeCount        14744 non-null  int64  
 13  commentCount     14744 non-null  int64  
 14  engagementRate   14744 non-null  float64
 15  topicCategories  14744 non-null  object 
 16  isTrending       14744 non-null  int64  
dtypes

Separate the `isTrending = 1` rows from the `isTrending = 0` rows in the channel video dataset. Rows with `isTrending = 1` will be added to the trending dataset as trending videos.

In [None]:
# Row count of the scraped trending list before any channel videos are merged in.
print(len(trending_df))

# Partition the channel videos on the isTrending flag.
trending_mask = channel_df['isTrending'] == 1
not_trending_mask = channel_df['isTrending'] == 0
channel_df_trending = channel_df.loc[trending_mask]
channel_df_not_trending = channel_df.loc[not_trending_mask]

# Report the size of each partition (label on one line, count on the next).
print("Trending videos:", len(channel_df_trending), sep="\n")
print("\nNot Trending videos:", len(channel_df_not_trending), sep="\n")

108
Trending videos:
613

Not Trending videos:
14131


In [5]:
# Stack the channel videos flagged as trending on top of the scraped trending
# list. ignore_index=True discards the existing row labels and renumbers the
# combined rows from 0.
# NOTE: trending_df is reassigned here, so re-running this cell appends
# channel_df_trending a second time.
frames_to_stack = [channel_df_trending, trending_df]
trending_df = pd.concat(frames_to_stack, ignore_index=True)

# The quoted block below is disabled code kept for reference (a bare string
# literal, never executed): it would serialise list-valued columns to JSON
# strings so that a full-row drop_duplicates does not hit unhashable lists.
'''
For dropping duplicate rows in trending_df, but there are no duplicate rows. We will keep this here for reference
for col in trending_df.columns:
	if trending_df[col].apply(lambda x: isinstance(x, list)).any():
		trending_df[col] = trending_df[col].apply(lambda x: json.dumps(x) if isinstance(x, list) else x)
trending_df.drop_duplicates(inplace=True)
'''
# Combined row count after stacking.
print(len(trending_df))

721


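Drop duplicate videos. The trending videos and the channel videos were obtained on different days, so the same video can appear twice with different statistics; for each (`publishedAt`, `title`) pair we keep the row with the highest `viewCount`.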

In [6]:
# Order rows from most to least viewed so that, within each duplicate group,
# the copy with the highest viewCount comes first ...
ranked_by_views = trending_df.sort_values(by='viewCount', ascending=False)
# ... then keep only that first copy of every (publishedAt, title) pair.
trending_df = ranked_by_views.drop_duplicates(
    subset=['publishedAt', 'title'],
    keep='first',
)
trending_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 651 entries, 125 to 640
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   fetchedDate      651 non-null    object 
 1   publishedAt      651 non-null    object 
 2   elapsedDays      651 non-null    float64
 3   title            651 non-null    object 
 4   description      651 non-null    object 
 5   channelTitle     651 non-null    object 
 6   tags             444 non-null    object 
 7   category         651 non-null    object 
 8   duration         651 non-null    object 
 9   licensedContent  651 non-null    bool   
 10  viewCount        651 non-null    int64  
 11  avgDailyViews    651 non-null    float64
 12  likeCount        651 non-null    int64  
 13  commentCount     651 non-null    int64  
 14  engagementRate   651 non-null    float64
 15  topicCategories  651 non-null    object 
 16  isTrending       651 non-null    int64  
dtypes: bool(1), float64

#### Assigning trending percentile (deprecated)

We now have 2 datasets

`trending_df` contains all videos that have `isTrending` = `1` after cleaning + assigning

`channel_df_not_trending` contains all videos with `isTrending` = 0 after cleaning + assigning

`trending_df` will be assigned `trendingPercentile` as follows:

`trendingPercentile`: if a video is on Trending, which percentile does it belong to? We order the dataset by `avgDailyViews`, then by `viewCount`, `likeCount` and `commentCount` (the major indicators of Trending videos), bin the dataset into 10 parts, and assign 10 different percentiles, from `0.05` (top 95%) to `0.95` (top 5%).

In [7]:
# trending_df = trending_df.sort_values(
#     by=['avgDailyViews', 'viewCount', 'likeCount', 'commentCount'], 
#     ascending=[False, False, False, False]
# ).reset_index(drop=True)

In [8]:
# trending_df['trendingPercentile'] = pd.cut(
#     trending_df.index,
#     10, 
#     labels=[0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95], 
#     include_lowest=True
# ).astype(float)

Save the significant stats for visualization tasks

In [9]:
# Columns kept for the visualization dataset; fetchedDate, description and
# tags are left out.
trending_viz_columns = [
    'publishedAt', 'elapsedDays', 'title', 'channelTitle',
    'category', 'topicCategories', 'duration', 'licensedContent',
    'viewCount', 'likeCount', 'commentCount', 'avgDailyViews',
    'engagementRate', 'isTrending',
]
trending_df_viz = trending_df.loc[:, trending_viz_columns]
trending_df_viz.info()
# CSV export is currently disabled:
# trending_df_viz.to_csv('C:/Users/TKN/Downloads/New-Youtube-Scraper-v3/data/visualization_csvs/trending_data_viz.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 651 entries, 125 to 640
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   publishedAt      651 non-null    object 
 1   elapsedDays      651 non-null    float64
 2   title            651 non-null    object 
 3   channelTitle     651 non-null    object 
 4   category         651 non-null    object 
 5   topicCategories  651 non-null    object 
 6   duration         651 non-null    object 
 7   licensedContent  651 non-null    bool   
 8   viewCount        651 non-null    int64  
 9   likeCount        651 non-null    int64  
 10  commentCount     651 non-null    int64  
 11  avgDailyViews    651 non-null    float64
 12  engagementRate   651 non-null    float64
 13  isTrending       651 non-null    int64  
dtypes: bool(1), float64(3), int64(4), object(6)
memory usage: 71.8+ KB


In [10]:
# Same column subset as the trending visualization frame. isTrending is kept
# even though every row in channel_df_not_trending has isTrending == 0.
channel_viz_columns = [
    'publishedAt', 'elapsedDays', 'title', 'channelTitle',
    'category', 'topicCategories', 'duration', 'licensedContent',
    'viewCount', 'likeCount', 'commentCount', 'avgDailyViews',
    'engagementRate', 'isTrending',
]
channel_df_viz = channel_df_not_trending.loc[:, channel_viz_columns]
channel_df_viz.info()
# CSV export is currently disabled:
# channel_df_viz.to_csv('C:/Users/TKN/Downloads/New-Youtube-Scraper-v3/data/visualization_csvs/channel_video_data_viz.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 14131 entries, V0CniCFbxLs to 9Nx849WhPFc
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   publishedAt      14131 non-null  object 
 1   elapsedDays      14131 non-null  float64
 2   title            14131 non-null  object 
 3   channelTitle     14131 non-null  object 
 4   category         14131 non-null  object 
 5   topicCategories  14131 non-null  object 
 6   duration         14131 non-null  object 
 7   licensedContent  14131 non-null  bool   
 8   viewCount        14131 non-null  int64  
 9   likeCount        14131 non-null  int64  
 10  commentCount     14131 non-null  int64  
 11  avgDailyViews    14131 non-null  float64
 12  engagementRate   14131 non-null  float64
 13  isTrending       14131 non-null  int64  
dtypes: bool(1), float64(3), int64(4), object(6)
memory usage: 1.5+ MB


In [11]:
# def save_dfs_to_json(dfs, filenames, output_dir):
#     for df, filename in zip(dfs, filenames):
#         json_data = df.to_json(orient="index", force_ascii=False, indent=4)
#         with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as f:
#             f.write(json_data)
#         print(f"DataFrame saved to {os.path.join(output_dir, filename)}")

# # Example usage:
# dfs = [trending_df, channel_df_not_trending]
# filenames = ["cleaned_US_trending_data.json", "cleaned_channel_videos.json"]
# output_dir = 'C:/Users/TKN/Downloads/New-Youtube-Scraper-v3/data/yt_processed_data_step_2'
# save_dfs_to_json(dfs, filenames, output_dir)