# Combining Data
After collecting about a million comments, it's time to compile them. We'll remove duplicates based on Comment ID.

In [2]:
import pandas as pd
from utils import parse_yt_datetime, parse_and_concat_csvs

In [3]:
# Column names for the comment CSVs. Adjust this list if you collected a
# different set of features, since your files may not share these headers.
header = [
    'Comment',
    'Comment ID',
    'Replies Count',
    'Like Count',
    'Updated At',
]

In [4]:
# The input files differ only in their numeric suffix (8 through 14),
# so build the list of names from that range.
csvs = ['aya_blackpink_time_{}.csv'.format(part) for part in range(8, 15)]


In [None]:
# Hand the file list and the column names to the project helper from utils.
# NOTE(review): presumably this reads each CSV with `header` as its column
# names and concatenates them into one DataFrame — confirm in utils.py.
combined = parse_and_concat_csvs(csvs, header)

In [85]:
# (rows, columns) of the combined data before any duplicates are removed.
combined.shape

(1143597, 5)

# Sort by Comment ID
We sort (and later de-duplicate) on Comment ID rather than on the text of the comments, since different commenters may write exactly the same thing.

In [123]:
# Order the rows by Comment ID so that rows sharing an ID sit next to each other.
# The sorted frame is rebound to the same name instead of using inplace=True:
# in-place mutation across notebook cells gives no speed benefit and makes it
# harder to reason about which cells have already modified the frame.
combined = combined.sort_values(by="Comment ID")
  


In [124]:
# Surplus rows = total number of Comment IDs minus the number of distinct ones.
comment_ids = combined['Comment ID']
comment_ids.size - comment_ids.drop_duplicates().size

652844

652,844 is the number of duplicate rows (rows whose Comment ID already appears elsewhere) across our CSV files.

In [125]:
# Cross-check the count with a boolean mask: True marks every row whose
# Comment ID has already appeared in an earlier row.
repeat_mask = combined.duplicated(subset=['Comment ID'], keep='first')
repeat_mask.sum()

652844

In [126]:
# Keep a single row per Comment ID (the first one in the current row order)
# and return the result as a new frame; `combined` itself is left untouched.
removed_duplicates = combined.drop_duplicates(
    subset=["Comment ID"],
    keep="first",
)

In [127]:
# Arrange the de-duplicated comments from earliest to latest "Updated At".
# NOTE(review): the displayed values look like ISO-8601 UTC strings, which
# sort chronologically even as plain text — confirm the column dtype.
sort_by_update_time = removed_duplicates.sort_values(
    by="Updated At",
    ascending=True,
)

In [128]:
# Preview the frame with a fresh 0..n-1 index; the previous index labels are
# kept as an "index" column. The result is only displayed, not assigned, so
# sort_by_update_time itself keeps its original index.
sort_by_update_time.reset_index(drop=False)

Unnamed: 0,index,Comment,Comment ID,Replies Count,Like Count,Updated At
0,40295,😂😂😂😂😂,UgyJUozOOb_ng7rmIMp4AaABAg,0,0,2020-06-27T08:20:15Z
1,40297,ستريم ستريم ستريم نبي نوصل ٨٢ قبل مايخلص اليوم...,Ugw8PM1u0bsWOMzzGCZ4AaABAg,1,0,2020-06-27T08:20:15Z
2,40298,81M congratulations blink blackpink,Ugymm_tQHxSp7U2wH8t4AaABAg,0,1,2020-06-27T08:20:15Z
3,40299,#1 in Poland,UgwymZEm7y2vyppPyCh4AaABAg,0,1,2020-06-27T08:20:15Z
4,40296,QUEEN BLACKPINK,UgzotV8vjKKQonnXCUN4AaABAg,0,1,2020-06-27T08:20:15Z
5,40291,After 1 year:/🤲🏻,UgzNygGLylMKf9W5TVZ4AaABAg,0,0,2020-06-27T08:20:16Z
6,40293,New record.,Ugx30JVR3BL6Mhx_j5F4AaABAg,0,1,2020-06-27T08:20:16Z
7,40294,Bp broke the record for most viewed mv in 24/h...,UgyEbIl9NuXnSMx21NB4AaABAg,1,7,2020-06-27T08:20:16Z
8,40292,ดูอีกกี่รอบ ก็ใช้คำว่าสวยได้เปลืองมาก 😄,Ugxoz3mXLw3V9Xkkd6V4AaABAg,0,2,2020-06-27T08:20:16Z
9,40288,100m view in 24hour please😍😍,UgwkB-_LwsAnVxMYJTJ4AaABAg,0,0,2020-06-27T08:20:17Z


In [129]:
# (rows, columns) after removing duplicate Comment IDs and sorting by update time.
sort_by_update_time.shape

(490753, 5)

In [130]:
# Write the cleaned, time-ordered comments to disk. index=False leaves the
# row index out of the file, so only the data columns are saved.
sort_by_update_time.to_csv(
    path_or_buf='comments_by_time_june27_july08.csv',
    index=False,
)

In [132]:
# Read the saved file back in as a round-trip check. Only '\n' is treated as
# the end of a row when parsing.
testing = pd.read_csv(
    filepath_or_buffer='comments_by_time_june27_july08.csv',
    lineterminator='\n',
)

In [136]:
# Shape of the frame read back from disk; it should match the shape of the
# frame that was written out.
testing.shape

(490753, 5)

In [135]:
# Show the first five rows of the re-loaded data as a quick visual check.
testing.head(n=5)

Unnamed: 0,Comment,Comment ID,Replies Count,Like Count,Updated At
0,😂😂😂😂😂,UgyJUozOOb_ng7rmIMp4AaABAg,0,0,2020-06-27T08:20:15Z
1,ستريم ستريم ستريم نبي نوصل ٨٢ قبل مايخلص اليوم...,Ugw8PM1u0bsWOMzzGCZ4AaABAg,1,0,2020-06-27T08:20:15Z
2,81M congratulations blink blackpink,Ugymm_tQHxSp7U2wH8t4AaABAg,0,1,2020-06-27T08:20:15Z
3,#1 in Poland,UgwymZEm7y2vyppPyCh4AaABAg,0,1,2020-06-27T08:20:15Z
4,QUEEN BLACKPINK,UgzotV8vjKKQonnXCUN4AaABAg,0,1,2020-06-27T08:20:15Z
