# Congress Youtube Links

In [57]:
import os
import sys
import json
import time
import datetime
import shutil
from collections import Counter

from tqdm import tqdm_notebook as tqdm
import pandas as pd

# this is to import youtube_api from the py directory
sys.path.append(os.path.abspath('../py')) 
import youtube_api as yt
import s3
from runtimestamp.runtimestamp import runtimestamp
runtimestamp()

Updated 2018-04-10 18:43:47.794373
By ly501
Using Python 3.6.5
On Linux-3.10.0-514.10.2.el7.x86_64-x86_64-with-centos-7.3.1611-Core


In [24]:
# Read the YouTube API key from the environment; `key` is None if YT_KEY is unset.
key = os.environ.get('YT_KEY')

In [3]:
s3_path = 's3://smapp-nyu/projects/ideology_estimation/congress/'

In [4]:
csv_files = s3.ls(s3_path + 'url_csvs/*.csv')

In [5]:
len(csv_files)

975

In [6]:
# Values of the `domain` column that are treated as YouTube links when filtering.
youtube = [
    'www.youtube.com',
    'www.youtu.be',
    'youtube.com',
    'youtu.be'
]

In [129]:
# Walk every URL CSV once: tally all link domains and keep only the YouTube rows.
domain_counter = Counter()
yt_frames = []  # one filtered frame per file; concatenated once after the loop
for f in tqdm(csv_files):
    df = s3.read_csv(f)
    df['filename'] = f  # remember which file each row came from

    # count domains
    domain_counter.update(df['domain'].tolist())

    # keep only the rows whose domain is one of the YouTube hosts
    yt_frames.append(df[df['domain'].isin(youtube)])

# Concatenate once instead of calling DataFrame.append per file: append re-copied the
# accumulated frame on every iteration and no longer exists in pandas >= 2.0.
# pd.concat raises on an empty list, so fall back to an empty frame in that case.
df_yt = pd.concat(yt_frames, ignore_index=True) if yt_frames else pd.DataFrame()





In [63]:
domain_counter.most_common(30)

[('twitter.com', 152389),
 ('www.youtube.com', 51179),
 ('www.facebook.com', 47524),
 ('1.usa.gov', 44611),
 ('www.washingtonpost.com', 19030),
 ('www.nytimes.com', 15341),
 ('thehill.com', 14885),
 ('tinyurl.com', 10587),
 ('www.politico.com', 8867),
 ('instagram.com', 7477),
 ('www.instagram.com', 6932),
 ('www.cnn.com', 6620),
 ('amp.twimg.com', 5963),
 ('www.foxnews.com', 5276),
 ('www.c-span.org', 5209),
 ('medium.com', 4906),
 ('ow.ly', 4708),
 ('go.usa.gov', 4578),
 ('www.huffingtonpost.com', 4440),
 ('www.usatoday.com', 4100),
 ('www.washingtonexaminer.com', 3748),
 ('cards.twitter.com', 3660),
 ('www.wsj.com', 3105),
 ('www.snappytv.com', 2925),
 ('abcnews.go.com', 2854),
 ('www.pscp.tv', 2772),
 ('www.latimes.com', 2704),
 ('on.wsj.com', 2676),
 ('www.vox.com', 2568),
 ('www.speaker.gov', 2483)]

In [131]:
len(df_yt)

51319

In [204]:
# Persist the per-domain link counts as JSON, then give the `smapp` group ownership of the file.
f_top_domains = '/scratch/olympus/projects/ideology_scaling/congress/top_domains.json'
with open(f_top_domains, 'w+') as f:
    json.dump(domain_counter, f)
shutil.chown(f_top_domains, group='smapp')

In [203]:
# Save every YouTube link row to CSV (without the index) and give the `smapp` group
# ownership of the file.
f_yt_raw = '/scratch/olympus/projects/ideology_scaling/congress/youtube_links_raw.csv'
df_yt.to_csv(f_yt_raw, index=False)
shutil.chown(f_yt_raw, group='smapp')

In [138]:
df_yt.head(2)

Unnamed: 0,url,raw_url,domain,unshortened,tries,tweet_id,created_at,filename
0,https://www.youtube.com/watch?v=Ne4tGG9-00A,https://www.youtube.com/watch?v=Ne4tGG9-00A,www.youtube.com,1.0,0.0,9.714021e+17,Wed Mar 07 15:08:06 +0000 2018,s3://smapp-nyu/projects/ideology_estimation/co...
1,https://www.youtube.com/watch?v=Ne4tGG9-00A,https://www.youtube.com/watch?v=Ne4tGG9-00A,www.youtube.com,1.0,0.0,9.710465e+17,Tue Mar 06 15:35:11 +0000 2018,s3://smapp-nyu/projects/ideology_estimation/co...


Let's extract the video IDs from the URLs and get a list of unique video IDs.

In [154]:
df_yt['yt_id'] = df_yt['url'].apply(yt.strip_video_id_from_url)

In [155]:
video_ids = df_yt['yt_id'].unique().tolist()

How many videos do we need to get metadata for?

In [156]:
len(video_ids)

39137

In [157]:
video_ids[0]

'Ne4tGG9-00A'

We can get the metadata in chunks of 50 videos using `yt.get_video_metadata()`.

In [65]:
def chunks(list_, n):
    """Yield consecutive slices of `list_`, each holding at most `n` items.

    The final slice is shorter when `len(list_)` is not a multiple of `n`.
    """
    yield from (list_[start:start + n] for start in range(0, len(list_), n))

In [158]:
# Fetch metadata for every unique video id, 50 ids per call.
meta = []
# chunks() is a generator, so tqdm cannot infer its length; pass the chunk count
# (ceiling division) explicitly so the progress bar can show completion.
n_chunks = -(-len(video_ids) // 50)
for chunk in tqdm(chunks(video_ids, 50), total=n_chunks):
    data = yt.get_video_metadata(chunk, key, verbose=1)
    meta.extend(data)
    time.sleep(.1)  # short pause between consecutive calls




Let's convert the list of dictionaries into a Pandas dataframe

In [160]:
df_yt_resolved = pd.DataFrame(meta)

In [162]:
df_yt_resolved.head(2)

Unnamed: 0,video_id,channel_title,channel_id,video_publish_date,video_title,video_description,video_category,video_view_count,video_comment_count,video_like_count,video_dislike_count,video_thumbnail,collection_date
0,Ne4tGG9-00A,Bruce Rauner,UCrzbsTXJDuIPwaDk7aTKlww,2018-03-05 15:56:15,Governor Rauner Joins Fox & Friends | Bruce Ra...,SUBSCRIBE for the latest news and updates from...,25,656,,9,2,https://i.ytimg.com/vi/Ne4tGG9-00A/hqdefault.jpg,2018-04-10 12:27:25.952564
1,bdkAd5yiPI0,Bruce Rauner,UCrzbsTXJDuIPwaDk7aTKlww,2018-02-26 22:13:18,Governor Rauner Discusses Janus v. AFSCME,SUBSCRIBE for the latest news and updates from...,25,1497,,27,18,https://i.ytimg.com/vi/bdkAd5yiPI0/hqdefault.jpg,2018-04-10 12:27:25.952605


We can save this to disk:

In [202]:
# Save the per-video metadata as a tab-separated file (without the index) and give the
# `smapp` group ownership of it.
f_yt_resolved = '/scratch/olympus/projects/ideology_scaling/congress/youtube_links_resolved.tsv'
df_yt_resolved.to_csv(f_yt_resolved, index=False, sep='\t')
shutil.chown(f_yt_resolved, group='smapp')

and work with it later...

In [7]:
# Reload the two files written earlier so the analysis below can start from disk.
f_yt_resolved = '/scratch/olympus/projects/ideology_scaling/congress/youtube_links_resolved.tsv'
df_yt_resolved = pd.read_csv(f_yt_resolved, sep='\t')

f_yt_raw = '/scratch/olympus/projects/ideology_scaling/congress/youtube_links_raw.csv'
df_yt = pd.read_csv(f_yt_raw)

# When the notebook runs top to bottom, the raw CSV is written before `yt_id` is
# derived, so the reloaded frame may not carry that column. Rebuild it here so the
# merge on `yt_id` does not fail with a KeyError.
if 'yt_id' not in df_yt.columns:
    df_yt['yt_id'] = df_yt['url'].apply(yt.strip_video_id_from_url)

Note that the resolved metadata has one row per unique video. Let's join it back onto every YouTube link that was shared.

In [8]:
# Left join: every shared link row is kept, and the metadata columns stay empty for
# links whose `yt_id` has no matching `video_id`.
df_link_yt = df_yt.merge(df_yt_resolved, left_on = 'yt_id', right_on='video_id', how='left')

What are the top shared channels for all of Congress?

In [25]:
df_link_yt['channel_title'].value_counts().head(30)

House Republicans                                 561
SenateDemocrats                                   437
Homeland Security Committee                       361
Bevin-Hampton Administration                      331
House Democrats                                   318
doggett                                           313
Senate Majority Leader Mitch McConnell            312
Senator Enzi                                      306
senatormikelee                                    294
barrassowyo                                       290
GohmertTX01                                       282
oversightandreform                                251
Senator Shelley Moore Capito                      227
Ways and Means Committee                          226
U.S. House Armed Services Committee               218
RepDianeBlack                                     218
RepMikeKelly                                      216
Nancy Pelosi                                      216
The Obama White House       

## What are the most shared videos?

In [9]:
df_link_yt['video_title'].value_counts().head(10)

Dems Join Refugees & Immigrants Outside SCOTUS To Demand POTUS Trump Reverse Exec Orders             74
Jimmy Kimmel Reveals Details of His Son’s Birth & Heart Disease                                      74
#NotOneCent for Iran Until it Compensates its Victims                                                73
Comeback                                                                                             46
Congresswoman Johnson gives moving House floor speech on Dallas Shootings                            31
House Democrats Hold Hearing: 'The Flint Water Crisis: Lessons for Protecting America's Children'    30
The US Constitution and Donald Trump's Conflicts of Interest                                         30
#EndGunViolence                                                                                      28
Tesla's Solar Facility is Part of Hawaii's Goal For Energy Independence (HBO)                        26
Make It In America: What's Next?                              

## What are the categories of videos being sent?

In [124]:
# Load the lookup that is passed to `.replace()` on the `video_category` column in the
# next cell (presumably category id -> category name; verify against the JSON file).
with open('../data/categories.json', 'r') as f:
    video_categories = json.load(f)

In [126]:
df_link_yt['video_category'].replace(video_categories).value_counts()

News & Politics          30161
People & Blogs            6135
Film & Animation          1083
Nonprofits & Activism     1012
Entertainment              867
Education                  678
Science & Technology       382
Comedy                     377
Music                      228
Sports                     128
Travel & Events             95
Autos & Vehicles            84
Howto & Style               47
Pets & Animals              18
Gaming                      10
Shows                        2
Trailers                     1
Name: video_category, dtype: int64

## Who is sharing what?

In [10]:
def get_user_id_from_file(f):
    """Return the id embedded in a URL-CSV path.

    Takes whatever follows the last 'url_csvs/' in `f` (or all of `f` when that
    marker is absent) and removes every occurrence of '_urls.csv' from it.
    """
    _, _, tail = f.rpartition('url_csvs/')
    return tail.replace('_urls.csv', '')
df_link_yt['twitter_id'] = df_link_yt['filename'].apply(get_user_id_from_file)

In [26]:
# value_counts() sorts by frequency, so index[1] is the second most shared video id
# (index[0] would be the most shared one).
v_id = df_link_yt['video_id'].value_counts().index[1]
# Distinct `twitter_id` values among the rows that link to that video.
df_link_yt[df_link_yt['video_id'] == v_id]['twitter_id'].unique()

array(['1055907624', '1058520120', '1058717720', '1060984272',
       '1061029050', '1068499286', '1077121945', '1089859058',
       '109287731', '111635527', '111635980', '1135486501', '1155335864',
       '1222257180', '133028836', '140519774', '1410590874', '14676022',
       '14984637', '153944899', '15764644', '15808765', '168502762',
       '171598736', '17494010', '17907578', '18137749', '18674498',
       '18909919', '22523087', '232268199', '242836537', '242892689',
       '2461810448', '249348006', '25781141', '278005891', '28599820',
       '2916086925', '29442313', '2968007206', '2968451607', '3018670151',
       '3023272478', '30354991', '3044993235', '325162736', '38254095',
       '415117361', '4304448314', '432676344', '43963249', '47747074',
       '50452197', '72198806', '80612021', '813286', '814179031956488192',
       '816181091673448448', '817050219007328258', '818713465653051392',
       '87510313', '90639372', '942156122', '946549322', '995193054'],
      dtype=

## Analyzing the Most Shared Channel

In [30]:
# Rank channel ids by how many shared-link rows reference them and keep the top 50.
top_50_channels = df_link_yt['channel_id'].value_counts().head(50).index
# NOTE(review): index 1 is the second-ranked channel, not the first — confirm this is
# intended given the "Most Shared Channel" heading.
channel_id = top_50_channels[1]

In [40]:
yt.get_channel_metadata(channel_id, key)

OrderedDict([('id', 'UCpgILFSGxY-9mQR79fLSt2A'),
             ('title', 'SenateDemocrats'),
             ('publish_date', datetime.datetime(2008, 8, 11, 19, 14, 4)),
             ('keywords',
              'Senate Senator Democrat Democrats Democratic Leadership Caucus Majority Leader Harry Reid Durbin Schumer Murray Stabenow Congress government legislative legislation bill federal'),
             ('description',
              'Video updates from Senate Democratic Leader Chuck Schumer and Democrats in the United States Senate.'),
             ('view_count', '2186695'),
             ('video_count', '1193'),
             ('subscription_count', '4275'),
             ('playlist_id_likes', None),
             ('playlist_id_uploads', 'UUpgILFSGxY-9mQR79fLSt2A'),
             ('topic_ids', '["/m/05qt0", "/m/05qt0", "/m/098wr"]'),
             ('collection_date',
              datetime.datetime(2018, 4, 10, 18, 41, 0, 866410))])

## Building Relationships
1. Channels that the Channel is subscribed to
2. Channels that the Channel features
3. Channels that the Channel links to in Playlists
4. Channels that the Channels links to in Video Descriptions

## 1

In [42]:
yt.get_subscriptions(channel_id, key)



[]

## 2

In [43]:
yt.get_featured_channels(channel_id, key)

{'UCpgILFSGxY-9mQR79fLSt2A': ['UC-ABttxh8uQv_10qmwGaidw',
  'UCb8jq3TvQ3AzKsexhWfFLoA',
  'UCTH9zV8Imw09J5bOoTR18_A',
  'UCD_DaKNac0Ta-2PeHuoQ1uA',
  'UC6FlymqNS1VettnVZa7goPA',
  'UCYRWRvUxtjaHnFMCbdd94tg',
  'UCbcEa40PIFpLpdDe06n3F3Q']}

## 3

In [51]:
playlists = yt.get_playlists(channel_id, key)
df_playlists = pd.DataFrame(playlists)
df_playlists.head(2)

Unnamed: 0,playlist_name,playlist_id,playlist_publish_date,playlist_n_videos,channel_id,channel_name
0,#SaveTheInternet,PLrXlgSrg2WETFdmh1gwGI_YiSaq6_i2_u,2018-02-26 16:53:17,4,UCpgILFSGxY-9mQR79fLSt2A,SenateDemocrats
1,#AmericaSpeaksOut,PLrXlgSrg2WERe9DxAloPQtcDPEyp_6qXC,2017-06-16 23:52:16,31,UCpgILFSGxY-9mQR79fLSt2A,SenateDemocrats


In [48]:
# Collect the videos of every playlist returned for the channel.
playlist_frames = []  # one frame per playlist; concatenated once after the loop
for playlist_id in tqdm(df_playlists['playlist_id'].tolist()):
    # Separate names for the id and the API result (the loop variable used to be
    # overwritten with the result).
    playlist_videos = yt.get_video_urls_from_playlist_id(playlist_id, key, verbose=0)
    playlist_frames.append(pd.DataFrame(playlist_videos))

# Concatenate once instead of calling DataFrame.append per playlist (quadratic copying,
# and removed in pandas >= 2.0). ignore_index=False keeps each playlist's own row
# numbering, as before. pd.concat raises on an empty list, hence the fallback.
df_playlist_videos = (
    pd.concat(playlist_frames, ignore_index=False) if playlist_frames else pd.DataFrame()
)




In [64]:
df_playlist_videos.head(2)

Unnamed: 0,publish_date,video_id,channel_id
0,2018-02-26 16:55:57,9l_GgulneqM,UCpgILFSGxY-9mQR79fLSt2A
1,2018-02-26 23:38:54,ltzy5vRmN8Q,UCpgILFSGxY-9mQR79fLSt2A


In [50]:
df_playlist_videos['channel_id'].unique()

array(['UCpgILFSGxY-9mQR79fLSt2A'], dtype=object)

## 4

In [57]:
import re
import itertools
from urllib.parse import urlparse

In [58]:
def get_link(tweet):
    '''
    Yield one record per http(s) URL found in a video's description.

    `tweet` is one row of video metadata (a dict or pandas Series exposing
    `channel_id`, `video_id`, `video_publish_date` and `video_description`).
    The parameter name is kept for backward compatibility.

    Each yielded dict holds those four fields (the description coerced to
    `str`) plus:
        link.url_long : the URL exactly as matched in the description
        link.domain   : the lower-cased network location of that URL

    Nothing is yielded when the description contains no http(s) URL. A missing
    description (NaN) is coerced to the string 'nan', which contains none.
    '''
    row = {
        'channel_id': tweet['channel_id'],
        'video_id': tweet['video_id'],
        'video_publish_date': tweet['video_publish_date'],
        'video_description': str(tweet['video_description'])
    }

    # Raw string so that `\s` reaches the regex engine instead of being parsed as an
    # invalid Python string escape (a warning on Python 3.6+). The pattern is unchanged.
    # Every match starts with "http", so no emptiness checks are needed.
    for url in re.findall(r"(?P<url>https?://[^\s]+)", row['video_description']):
        r = row.copy()
        r['link.url_long'] = url
        r['link.domain'] = urlparse(url).netloc.lower()
        yield r

In [59]:
descriptions = df_link_yt['video_description'].tolist()

In [60]:
# Build one table row per URL found in any shared video's description.
link_records = []
for row_idx, video_row in tqdm(df_link_yt.iterrows()):
    # get_link is a generator; extend() drains it into the flat record list.
    link_records.extend(get_link(video_row))
df_links = pd.DataFrame(link_records)




In [61]:
len(df_links)

14380

## The Most Shared Domains among Youtube Descriptions

In [70]:
df_links['link.domain'].value_counts().head(40)

bit.ly                      1898
twitter.com                 1167
www.facebook.com            1076
www.youtube.com              841
rub.io                       429
facebook.com                 371
youtu.be                     363
hrc.io                       318
instagram.com                309
www.instagram.com            252
oversight.house.gov          232
www.twitter.com              223
goo.gl                       213
judiciary.house.gov          184
smarturl.it                  167
www.manchin.senate.gov       150
plus.google.com              130
ofa.bo                       124
www.speaker.gov              122
on.msnbc.com                  98
1.usa.gov                     97
mast.house.gov                96
www.wolfforpa.com             81
nbcnews.to                    78
youtube.com                   72
www.reaganfoundation.org      70
kuster.house.gov              68
walberg.house.gov             60
coons.senate.gov              57
www.gop.com                   55
science.ho

## TODO
How do we go from raw YouTube links (containing subscribe pages, users, channels, custom URLs) to links?

In [18]:
# df_links[df_links['link.domain'].isin(youtube)]['link.url_long'].unique()

In [17]:
yt.get_channel_id_from_custom_url('http://www.youtube.com/user/americasgottalent')

'UCT2X19JJaJGUN7mrYuImANQ'

In [None]:
{
    'video_url' : 'http://www.youtube.com/watch?v=A0nASbOg3B8',
    'user' : 'http://www.youtube.com/user/GoldenGlobes',
    'custom' : 'http://www.youtube.com/TYTPolitics',
    'embed_video_url' : 'https://www.youtube.com/embed/N8tkueAEVTs',
    'subscriber_hub' : 'http://www.youtube.com/subscription_center?add_user=telegraphtv',
    'custom_channel_also' : 'https://www.youtube.com/c/funnyordie?sub_confirmation=1',
    'playlist' : 'https://www.youtube.com/playlist?list=PLnwt1fUa-EVgihKJ_26XtMdmGDOmABAAa'
    
}