In [1]:
# STRUCTURED DATA PROCESSING

import csv
import numpy as np
import pandas as pd

infile = 'ted_talks_en.csv'

# create new empty list
IDlist = []

with open(infile, 'r') as csvfile:
    # the csv file reader returns a list of the csv items on each line
    IDreader = csv.reader(csvfile, dialect='excel', delimiter=',')
    
    # from each line, a list of row items, put each element in a dictionary with a key representing the data
    for line in IDreader:
      #skip lines without data, specific for each file to catch non-data lines
      if line[0] == '' or line[0].startswith('talk_id'):
          continue
      else:
          try:
            # create a dictionary for each ID
            ID = {}
            # add each piece of data under a key representing that data
            ID['Talk ID'] = line[0]
            ID['Title'] = line[1]
            ID['Speaker 1'] = line[2]
            ID['All Speakers'] = line[3]
            ID['Occupations'] = line[4]
            ID['About Speakers'] = line[5]
            ID['Views'] = line[6]
            ID['Date Recorded'] = line[7]
            ID['Date Published'] = line[8]
            ID['Event'] = line[9]
            ID['Lang Native'] = line[10]
            ID['Lang Available'] = line[11]                    
            ID['Comments'] = line[12]                    
            ID['Duration'] = line[13]                    
            ID['Topics'] = line[14]                    
            ID['Related Talks'] = line[15]                    
            ID['URL'] = line[16]                    
            ID['Description'] = line[17]                    
            ID['Transcript'] = line[18]

            # add this ID to the list
            IDlist.append(ID)
          # catch errors in file formatting (number items per line)  and print an error message
          except IndexError:
            print ('Error: ', line)
csvfile.close()

df = pd.DataFrame(IDlist, columns = ['Talk ID', 'Title', 'Speaker 1', 'Occupations', 'About Speakers', 'Views', 
                                     'Date Recorded', 'Date Published', 'Comments', 'Duration', 'Topics', 
                                     'Related Talks', 'Description', 'Transcript'])

# Convert columns to appropriate data types

# strip blank spaces from 'Talk ID' column
df['Talk ID'] = df['Talk ID'].str.strip()
# add leading zeros to 'Talk ID' column
df['Talk ID'] = df['Talk ID'].str.rjust(4, "0")
# set df index to 'Talk ID' field
df = df.set_index('Talk ID').sort_values(by = 'Talk ID')

df['Views'] = df['Views'].astype(int)

# convert dates to datetime format using pd.to_datetime()
df['Date Recorded'] = pd.to_datetime(df['Date Recorded'], format = '%Y-%m-%d')
df['Date Published'] = pd.to_datetime(df['Date Published'], format = '%Y-%m-%d')

# define replace function
def item_replace(xstr):
   return xstr.replace('','0') # in a string, replace any occurrence of ‘’ with '0'

# replace blank with 0 in 'Comments' column
df['Comments'] = df['Comments'].map(item_replace).astype(int)

# convert seconds to hours:minutes:seconds using pd.to_datetime()
df['Duration'] = df['Duration'].astype(int)
df['Duration'] = pd.to_datetime(df['Duration'], unit='s').dt.strftime("%H:%M:%S")
df

Unnamed: 0_level_0,Title,Speaker 1,Occupations,About Speakers,Views,Date Recorded,Date Published,Comments,Duration,Topics,Related Talks,Description,Transcript
Talk ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001,Averting the climate crisis,Al Gore,{0: ['climate advocate']},{0: 'Nobel Laureate Al Gore focused the world’...,3523392,2006-02-25,2006-06-27,207020,00:16:17,"['alternative energy', 'cars', 'climate change...","{243: 'New thinking on the climate crisis', 54...",With the same humor and humanity he exuded in ...,"Thank you so much, Chris. And it's truly a gre..."
0002,Simple designs to save a life,Amy Smith,"{0: ['inventor', 'engineer']}","{0: 'Amy Smith designs cheap, practical fixes ...",1724438,2006-02-24,2006-08-15,100010,00:15:06,"['MacArthur grant', 'alternative energy', 'des...","{1561: 'Energy from floating algae pods', 1072...",Fumes from indoor cooking fires kill more than...,"In terms of invention, I'd like to tell you th..."
0003,How to rebuild a broken state,Ashraf Ghani,{0: ['president-elect of afghanistan']},"{0: 'Ashraf Ghani, Afghanistan’s new president...",981920,2005-07-12,2006-10-18,7050,00:18:45,"['business', 'corruption', 'culture', 'economi...","{127: 'Want to help Africa? Do business here',...",Ashraf Ghani's passionate and powerful 10-minu...,"A public, Dewey long ago observed, is constitu..."
0004,The real future of space exploration,Burt Rutan,{0: ['aircraft engineer']},"{0: ""In 2004, legendary spacecraft designer Bu...",2427994,2006-02-24,2006-10-25,109060,00:19:37,"['NASA', 'aircraft', 'business', 'design', 'en...","{141: ""Inside the world's deepest caves"", 264:...","In this passionate talk, legendary spacecraft ...","I want to start off by saying, Houston, we hav..."
0005,Great cars are great art,Chris Bangle,{0: ['car designer']},{0: 'Car design is a ubiquitous but often over...,978483,2002-02-02,2007-04-05,8010,00:20:04,"['business', 'cars', 'design', 'industrial des...","{4: 'The real future of space exploration', 26...",American designer Chris Bangle explains his ph...,"What I want to talk about is, as background, i..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9985,"""My Fine Reward""",Tito Deler,"{0: ['blues musician', 'graphic designer']}",{0: 'Tito Deler is a soulful musician with a s...,221834,2017-11-14,2018-02-09,40,00:02:53,"['music', 'live music', 'performance', 'vocals...","{2538: '""Redemption Song""', 2795: 'Songs that ...",Blues musician Tito Deler combines the sounds ...,(Music) Sun shining up above down here it's 10...
9986,This company pays kids to do their math homework,Mohamad Jebara,{0: ['education entrepreneur']},{0: 'Mohamad Jebara is the founder and CEO of ...,1128352,2017-12-11,2018-02-08,5040,00:13:41,"['education', 'math', 'statistics', 'teaching']","{2718: '3 ways to spot a bad statistic', 2620:...",Mohamad Jebara loves mathematics -- but he's c...,"For as long as I remember, I've loved mathemat..."
9987,How to fix a broken heart,Guy Winch,"{0: ['psychologist', 'author']}",{0: 'Guy Winch asks us to take our emotional h...,10235040,2017-04-24,2018-02-05,108000,00:12:25,"['addiction', 'compassion', 'humanity', 'depre...",{2193: 'Why we all need to practice emotional ...,"At some point in our lives, almost every one o...","At some point in our lives, almost every one o..."
9988,The surprising ingredient that makes businesse...,Marco Alverà,"{0: ['businessman', 'fairness crusader']}",{0: 'Marco Alverà is an Italian/American busin...,2855104,2017-10-04,2018-02-13,2090,00:14:07,"['business', 'decision-making', 'motivation', ...","{8866: 'The business benefits of doing good', ...",What is it about unfairness? Whether it's not ...,"For me, it was not being invited to a friend's..."


In [2]:
# Are there any speakers than gave more than one talk?
speakers_max = df[['Title']].groupby(df['Speaker 1']).count()
speakers_max.sort_values(by = 'Title', ascending = False)

Unnamed: 0_level_0,Title
Speaker 1,Unnamed: 1_level_1
Alex Gendler,34
Iseult Gillespie,19
Emma Bryce,12
Daniel Finkel,11
Hans Rosling,10
...,...
Giada Gerboni,1
Gian Giudice,1
Gil Weinberg,1
Giles Duley,1


In [3]:
# what is the most common recording date? (month, year)
recording_max = df[['Title', 'Date Recorded']].groupby(['Date Recorded']).count()
recording_max.sort_values(by = 'Title', ascending = False)

Unnamed: 0_level_0,Title
Date Recorded,Unnamed: 1_level_1
2017-04-24,97
2018-04-10,95
2019-04-15,90
2007-03-03,54
2017-08-27,53
...,...
2014-03-16,1
2014-03-24,1
2014-03-25,1
2014-03-27,1


In [4]:
# is there a pattern with publishing date?
publishing_max = df[['Title', 'Date Published']].groupby(['Date Published']).count()
publishing_max.sort_values(by = 'Title', ascending = False)

Unnamed: 0_level_0,Title
Date Published,Unnamed: 1_level_1
2017-09-07,27
2019-02-15,24
2019-02-12,23
2019-03-15,22
2019-04-01,22
...,...
2013-04-25,1
2013-04-26,1
2013-04-29,1
2013-04-30,1


In [5]:
# which talk has the most number of views?
df[['Views']].mean() # average number of views is 2.148006e+06 -> 2,148,006
max_views = df.sort_values(by = 'Views', ascending = False)[:1]
max_views

Unnamed: 0_level_0,Title,Speaker 1,Occupations,About Speakers,Views,Date Recorded,Date Published,Comments,Duration,Topics,Related Talks,Description,Transcript
Talk ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
66,Do schools kill creativity?,Sir Ken Robinson,"{0: ['author', 'educator']}","{0: ""Creativity expert Sir Ken Robinson challe...",65051954,2006-02-25,2006-06-27,40903010,00:19:24,"['children', 'creativity', 'culture', 'dance',...","{865: 'Bring on the learning revolution!', 173...",Sir Ken Robinson makes an entertaining and pro...,Good morning. How are you? (Audience) Good. It...


In [6]:
# which talk has the most number of comments? least comments? avg comments?
df[['Comments']].mean() # average number of comments is 239073.907615
max_comments = df.sort_values(by = 'Comments', ascending = False)[:1]
max_comments

Unnamed: 0_level_0,Title,Speaker 1,Occupations,About Speakers,Views,Date Recorded,Date Published,Comments,Duration,Topics,Related Talks,Description,Transcript
Talk ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
113,Militant atheism,Richard Dawkins,{0: ['evolutionary biologist']},{0: 'Oxford professor Richard Dawkins has help...,5788514,2002-02-02,2007-04-16,60404090,00:29:10,"['God', 'atheism', 'culture', 'religion', 'sci...","{86: 'Letting go of God', 94: ""Let's teach rel...",Richard Dawkins urges all atheists to openly s...,"That splendid music, the coming-in music, ""The..."


In [7]:
# which talk has the longest duration?
df.sort_values(by = 'Duration', ascending = False)[:1]

Unnamed: 0_level_0,Title,Speaker 1,Occupations,About Speakers,Views,Date Recorded,Date Published,Comments,Duration,Topics,Related Talks,Description,Transcript
Talk ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
54715,How we can turn the tide on climate,Chris Anderson,"{0: ['head of ted'], 1: ['climate advocate']}",{0: 'After a long career in journalism and pub...,1493370,2019-12-04,2019-12-12,4040,01:05:22,"['climate change', 'environment', 'activism', ...",{32560: 'The disarming case to act right now o...,"Witness the unveiling of Countdown, a major gl...",[Citizens of the world] [We face a global cris...


In [8]:
# transcript analysis; word cloud, sentiment analysis
ted_transcript = df[['Title', 'Speaker 1', 'Transcript']]
ted_transcript

Unnamed: 0_level_0,Title,Speaker 1,Transcript
Talk ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0001,Averting the climate crisis,Al Gore,"Thank you so much, Chris. And it's truly a gre..."
0002,Simple designs to save a life,Amy Smith,"In terms of invention, I'd like to tell you th..."
0003,How to rebuild a broken state,Ashraf Ghani,"A public, Dewey long ago observed, is constitu..."
0004,The real future of space exploration,Burt Rutan,"I want to start off by saying, Houston, we hav..."
0005,Great cars are great art,Chris Bangle,"What I want to talk about is, as background, i..."
...,...,...,...
9985,"""My Fine Reward""",Tito Deler,(Music) Sun shining up above down here it's 10...
9986,This company pays kids to do their math homework,Mohamad Jebara,"For as long as I remember, I've loved mathemat..."
9987,How to fix a broken heart,Guy Winch,"At some point in our lives, almost every one o..."
9988,The surprising ingredient that makes businesse...,Marco Alverà,"For me, it was not being invited to a friend's..."


In [9]:
# SEMI-STRUCTURED DATA PROCESSING

# This program reads in JSON formatted data from a MongoDB collection.
# This is in a format that is structured with lines of data representing one Tweet for Twitter.
# This program contains the data as lists of JSON structures, which are just Python dictionaries and lists.

# START MONGODB
# brew services start mongodb-community@4.4

# SCRAPE TWEETS 2020 0829 
# !python run_twitter_simple_search_save.py "#TED" 4000 ted tedtweets
# !python run_twitter_simple_search_save.py "#tedtalk" 4000 ted tedtweets
# !python run_twitter_simple_search_save.py "#TEDx" 4000 ted tedtweets

# reran and collected more tweets in anticipation of only analyzing tweet text with 'lang'== 'en'

In [10]:
import pymongo
client = pymongo.MongoClient('localhost', 27017)

db = client.ted

db.list_collection_names()

['tedtweets']

In [11]:
collection = db.tedtweets

tweets = collection.find()

tweetlist = [tweet for tweet in tweets]
len(tweetlist) # as of 08/29/2020

4601

In [12]:
 # Here is a little print function that will help.

def print_tweet_data(tweets):
    for tweet in tweets:
        print('\nDate:', tweet['created_at'])
        print('From:', tweet['user']['name'])
        print('Message:', tweet['text'])
        if not tweet['place'] is None:
            print('Place:', tweet['place']['full_name'])

print_tweet_data(tweetlist[:5])


Date: Sat Aug 29 19:07:13 +0000 2020
From: ✨ Petite Quarterback ✨
Message: My college students are making their own Bitmoji classrooms and I absolutely love it! #BMCC #TED
Place: Manhattan, NY

Date: Sat Aug 29 18:48:48 +0000 2020
From: ~Bella~
Message: PARTY ON DUDES.                        #BillAndTed #bill #ted #billandtedfancam #fancam #fancams #billandtedfancams https://t.co/5MyCwhhD0D

Date: Sat Aug 29 18:35:04 +0000 2020
From: polish-hive
Message: Kolonizacja Marsa a praca w sektorze kosmicznym - moje wystąpienie na TEDx 

#hive #posh #Poland #Polska #space… https://t.co/e1KbJDCfGK

Date: Sat Aug 29 17:57:10 +0000 2020
From: TED DiyarbakırKoleji
Message: 30 Ağustos Zafer Bayramı kutlu olsun.
#TED #Diyarbakır #30ağustos https://t.co/hx2yNO9sUc

Date: Sat Aug 29 17:47:50 +0000 2020
From: Présentation Pertinente
Message: Concevoir un support de présentation peut-être comparé à une ossature, comme évoqué dans le guide officiel #TED - P… https://t.co/tnkq63hVG8


In [13]:
# My program contains pandas dataframes for processed data.

# This program does some processing to collect data from some of the fields the questions described below, 
# and write a file with the data suitable for answering each question.

import numpy as np
import pandas as pd

df = pd.DataFrame(tweetlist)

# Test for null values and remove optional fields
df.isna().sum() # sum of NaN

_id                             0
created_at                      0
id                              0
id_str                          0
text                            0
truncated                       0
entities                        0
metadata                        0
source                          0
in_reply_to_status_id        4371
in_reply_to_status_id_str    4371
in_reply_to_user_id          4338
in_reply_to_user_id_str      4338
in_reply_to_screen_name      4338
user                            0
geo                          4593
coordinates                  4593
place                        4514
contributors                 4601
is_quote_status                 0
retweet_count                   0
favorite_count                  0
favorited                       0
retweeted                       0
lang                            0
extended_entities            4380
possibly_sensitive           1788
quoted_status_id             4101
quoted_status_id_str         4101
quoted_status 

In [14]:
df.dropna()

# select columns
df = df[['_id',
         'created_at',
         'text',
         'entities', ### hashtag from entities
         'user', ### name from user
         'retweet_count',
         'favorite_count',
         'lang']]

# set index to _id column
# df = df.set_index('_id')

In [15]:
# convert df[['created_at']] to string; 
df[['created_at']] = df[['created_at']].astype(str)

# convert df[['created_at']] to datetime using pd.to_datetime()
df[['created_at']] = pd.to_datetime(df['created_at'], format = '%a %b %d %H:%M:%S +0000 %Y')

In [16]:
# convert counts from str to int type
df[['retweet_count']] = df[['retweet_count']].astype(int)
df[['favorite_count']] = df[['favorite_count']].astype(int)

In [17]:
# bin Tweets by day
created_date = df['created_at'].dt.date
df['created_date'] = created_date

# bin Tweets by hour
created_hour = df['created_at'].dt.hour
df['created_hour'] = created_hour

In [18]:
# report on the number of Tweets per day
df[['created_date', 'created_at']].groupby(['created_date']).count()

Unnamed: 0_level_0,created_at
created_date,Unnamed: 1_level_1
2020-08-21,426
2020-08-22,784
2020-08-23,588
2020-08-24,492
2020-08-25,507
2020-08-26,604
2020-08-27,425
2020-08-28,442
2020-08-29,333


In [19]:
# What is being Tweeted on 08-22-2020?
df['created_date'] = df['created_date'].astype(str)
max_date = df[(df['created_date'] > '2020-08-21') & (df['created_date'] < '2020-08-23')]
max_date[['created_date', 'text']][:50]

# Do Schools Kill Creativity? - Sir Ken Robinson (1950-03-04 - 2020-08-21)
# https://www.ted.com/talks/sir_ken_robinson_do_schools_kill_creativity?language=en

Unnamed: 0,created_date,text
1021,2020-08-22,"RT @TEDxCDMX: Hoy más que nunca, recordemos es..."
1022,2020-08-22,그럼 어째서 우리는 결코 사무엘 피어퐁 랭리에 대해서 들어본 적이 없는가? \nTh...
1023,2020-08-22,RT @fahadalahmdi: هل تعرف ماهو الفرق بين السؤا...
1024,2020-08-22,"@SirKenRobinson murió ayer 21 agosto 2020, gra..."
1025,2020-08-22,RT @thewetbaguett3: Like for part 7! #ted #mov...
1026,2020-08-22,그들은 없었다. 우리가 성공의 레서피라 여기는 것들을 \nthey had none ...
1027,2020-08-22,Like for part 7! #ted #movie #moviescene #4u #...
1028,2020-08-22,ダニエル・H・コーエン: よい議論をするために\nhttps://t.co/llqGnURP...
1029,2020-08-22,"RT @TEDxCDMX: Hoy más que nunca, recordemos es..."
1030,2020-08-22,If you've never watched @SirKenRobinson's orig...


In [20]:
# report on the number of Tweets per hour
df[['created_at', 'created_hour']].groupby(['created_hour']).count()

Unnamed: 0_level_0,created_at
created_hour,Unnamed: 1_level_1
0,163
1,68
2,98
3,165
4,123
5,102
6,162
7,287
8,170
9,232


In [21]:
# report on the number of Tweets per day, per hour
created_date_hour = df[['created_at', 'created_date', 'created_hour']].groupby(['created_date', 'created_hour']).count()
created_date_hour

max_date[['created_at', 'created_date', 'created_hour']].groupby(['created_date', 'created_hour']).count().sort_values(by = 'created_at', ascending = False)[:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,created_at
created_date,created_hour,Unnamed: 2_level_1
2020-08-22,13,90
2020-08-22,20,79
2020-08-22,18,56
2020-08-22,15,54
2020-08-22,14,46
2020-08-22,12,45
2020-08-22,19,44
2020-08-22,17,44
2020-08-22,22,43
2020-08-22,16,42


In [22]:
# 25 different languages represented in this collection
!python twitter_lang.py ted tedtweets twitter_lang_results.csv

Wrote 25 to file


In [23]:
# Using twitter_lang.py as an example, use different fields
!python Raya_Young_twitter_name.py ted tedtweets twitter_name_results.csv
# 2645 unique users

Wrote 2645 to file


In [24]:
# Top 20 Frequency Hashtags
!python twitter_hashtags.py ted tedtweets 20

Top 20 Frequency Hashtags
TedTalk 925
TED 704
Leadership 576
Management 382
TEDx 283
tedx 265
Tedx 210
ted 160
TEDTalk 107
教育 91
Ted 77
動画 74
tedxtcet 70
فهد_الأحمدي 67
تيد 67
صباح_الخير 67
talk 58
IFTHH 56
tedtalk 55
technology 52


In [25]:
# What are the number of English Tweets in this collection?
dfen = df[df['lang']=='en']
len(dfen) # 3418 tweets in English

3418

In [26]:
dfen[['retweet_count']].mean() # The average number of retweets in this collection is 28.8
max_RT = dfen.sort_values(by = 'retweet_count', ascending = False)[:1] 
max_RT # The maximum number of retweets in this collection is 1235

# Sandeep Ahlawat, Lieutenant Colonel of Indian Army - https://www.youtube.com/watch?v=8wU-cK9G4V8
# https://twitter.com/SandyAhlawat89/status/1175335660162433027

Unnamed: 0,_id,created_at,text,entities,user,retweet_count,favorite_count,lang,created_date,created_hour
4600,5f4aa89bd6a56ce85825ae65,2020-08-21 05:44:10,RT @SandyAhlawat89: When civilian countrymen a...,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'id': 2880746036, 'id_str': '2880746036', 'na...",1235,0,en,2020-08-21,5


In [27]:
dfen[['favorite_count']].mean() # The average number of favorites in this collection is 2
max_fave = dfen.sort_values(by = 'favorite_count', ascending = False)[:1]
max_fave # The maximum number of favorites in this collection is 818

Unnamed: 0,_id,created_at,text,entities,user,retweet_count,favorite_count,lang,created_date,created_hour
2920,5f4aa86f678bd9aa94f14169,2020-08-22 12:45:33,Good morning. If you haven’t watched my #TedTa...,"{'hashtags': [{'text': 'TedTalk', 'indices': [...","{'id': 17375057, 'id_str': '17375057', 'name':...",286,818,en,2020-08-22,12


In [28]:
# What are the shared features of the top 1000 most popular Retweets?
topRT = dfen.sort_values(by = 'retweet_count', ascending = False).head(1000) # 1/3 of collection for training
topRT

Unnamed: 0,_id,created_at,text,entities,user,retweet_count,favorite_count,lang,created_date,created_hour
4600,5f4aa89bd6a56ce85825ae65,2020-08-21 05:44:10,RT @SandyAhlawat89: When civilian countrymen a...,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'id': 2880746036, 'id_str': '2880746036', 'na...",1235,0,en,2020-08-21,5
3298,5f4aa89bd6a56ce85825a94f,2020-08-28 21:57:27,RT @AttorneyGriggs: “From Civil Rights to Soci...,"{'hashtags': [{'text': 'justicefighter', 'indi...","{'id': 32931825, 'id_str': '32931825', 'name':...",492,0,en,2020-08-28,21
2821,5f4aa86f678bd9aa94f14106,2020-08-22 14:25:13,RT @hmcghee: Good morning. If you haven’t watc...,"{'hashtags': [{'text': 'TedTalk', 'indices': [...","{'id': 16586846, 'id_str': '16586846', 'name':...",286,0,en,2020-08-22,14
2804,5f4aa86f678bd9aa94f140f5,2020-08-22 15:14:06,RT @hmcghee: Good morning. If you haven’t watc...,"{'hashtags': [{'text': 'TedTalk', 'indices': [...","{'id': 1476103248, 'id_str': '1476103248', 'na...",286,0,en,2020-08-22,15
2805,5f4aa86f678bd9aa94f140f6,2020-08-22 15:12:45,RT @hmcghee: Good morning. If you haven’t watc...,"{'hashtags': [{'text': 'TedTalk', 'indices': [...","{'id': 1454712913, 'id_str': '1454712913', 'na...",286,0,en,2020-08-22,15
...,...,...,...,...,...,...,...,...,...,...
3600,5f4aa89bd6a56ce85825aa7d,2020-08-27 03:35:21,RT @LollyDaskal: Fear and focus are the only t...,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'id': 198787129, 'id_str': '198787129', 'name...",8,0,en,2020-08-27,3
2210,5f4aa86f678bd9aa94f13ea3,2020-08-24 12:47:09,RT @LollyDaskal: Our life is a reflection of t...,"{'hashtags': [{'text': 'Leadership', 'indices'...","{'id': 902154554216783872, 'id_str': '90215455...",8,0,en,2020-08-24,12
2208,5f4aa86f678bd9aa94f13ea1,2020-08-24 12:56:02,RT @LollyDaskal: Our life is a reflection of t...,"{'hashtags': [{'text': 'Leadership', 'indices'...","{'id': 1403356447, 'id_str': '1403356447', 'na...",8,0,en,2020-08-24,12
3080,5f4aa86f678bd9aa94f14209,2020-08-21 12:59:01,RT @LollyDaskal: A bad attitude is like a flat...,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'id': 1403356447, 'id_str': '1403356447', 'na...",8,0,en,2020-08-21,12


In [29]:
# consolidated repeated RTs
topRT = dfen[['text', 'retweet_count']].groupby(topRT['text']).sum().sort_values(by = 'retweet_count', ascending = False)
topRT

# Racism Has a Cost for Everyone - Heather C. McGee
# https://www.ted.com/talks/heather_c_mcghee_racism_has_a_cost_for_everyone?utm_source=t.co&utm_content=2020-8-20&utm_medium=referral&utm_campaign=social

Unnamed: 0_level_0,retweet_count
text,Unnamed: 1_level_1
"RT @hmcghee: Good morning. If you haven’t watched my #TedTalk, please do — and let me know what you think.",50050
"RT @hmcghee: Good morning. If you haven’t watched my #TedTalk, please do — and let me know what you think. https://t.co/1sOAaRf0Ng",15730
RT @maysoonzayid: TALK TO THE #DISABLED PERSON NOT THE NON DISABLED PARENTS. Thanks for coming to my #TedTalk,6810
RT @DuncanJWardle: Which is your favorite place to #ThinkDifferent? Mine is at my @TEDx talk on the Theory of #Creativity at @Tedx_AUK: htt…,2288
"RT @EmergMedDr: If you get bit by an insect , it stings and hurts but you don't need to seek help straight away in an A&amp;E.\n\nBuy some antihi…",1634
...,...
RT @LollyDaskal: A caring attitude is one of the great leadership qualities of a great leader.\n~@LollyDaskal https://t.co/ppfSHzIgez #Leade…,8
RT @LollyDaskal: You don’t INSPIRE OTHERS by speaking about how amazing you are.\n\nYou INSPIRE OTHERS by showing them how amazing THEY ARE.…,8
"RT @TEDxTCET: Even in these difficult times, a sister's love always shines. Here's to a quarantine Raksha Bandhan! #tedxtcet #tedxtcet2020…",8
A caring attitude is one of the great leadership qualities of a great leader.\n~@LollyDaskal https://t.co/ppfSHzIgez… https://t.co/jFj35CfF76,8


In [30]:
# What are the number of English Tweets in this collection?
dfen = df[df['lang']=='en']
len(dfen) # 3418 tweets in English

3418

In [31]:
# Text Tokenization
import nltk

client = pymongo.MongoClient('localhost', 27017)

db = client.ted

db.list_collection_names()

['tedtweets']

In [32]:
collection = db.tedtweets

tweets = collection.find()

tweetlist = list(tweets)

textlist = [tweet['text'] for tweet in tweetlist if 'text' in tweet.keys()]
len(textlist)

4601

In [33]:
all_tokens = [tok for text in textlist for tok in nltk.word_tokenize(text)]
len(all_tokens) #119204
all_tokens[:50]

['My',
 'college',
 'students',
 'are',
 'making',
 'their',
 'own',
 'Bitmoji',
 'classrooms',
 'and',
 'I',
 'absolutely',
 'love',
 'it',
 '!',
 '#',
 'BMCC',
 '#',
 'TED',
 'PARTY',
 'ON',
 'DUDES',
 '.',
 '#',
 'BillAndTed',
 '#',
 'bill',
 '#',
 'ted',
 '#',
 'billandtedfancam',
 '#',
 'fancam',
 '#',
 'fancams',
 '#',
 'billandtedfancams',
 'https',
 ':',
 '//t.co/5MyCwhhD0D',
 'Kolonizacja',
 'Marsa',
 'a',
 'praca',
 'w',
 'sektorze',
 'kosmicznym',
 '-',
 'moje',
 'wystąpienie']

In [34]:
textFD = nltk.FreqDist(all_tokens)
textFD.most_common(30)

[('#', 9433),
 (':', 6521),
 ('@', 4530),
 ('https', 3466),
 ('RT', 2665),
 ('.', 2662),
 ('LollyDaskal', 1894),
 (',', 1886),
 ('you', 1604),
 ('to', 1531),
 ('a', 1243),
 ('the', 1238),
 ('and', 978),
 ('is', 956),
 ('of', 939),
 ('TedTalk', 908),
 ('//t.co/ppfSHzIgez', 824),
 ('’', 800),
 ('~', 800),
 ('TED', 787),
 ('in', 635),
 ('what', 593),
 ('my', 578),
 ('for', 552),
 ('!', 549),
 ('do', 534),
 ('Leadership', 515),
 ('I', 456),
 ('de', 441),
 ('t', 431)]

In [35]:
import re
def alpha_filter(w):
    pattern = re.compile('^[^a-z]+$')
    if (pattern.match(w)):
        return True
    else: 
        return False
    
token_list = [tok for tok in all_tokens if not alpha_filter(tok)]
token_list[:30]

['My',
 'college',
 'students',
 'are',
 'making',
 'their',
 'own',
 'Bitmoji',
 'classrooms',
 'and',
 'absolutely',
 'love',
 'it',
 'BillAndTed',
 'bill',
 'ted',
 'billandtedfancam',
 'fancam',
 'fancams',
 'billandtedfancams',
 'https',
 '//t.co/5MyCwhhD0D',
 'Kolonizacja',
 'Marsa',
 'a',
 'praca',
 'w',
 'sektorze',
 'kosmicznym',
 'moje']

In [36]:
textFD = nltk.FreqDist(token_list)

top_words = textFD.most_common(30)

for word, freq in top_words:
    print(word, freq)

https 3466
LollyDaskal 1894
you 1604
to 1531
a 1243
the 1238
and 978
is 956
of 939
TedTalk 908
//t.co/ppfSHzIgez 824
in 635
what 593
my 578
for 552
do 534
Leadership 515
de 441
t 431
your 424
on 405
TEDx 375
If 369
know 367
Management 361
it 347
have 342
are 337
me 334
from 317


In [37]:
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk_stopwords = nltk.corpus.stopwords.words('english')
nltk_stopwords

# transcript analysis; word cloud, sentiment analysis
max_views_script = max_views['Transcript'] # Do Schools Kill Creativity? - Sir Ken Robinson

max_comments_script = max_comments['Transcript'] # Militant Atheism - Richard Dawkins

In [38]:
views_tokens = max_views_script.apply(word_tokenize)
views_tokens_list = [word for word in views_tokens if word not in nltk_stopwords]
views_tokens_list

[['Good',
  'morning',
  '.',
  'How',
  'are',
  'you',
  '?',
  '(',
  'Audience',
  ')',
  'Good',
  '.',
  'It',
  "'s",
  'been',
  'great',
  ',',
  'has',
  "n't",
  'it',
  '?',
  'I',
  "'ve",
  'been',
  'blown',
  'away',
  'by',
  'the',
  'whole',
  'thing',
  '.',
  'In',
  'fact',
  ',',
  'I',
  "'m",
  'leaving',
  '.',
  '(',
  'Laughter',
  ')',
  'There',
  'have',
  'been',
  'three',
  'themes',
  'running',
  'through',
  'the',
  'conference',
  ',',
  'which',
  'are',
  'relevant',
  'to',
  'what',
  'I',
  'want',
  'to',
  'talk',
  'about',
  '.',
  'One',
  'is',
  'the',
  'extraordinary',
  'evidence',
  'of',
  'human',
  'creativity',
  'in',
  'all',
  'of',
  'the',
  'presentations',
  'that',
  'we',
  "'ve",
  'had',
  'and',
  'in',
  'all',
  'of',
  'the',
  'people',
  'here',
  ';',
  'just',
  'the',
  'variety',
  'of',
  'it',
  'and',
  'the',
  'range',
  'of',
  'it',
  '.',
  'The',
  'second',
  'is',
  'that',
  'it',
  "'s",
  'put

In [39]:
from textblob import TextBlob

text = str(views_tokens_list)

blob = TextBlob(text)
blob.tags           # [('The', 'DT'), ('titular', 'JJ'),
                    #  ('threat', 'NN'), ('of', 'IN'), ...]

blob.noun_phrases   # WordList(['titular threat', 'blob',
                    #            'ultimate movie monster',
                    #            'amoeba-like mass', ...])

for sentence in blob.sentences:
    print(sentence.sentiment.polarity)

0.7
0.0
0.7
0.8
0.2
0.0
0.4
0.16666666666666666
0.0
0.0
0.0
0.0
0.0
0.65
0.0
0.0
-0.05
-0.05
0.0
0.0
0.0
0.0
0.0
0.14285714285714285
-0.125
0.0
0.20000000000000004
0.0
-0.25
0.0
0.3333333333333333
0.16666666666666666
-0.15625
0.0
0.5111111111111111
0.3333333333333333
-0.13888888888888892
0.0
0.2
0.0
0.0
0.26
0.0
0.37083333333333335
-0.14583333333333334
0.0
0.0
0.0
0.0
0.0
0.3
0.0
0.0
0.0
0.0
0.0
0.0
0.6
0.0
0.4375
0.0
0.0
0.2
0.15625
0.0
-0.5
0.0
0.0
0.04722222222222221
0.0
0.0
-0.3
0.0
0.2857142857142857
-0.5
-0.078125
-0.20833333333333334
0.5
-0.5
0.0
0.0
-1.0
0.5
0.0
0.0
-0.05
0.0
0.0
0.0
0.0
0.1
0.0
0.13636363636363635
0.0
0.0
0.0
0.0
0.0
0.0
-0.3125
0.0
-0.8
-0.1
0.0
0.0
-0.19444444444444448
0.0
-0.3
0.0
0.0
0.0
0.0
0.7
0.5
0.0
-0.025
0.2
0.0
0.3333333333333333
0.0
0.0
0.0
0.0
0.5
0.0
0.0
0.225
0.2
0.0
0.0
0.0
0.4
0.52
0.0
0.0
0.0
0.5
0.0
-0.16666666666666666
-0.125
0.3
0.0
0.5
0.0
0.0
0.0
0.0
-0.1
-0.1
-0.015151515151515152
-0.015151515151515152
0.6
0.0
0.0
0.0
0.1
0.0
0.0
0.0
0.

In [40]:
comments_tokens = max_comments_script.apply(word_tokenize)
comments_tokens_list = [word for word in comments_tokens if word not in nltk_stopwords]
comments_tokens_list

[['That',
  'splendid',
  'music',
  ',',
  'the',
  'coming-in',
  'music',
  ',',
  '``',
  'The',
  'Elephant',
  'March',
  "''",
  'from',
  '``',
  'Aida',
  ',',
  "''",
  'is',
  'the',
  'music',
  'I',
  "'ve",
  'chosen',
  'for',
  'my',
  'funeral',
  '.',
  '(',
  'Laughter',
  ')',
  'And',
  'you',
  'can',
  'see',
  'why',
  '.',
  'It',
  "'s",
  'triumphal',
  '.',
  'I',
  'wo',
  "n't",
  'feel',
  'anything',
  ',',
  'but',
  'if',
  'I',
  'could',
  ',',
  'I',
  'would',
  'feel',
  'triumphal',
  'at',
  'having',
  'lived',
  'at',
  'all',
  ',',
  'and',
  'at',
  'having',
  'lived',
  'on',
  'this',
  'splendid',
  'planet',
  ',',
  'and',
  'having',
  'been',
  'given',
  'the',
  'opportunity',
  'to',
  'understand',
  'something',
  'about',
  'why',
  'I',
  'was',
  'here',
  'in',
  'the',
  'first',
  'place',
  ',',
  'before',
  'not',
  'being',
  'here',
  '.',
  'Can',
  'you',
  'understand',
  'my',
  'quaint',
  'English',
  'accent',

In [41]:
text = str(comments_tokens_list)

blob = TextBlob(text)
blob.tags           # [('The', 'DT'), ('titular', 'JJ'),
                    #  ('threat', 'NN'), ('of', 'IN'), ...]

blob.noun_phrases   # WordList(['titular threat', 'blob',
                    #            'ultimate movie monster',
                    #            'amoeba-like mass', ...])

for sentence in blob.sentences:
    print(sentence.sentiment.polarity)

0.8333333333333334
0.0
0.0
0.5416666666666667
0.0
0.0
0.35
0.2916666666666667
-0.20625
-0.022222222222222216
0.05
0.10714285714285714
0.3
0.0
-0.3
0.36767676767676766
0.0
0.13636363636363635
0.0
-0.013888888888888895
0.6
0.0
0.42000000000000004
-0.1777777777777778
0.5000000000000001
0.0
-0.125
0.6
-0.1
0.2857142857142857
0.2857142857142857
0.16666666666666666
0.11666666666666665
0.4
0.5
0.0
-0.3
0.04484848484848485
-0.25
0.7
0.0
-0.3
0.0
-0.02083333333333333
0.06666666666666667
0.3
0.2
0.3
0.85
-0.6999999999999998
0.0
0.2857142857142857
0.2
0.04107142857142857
0.0
0.0
0.7
0.0
-0.25
0.0
0.21428571428571427
-0.6999999999999998
0.0
0.0
0.0
1.0
0.15000000000000002
0.0
0.2
-0.0625
0.0
0.0
0.4666666666666666
0.0
0.5
-0.16666666666666666
0.35714285714285715
0.016666666666666663
-0.25
0.0
0.07777777777777777
-0.1
-0.1
0.06666666666666665
0.0
0.0
0.0
0.0
-0.5
0.0
0.0
0.0
0.0
0.2
0.5
-0.25
0.0
0.0
-0.25
0.0
0.15555555555555556
0.5
0.0
0.3666666666666667
0.6
0.0
0.04999999999999999
-0.15454545454