# Capstone Project 1 : TED talk analysis

Springboard Data Science Career Track ; 
Author : Pavan Poosarla, pavanpoosarla01@gmail.com


Start Date : 9/ 10/ 2019
Description :
As a part of the first capstone project, I will be analysing TED talk transcripts and analyse the sentiment

Date Source
https://www.kaggle.com/rounakbanik/ted-talks/downloads/ted-talks.zip/3

## Part 1 : Data Wrangling

In [64]:
# Import packages
import matplotlib.pyplot as plt
import numpy
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')
from tqdm import tqdm


[nltk_data] Downloading package punkt to C:\Users\Pavan
[nltk_data]     Anirudh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [65]:
# Define function to check for missing values
def CheckMissing_df(df):
    '''Function to remove missing values from the dataframe. It does the following
    Print the number of missing values in each columns
    '''
    missing_dict = {'ColName' : [], 'MissingNumber':[]}
    df_cols = df.columns
    for item in df_cols:
        missing_dict['ColName'].append(item)
        missing_dict['MissingNumber'].append(df[item].isna().sum())
    missing_df = pd.DataFrame(missing_dict)
    return missing_df

In [66]:
# Define function to count the number of sentences in a text
def CountSentence(text):
    '''Fuction to count the number of sentences in a text in a dataframe'''
    sen_list = text.split('.')
    sen_list = [x for x in sen_list if x != '']
    return len(sen_list)

In [67]:
def AudReaction(text):
    '''Function to extract all audience reactions provided within the transcripts as a list'''
    openn = text.split('(')
    aud_react_list = []
    for txt in openn:
        closee = txt.split(')')
        if len(closee)==2:
            aud_react_list.append(closee[0])
    aud_react_count = len(aud_react_list)
    aud_react_dict = dict((x,aud_react_list.count(x)) for x in set(aud_react_list))
    return aud_react_dict   

In [68]:
def WordCount (text):
    '''Function to extract word count from a transcript after eliminating text in parenthesis'''
    sen_list = text.split('.')
    sen_list = [x for x in sen_list if x != '']
    len(sen_list)
    clean_sen = []
    for sen in sen_list:
        openn = sen.split('(')
        if len(openn) == 1:
            clean_sen.append(sen)
        else:
            for op in openn:
                closee = op.split(')')
                if len(closee) == 2:
                    closee = closee[1]
                elif len(closee) == 1:
                    closee = closee[0]
                else:
                    # split_2way = op.split(')',2)  
                    # closee = split_2way[1]
                    closee = ')'.join(closee)
                    # print ('error:', op)
                    # print (closee)
            clean_sen.append(closee)
        # print (sen)
        # print (close)
    word_count = sum([len(x.split(' ')) for x in clean_sen])
    return word_count

In [69]:
# Functon for text normalization
def text_normalize(text):
    normalized_text = text
    # Strip leading and lagging whitespace
    normalized_text = normalized_text.strip()
    # Convert all text to lower case
    normalized_text.lower()
    # Remove punctuation
    normalized_text.translate( str.maketrans('','', string.punctuation))
    # Word tokenization
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(normalized_text)
    result = [i for i in tokens if not i in stop_words]
    stemmer = PorterStemmer()
    stemmed_result = []
    for word in result:
        stemmed_result.append(stemmer.stem(word))
    return stemmed_result

In [70]:
# Read data from the Kaggle TED talk database
df_main = pd.read_csv(r'ted_main.csv', parse_dates = ['film_date', 'published_date'])
df_transcripts = pd.read_csv(r'transcripts.csv')

In [71]:
df_main.head(2)

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520


In [72]:
df_main.shape

(2550, 17)

In [73]:
df_transcripts.shape

(2467, 2)

In [74]:
df_transcripts.head(2)

Unnamed: 0,transcript,url
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...


In [75]:
# Join both databases
# lets work with those combined dataframe from both files
df_joined = pd.merge (df_main, df_transcripts, how = 'inner', on = 'url')
print ('Combined Dataframe is read')
print ('Raw dataset has %d columns and %d talks'%(df_joined.shape[1], df_joined.shape[0]))

Combined Dataframe is read
Raw dataset has 18 columns and 2467 talks


In [76]:
# Look for any missing data
missing_df = CheckMissing_df(df_joined)
print ('Columns with missing values are \n', missing_df[missing_df.MissingNumber != 0])

Columns with missing values are 
                ColName  MissingNumber
12  speaker_occupation              6


In [77]:
# Looks like speaker occupation is not available for 6 talks
# Fill Missing values
df_joined['speaker_occupation'].fillna('Unknown', inplace = True)
print ('After filling, # missing in speaker occupation is', df_joined['speaker_occupation'].isna().sum())

After filling, # missing in speaker occupation is 0


## Building a clean dataset

In [78]:
df_clean = df_joined

In [79]:
# Drop talks with more than 1 speaker
df_clean = df_joined[df_joined.num_speaker == 1] # To avoid convoluting the analysis

In [80]:
# Drop the columns that are not meaningful to the analysis
df_clean = df_clean.drop(columns = ['related_talks', 'languages', 'url','event','name','speaker_occupation','views', 'num_speaker'])
print ('After dropping extra columns, dataset has', df_clean.columns)

After dropping extra columns, dataset has Index(['comments', 'description', 'duration', 'film_date', 'main_speaker',
       'published_date', 'ratings', 'tags', 'title', 'transcript'],
      dtype='object')


In [81]:
df_clean.dtypes

comments           int64
description       object
duration           int64
film_date         object
main_speaker      object
published_date    object
ratings           object
tags              object
title             object
transcript        object
dtype: object

In [82]:
# Convert film_date into human readable format
df_clean['film_datestamp'] = pd.to_datetime(df_clean['film_date'],unit='s')
df_clean.film_datestamp.head(3)

0   2006-02-25
1   2006-02-25
2   2006-02-24
Name: film_datestamp, dtype: datetime64[ns]

In [83]:
# Do the same date conversaion with published datetime
df_clean['pub_datestamp'] = pd.to_datetime(df_clean['published_date'], unit = 's')
df_clean.pub_datestamp.head(3)

0   2006-06-27 00:11:00
1   2006-06-27 00:11:00
2   2006-06-27 00:11:00
Name: pub_datestamp, dtype: datetime64[ns]

# Text Analysis

1. Sentence Counter
Defining the number of sentence talks in spoken english needs some thought. Turns out, we do not speak in perfecty grammmatical sentences. So, we basically look at number of pauses in the talk to get an estimation of number of sentences. While this choice may be deatable from a purely grammatical standpoint, it lets us know where the speaker paused. As that may make the speech understandable, we will just count in thus way. 


In [84]:
# Build a sentence counter
df_clean['sentence_count'] = df_clean['transcript'].apply(CountSentence)

In [85]:
# Build a dictionary of audience reaction throughout the talk
df_clean['aud_reaction_dict'] = df_clean['transcript'].apply(AudReaction)


In [86]:
# Lets build a word counter
df_clean['word_count'] = df_clean['transcript'].apply(WordCount)
# df_clean

In [87]:
df_clean.head(3)

Unnamed: 0,comments,description,duration,film_date,main_speaker,published_date,ratings,tags,title,transcript,film_datestamp,pub_datestamp,sentence_count,aud_reaction_dict,word_count
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,1140825600,Ken Robinson,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,Good morning. How are you?(Laughter)It's been ...,2006-02-25,2006-06-27 00:11:00,223,"{'Laughter': 39, 'Applause': 4}",3172
1,265,With the same humor and humanity he exuded in ...,977,1140825600,Al Gore,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","['alternative energy', 'cars', 'climate change...",Averting the climate crisis,"Thank you so much, Chris. And it's truly a gre...",2006-02-25,2006-06-27 00:11:00,141,"{'Applause': 6, 'Laughter': 22, 'Mock sob': 2}",2081
2,124,New York Times columnist David Pogue takes aim...,1286,1140739200,David Pogue,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","['computers', 'entertainment', 'interface desi...",Simplicity sells,"(Music: ""The Sound of Silence,"" Simon & Garfun...",2006-02-24,2006-06-27 00:11:00,250,"{'Piano music': 1, 'Music: ""The Sound of Silen...",3375


In [88]:
df_clean.dtypes

comments                      int64
description                  object
duration                      int64
film_date                    object
main_speaker                 object
published_date               object
ratings                      object
tags                         object
title                        object
transcript                   object
film_datestamp       datetime64[ns]
pub_datestamp        datetime64[ns]
sentence_count                int64
aud_reaction_dict            object
word_count                    int64
dtype: object

## Creating a Audience reaction Columns
To create audience reaction columns, we do the following
1. Extract audience reactions from transcript and have them as a dictionary
2. Convert the dictionary to a dataframe, resulting in a sparsely populated df with several columns
3. Consolidate this df by merging sparse columns and filling in misisng values
4. Reduce number of columns in this way and append to clean dataframe

In [210]:
# Example of dict element in audience reaction column
df_clean.loc[2,'aud_reaction_dict']

{'Piano music': 1,
 'Music: "The Sound of Silence," Simon & Garfunkel': 1,
 'Music': 2,
 'Laughter': 38,
 'Applause': 12}

In [211]:
# Convert the dictionary of audience reactions into separate columns for aud reactions
df_aud_react = df_clean['aud_reaction_dict'].apply(pd.Series)
df_aud_react.head(3)

Unnamed: 0,Laughter,Applause,Mock sob,Piano music,"Music: ""The Sound of Silence,"" Simon & Garfunkel",Music,Audience whistles,Applause ends,Applause continues,Cheering,...,Distorted voice,Speaks more loudly,Khmer,Singing in Arabic,Audience guesses,Cracking sound,Audience murmurs,Audience responds,offscreen,A capella singing
0,39.0,4.0,,,,,,,,,...,,,,,,,,,,
1,22.0,6.0,2.0,,,,,,,,...,,,,,,,,,,
2,38.0,12.0,,1.0,1.0,2.0,,,,,...,,,,,,,,,,


As can be seen, there are 791 unique audience reactions, which need to be brought down

In [212]:
df_aud_clean = df_aud_react

In [213]:
# Change all column names to lower case
col_names = df_aud_clean.columns
new_col_names = [X.lower() for X in list(col_names)]
df_aud_clean.columns = new_col_names

In [214]:
df_aud_clean.columns

Index(['laughter', 'applause', 'mock sob', 'piano music',
       'music: "the sound of silence," simon & garfunkel', 'music',
       'audience whistles', 'applause ends', 'applause continues', 'cheering',
       ...
       'distorted voice', 'speaks more loudly', 'khmer', 'singing in arabic',
       'audience guesses', 'cracking sound', 'audience murmurs',
       'audience responds', 'offscreen', 'a capella singing'],
      dtype='object', length=791)

In [218]:
# df_aud_clean.describe()

In [219]:
df_aud_clean.shape

(2412, 27)

In [221]:
df_aud_clean = df_aud_clean.dropna(axis = 1, thresh = 10)
df_aud_clean.shape

(2412, 12)

By dropping all columns which have less than 10 values, we drop the column count from 791 to 10. Lets look at the columns

In [222]:
df_aud_clean.describe()

Unnamed: 0,laughter,applause,music,applause ends,cheering,sighs,video,singing,music ends,laughs,audio,cheers
count,1767.0,2288.0,205.0,27.0,15.0,10.0,167.0,16.0,16.0,39.0,11.0,28.0
mean,5.615733,2.291958,2.780488,1.481481,1.133333,1.6,2.083832,2.5,2.5625,1.179487,5.090909,1.357143
std,6.567969,1.987826,2.951505,1.014145,0.351866,1.577621,1.961722,2.804758,1.672075,0.50637,5.769827,0.621485
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0
50%,4.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2.5,1.0,3.0,1.0
75%,7.0,3.0,3.0,1.5,1.0,1.0,3.0,2.25,3.25,1.0,4.5,2.0
max,74.0,18.0,20.0,5.0,2.0,6.0,14.0,9.0,7.0,3.0,17.0,3.0


Further consolidating the columns


In [223]:
df_aud_clean = df_aud_clean.fillna(0)
df_aud_clean.head()


Unnamed: 0,laughter,applause,music,applause ends,cheering,sighs,video,singing,music ends,laughs,audio,cheers
0,39.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,22.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,38.0,12.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [224]:
df_aud_clean.shape

(2412, 12)

In [225]:
df_aud_clean['laughter'] = df_aud_clean['laughter'].add(df_aud_clean['laughs'])
df_aud_clean['applause'] = df_aud_clean['applause'].add(df_aud_clean['applause ends'])
df_aud_clean['music'] = df_aud_clean['music'].add(df_aud_clean['music ends'])
df_aud_clean['cheering'] = df_aud_clean['cheering'].add(df_aud_clean['cheers'])
df_aud_clean.describe()

Unnamed: 0,laughter,applause,music,applause ends,cheering,sighs,video,singing,music ends,laughs,audio,cheers
count,2412.0,2412.0,2412.0,2412.0,2412.0,2412.0,2412.0,2412.0,2412.0,2412.0,2412.0,2412.0
mean,4.133085,2.190713,0.253317,0.016584,0.022803,0.006633,0.144279,0.016584,0.016998,0.019071,0.023217,0.015755
std,6.154457,2.049802,1.267634,0.188135,0.186373,0.140942,0.738184,0.30024,0.246336,0.161806,0.505753,0.159584
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,74.0,18.0,27.0,5.0,3.0,6.0,14.0,9.0,7.0,3.0,17.0,3.0


In [226]:
df_aud_clean.drop(columns = ['laughs', 'applause ends', 'music ends','cheers'], inplace = True)


In [230]:
df_aud_clean.shape

(2412, 8)

In [229]:
df_clean.shape

(2412, 16)

In [236]:
df_clean = pd.merge (df_clean, df_aud_clean, how = 'inner', left_index = True, right_index = True)
df_clean.shape

(2412, 24)

## Extract Ratings as columns in df

In [242]:
# See an example element in ratings column
ratings_example = df_clean.loc[2,'ratings']
ratings_example

"[{'id': 7, 'name': 'Funny', 'count': 964}, {'id': 3, 'name': 'Courageous', 'count': 45}, {'id': 9, 'name': 'Ingenious', 'count': 183}, {'id': 1, 'name': 'Beautiful', 'count': 60}, {'id': 21, 'name': 'Unconvincing', 'count': 104}, {'id': 11, 'name': 'Longwinded', 'count': 78}, {'id': 8, 'name': 'Informative', 'count': 395}, {'id': 10, 'name': 'Inspiring', 'count': 230}, {'id': 22, 'name': 'Fascinating', 'count': 166}, {'id': 2, 'name': 'Confusing', 'count': 27}, {'id': 25, 'name': 'OK', 'count': 146}, {'id': 24, 'name': 'Persuasive', 'count': 230}, {'id': 23, 'name': 'Jaw-dropping', 'count': 54}, {'id': 26, 'name': 'Obnoxious', 'count': 142}]"

In [240]:
type(ratings_example)

str

In [293]:
import json
str_list = ratings_example.strip('[]').split('},')
str_list = [(X+'}') for X in str_list if X[-1] != '}']
str_list = [X.replace('\'', '\"') for X in str_list]
dict_list = [json.loads(X) for X in str_list]
dict_list

[{'id': 7, 'name': 'Funny', 'count': 964},
 {'id': 3, 'name': 'Courageous', 'count': 45},
 {'id': 9, 'name': 'Ingenious', 'count': 183},
 {'id': 1, 'name': 'Beautiful', 'count': 60},
 {'id': 21, 'name': 'Unconvincing', 'count': 104},
 {'id': 11, 'name': 'Longwinded', 'count': 78},
 {'id': 8, 'name': 'Informative', 'count': 395},
 {'id': 10, 'name': 'Inspiring', 'count': 230},
 {'id': 22, 'name': 'Fascinating', 'count': 166},
 {'id': 2, 'name': 'Confusing', 'count': 27},
 {'id': 25, 'name': 'OK', 'count': 146},
 {'id': 24, 'name': 'Persuasive', 'count': 230},
 {'id': 23, 'name': 'Jaw-dropping', 'count': 54}]