# Capstone Project 1 : TED talk analysis

Springboard Data Science Career Track ; 
Author : Pavan Poosarla, pavanpoosarla01@gmail.com


Start Date : 9/ 10/ 2019
Description :
As a part of the first capstone project, I will be analysing TED talk transcripts and analyse the sentiment

Date Source
https://www.kaggle.com/rounakbanik/ted-talks/downloads/ted-talks.zip/3

## Part 1 : Data Wrangling

In [1]:
# Import packages
import matplotlib.pyplot as plt
import numpy
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')
from tqdm import tqdm


[nltk_data] Downloading package punkt to C:\Users\Pavan
[nltk_data]     Anirudh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Define function to check for missing values
def CheckMissing_df(df):
    '''Function to remove missing values from the dataframe. It does the following
    Print the number of missing values in each columns
    '''
    missing_dict = {'ColName' : [], 'MissingNumber':[]}
    df_cols = df.columns
    for item in df_cols:
        missing_dict['ColName'].append(item)
        missing_dict['MissingNumber'].append(df[item].isna().sum())
    missing_df = pd.DataFrame(missing_dict)
    return missing_df

In [111]:
# Define function to count the number of sentences in a text
def CountSentence(text):
    '''Fuction to count the number of sentences in a text in a dataframe'''
    sen_list = text.split('.')
    sen_list = [x for x in sen_list if x != '']
    return len(sen_list)

In [104]:
def AudReaction(text):
    '''Function to extract all audience reactions provided within the transcripts as a list'''
    open = text.split('(')
    aud_react_list = []
    for txt in open:
        close = txt.split(')')
        if len(close)==2:
            aud_react_list.append(close[0])
    aud_react_count = len(aud_react_list)
    aud_react_dict = dict((x,aud_react_list.count(x)) for x in set(aud_react_list))
    return aud_react_dict   

In [105]:
def WordCount (text):
    '''Function to extract word count from a transcript after eliminating text in parenthesis'''
    sen_list = text.split('.')
    sen_list = [x for x in sen_list if x != '']
    len(sen_list)
    clean_sen = []
    for sen in sen_list:
        open = sen.split('(')
        if len(open) == 1:
            clean_sen.append(sen)
        else:
            for op in open:
                close = op.split(')')
                if len(close) == 2:
                    close = close[1]
            clean_sen.append(close)
        # print (sen)
        # print (close)
    word_count = sum([len(x.split(' ')) for x in clean_sen])
    return word_count

In [4]:
# Functon for text normalization
def text_normalize(text):
    normalized_text = text
    # Strip leading and lagging whitespace
    normalized_text = normalized_text.strip()
    # Convert all text to lower case
    normalized_text.lower()
    # Remove punctuation
    normalized_text.translate( str.maketrans('','', string.punctuation))
    # Word tokenization
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(normalized_text)
    result = [i for i in tokens if not i in stop_words]
    stemmer = PorterStemmer()
    stemmed_result = []
    for word in result:
        stemmed_result.append(stemmer.stem(word))
    return stemmed_result

In [5]:
# Read data from the Kaggle TED talk database
df_main = pd.read_csv(r'ted_main.csv', parse_dates = ['film_date', 'published_date'])
df_transcripts = pd.read_csv(r'transcripts.csv')

In [6]:
df_main.head(2)

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520


In [7]:
df_main.shape

(2550, 17)

In [8]:
df_transcripts.shape

(2467, 2)

In [9]:
df_transcripts.head(2)

Unnamed: 0,transcript,url
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...


In [10]:
# Join both databases
# lets work with those combined dataframe from both files
df_joined = pd.merge (df_main, df_transcripts, how = 'inner', on = 'url')
print ('Combined Dataframe is read')
print ('Raw dataset has %d columns and %d talks'%(df_joined.shape[1], df_joined.shape[0]))

Combined Dataframe is read
Raw dataset has 18 columns and 2467 talks


In [11]:
# Look for any missing data
missing_df = CheckMissing_df(df_joined)
print ('Columns with missing values are \n', missing_df[missing_df.MissingNumber != 0])

Columns with missing values are 
                ColName  MissingNumber
12  speaker_occupation              6


In [12]:
# Looks like speaker occupation is not available for 6 talks
# Fill Missing values
df_joined['speaker_occupation'].fillna('Unknown', inplace = True)
print ('After filling, # missing in speaker occupation is', df_joined['speaker_occupation'].isna().sum())

After filling, # missing in speaker occupation is 0


## Building a clean dataset

In [13]:
df_clean = df_joined

In [14]:
# Drop talks with more than 1 speaker
df_clean = df_joined[df_joined.num_speaker == 1] # To avoid convoluting the analysis

In [15]:
# Drop the columns that are not meaningful to the analysis
df_clean = df_clean.drop(columns = ['related_talks', 'languages', 'url','event','name','speaker_occupation','views', 'num_speaker'])
print ('After dropping extra columns, dataset has', df_clean.columns)

After dropping extra columns, dataset has Index(['comments', 'description', 'duration', 'film_date', 'main_speaker',
       'published_date', 'ratings', 'tags', 'title', 'transcript'],
      dtype='object')


In [16]:
df_clean.dtypes

comments           int64
description       object
duration           int64
film_date         object
main_speaker      object
published_date    object
ratings           object
tags              object
title             object
transcript        object
dtype: object

In [17]:
# Convert film_date into human readable format
df_clean['film_datestamp'] = pd.to_datetime(df_clean['film_date'],unit='s')
df_clean.film_datestamp.head(3)

0   2006-02-25
1   2006-02-25
2   2006-02-24
Name: film_datestamp, dtype: datetime64[ns]

In [28]:
# Do the same date conversaion with published datetime
df_clean['pub_datestamp'] = pd.to_datetime(df_clean['published_date'], unit = 's')
df_clean.pub_datestamp.head(3)

0   2006-06-27 00:11:00
1   2006-06-27 00:11:00
2   2006-06-27 00:11:00
Name: pub_datestamp, dtype: datetime64[ns]

# Text Analysis

1. Sentence Counter
Defining the number of sentence talks in spoken english needs some thought. Turns out, we do not speak in perfecty grammmatical sentences. So, we basically look at number of pauses in the talk to get an estimation of number of sentences. While this choice may be deatable from a purely grammatical standpoint, it lets us know where the speaker paused. As that may make the speech understandable, we will just count in thus way. 


In [112]:
# Build a sentence counter
df_clean['sentence_count'] = df_clean['transcript'].apply(CountSentence)

In [113]:
# Build a dictionary of audience reaction throughout the talk
df_clean['aud_reaction_dict'] = df_clean['transcript'].apply(AudReaction)


In [117]:
df_clean.head(3)

Unnamed: 0,comments,description,duration,film_date,main_speaker,published_date,ratings,tags,title,transcript,film_datestamp,sentence_count,aud_reaction_count,aud_reaction,aud_reaction_dict,word_count
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,1140825600,Ken Robinson,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,Good morning. How are you?(Laughter)It's been ...,2006-02-25,223,"(54, {'Piano music': 1, 'Applause': 12, 'Music...","(54, {'Piano music': 1, 'Applause': 12, 'Music...","{'Applause': 4, 'Laughter': 39}",3375
1,265,With the same humor and humanity he exuded in ...,977,1140825600,Al Gore,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","['alternative energy', 'cars', 'climate change...",Averting the climate crisis,"Thank you so much, Chris. And it's truly a gre...",2006-02-25,141,"(54, {'Piano music': 1, 'Applause': 12, 'Music...","(54, {'Piano music': 1, 'Applause': 12, 'Music...","{'Mock sob': 2, 'Applause': 6, 'Laughter': 22}",3375
2,124,New York Times columnist David Pogue takes aim...,1286,1140739200,David Pogue,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","['computers', 'entertainment', 'interface desi...",Simplicity sells,"(Music: ""The Sound of Silence,"" Simon & Garfun...",2006-02-24,250,"(54, {'Piano music': 1, 'Applause': 12, 'Music...","(54, {'Piano music': 1, 'Applause': 12, 'Music...","{'Piano music': 1, 'Applause': 12, 'Music': 2,...",3375


In [115]:
# Lets build a word counter
df_clean['word_count'] = df_clean['transcript'].apply(WordCount)
df_clean

AttributeError: 'list' object has no attribute 'split'

In [109]:
test_talk = df_clean.loc[2,'transcript']
test_talk



In [69]:
# Word Counter
sen_list = test_talk.split('.')
sen_list = [x for x in sen_list if x != '']
len(sen_list)

250

In [97]:
clean_sen = []
for sen in sen_list:
    open = sen.split('(')
    if len(open) == 1:
        clean_sen.append(sen)
    else:
        for op in open:
            close = op.split(')')
            if len(close) == 2:
                close = close[1]
        clean_sen.append(close)
        # print (sen)
        # print (close)
len(clean_sen)

250

In [96]:
word_count = sum([len(x.split(' ')) for x in clean_sen])
word_count

3375

In [100]:
WordCount (test_talk)

3375