# Capstone 1 : Analysis, Part 1

In this section, we will apply ML techniques to the transcript and the features we extracted to predict the ratings

In [1]:
# imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Import NLP Modules
import spacy
import nltk
import re
import unicodedata
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Pavan
[nltk_data]     Anirudh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Pavan
[nltk_data]     Anirudh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Functon for text normalization

def text_preprocess(text):
    # Strip leading and lagging whitespace
    text = text.strip()
    
    # Remove accented characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    
    # Convert all text to lower case
    text.lower()
    
    # Take raw text and remove all audience reactions
    text = re.sub('\((.*?)\)', '', text)
    
    # Expand Contactions
    # Specific
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    # general
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    
    # Remove Punctuation
    pattern = r'[^a-zA-z\s]'
    text = re.sub(pattern, ' ', text)
    
#     # Word Lemmatization
#     wnl = WordNetLemmatizer()
    
#     nlp = spacy.load('en_core_web_sm', parse=True, tag=True, entity=True)
#     text = nlp(text)
#     text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    
    return text


#     # Remove punctuation
#     normalized_text.translate( str.maketrans('','', string.punctuation))
#     # Word tokenization
#     stop_words = set(stopwords.words('english'))
#     tokens = word_tokenize(normalized_text)
#     result = [i for i in tokens if not i in stop_words]
#     stemmer = PorterStemmer()
#     stemmed_result = []
#     for word in result:
#         stemmed_result.append(stemmer.stem(word))
#     return stemmed_result

In [3]:
# Read data after statistical analysis
df_clean = pd.read_csv(r'../data/interim/After_StatisticalAnalysis.csv', index_col = 0)
df_clean.index.name = "index"

In [4]:
df_clean.head(2)

Unnamed: 0_level_0,description,duration,event,main_speaker,speaker_occupation,tags,title,transcript,film_datestamp,pub_datestamp,...,Max_rating,ratings_total,word_per_min,tag_technology,tag_science,tag_global issues,tag_culture,tag_design,tag_business,tag_entertainment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,Ken Robinson,Author/educator,"['tag_children', 'tag_creativity', 'tag_cultur...",Do schools kill creativity?,Good morning. How are you?(Laughter)It's been ...,2006-02-25 00:00:00,2006-06-27 00:11:00,...,Funny,49356,163,False,False,False,True,False,False,False
1,With the same humor and humanity he exuded in ...,977,TED2006,Al Gore,Climate advocate,"['tag_alternative energy', 'tag_cars', 'tag_cl...",Averting the climate crisis,"Thank you so much, Chris. And it's truly a gre...",2006-02-25 00:00:00,2006-06-27 00:11:00,...,BadTalk,1797,127,True,True,True,True,False,False,False


Pre-processing
Before training a naive Bayes Classifier to get best predictor words, we will perform text normalization on the transcripts. We will do the following
1. Remove accented characters and special characters
2. Remove comments and audience reactions in parenthesis
2. Make all text lower cased
3. Expand contractions
 

In [5]:
df_clean['clean_transcript'] = df_clean.transcript.apply(text_preprocess)

In [6]:
# Create class mapping
class_dict = ({'Fascinating': 1, 'BadTalk': 2, 'Beautiful':3,'Informative':4, 'Funny':5})

# class_dict = ({'Inspiring': 1, 'Funny' : 2, 'Informative' : 3,'Ingenious' : 4, 
#                  'Beautiful' : 5, 'Confusing' :6, 'Courageous':7, 'Fascinating' : 8, 
#                 'OK' :9 , 'Unconvincing' : 10})
df_clean['MaxRating_Class'] =  df_clean.Max_rating.map(class_dict)

In [7]:
df_clean.groupby(['Max_rating']).count().title

Max_rating
BadTalk        163
Beautiful      296
Fascinating    880
Funny          154
Informative    821
Name: title, dtype: int64

In [8]:
df_clean.head(2)

Unnamed: 0_level_0,description,duration,event,main_speaker,speaker_occupation,tags,title,transcript,film_datestamp,pub_datestamp,...,word_per_min,tag_technology,tag_science,tag_global issues,tag_culture,tag_design,tag_business,tag_entertainment,clean_transcript,MaxRating_Class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,Ken Robinson,Author/educator,"['tag_children', 'tag_creativity', 'tag_cultur...",Do schools kill creativity?,Good morning. How are you?(Laughter)It's been ...,2006-02-25 00:00:00,2006-06-27 00:11:00,...,163,False,False,False,True,False,False,False,Good morning How are you It is been great ha...,5
1,With the same humor and humanity he exuded in ...,977,TED2006,Al Gore,Climate advocate,"['tag_alternative energy', 'tag_cars', 'tag_cl...",Averting the climate crisis,"Thank you so much, Chris. And it's truly a gre...",2006-02-25 00:00:00,2006-06-27 00:11:00,...,127,True,True,True,True,False,False,False,Thank you so much Chris And it is truly a gr...,2


In [9]:
df_clean.columns

Index(['description', 'duration', 'event', 'main_speaker',
       'speaker_occupation', 'tags', 'title', 'transcript', 'film_datestamp',
       'pub_datestamp', 'sentence_count', 'word_count', 'applause', 'laughter',
       'music', 'cheering', 'sighs', 'singing', 'video', 'audio', 'Funny',
       'Beautiful', 'Informative', 'Fascinating', 'Inspiring', 'event_type',
       'BadTalk', 'Max_rating', 'ratings_total', 'word_per_min',
       'tag_technology', 'tag_science', 'tag_global issues', 'tag_culture',
       'tag_design', 'tag_business', 'tag_entertainment', 'clean_transcript',
       'MaxRating_Class'],
      dtype='object')

# Write Dataset used for modelling to file

In [10]:
df_clean.to_csv('../data/processed/DataforModelling.csv')

In [11]:
df_clean.shape

(2314, 39)

## Find the best predictors(words) for each ratings

We would like to analyse the transcripts to see which words will be strongly predictive of each of the ratings. We do this by training a Naive Bayes classifier by only considering the transcripts.  

### Define Factors and targets for NB model

In [12]:

# X_transcript = df_clean[['transcript']]

y_Funny = df_clean[['Funny']]
y_Informative = df_clean[['Informative']]
y_Inspiring = df_clean[['Inspiring']]
y_MaxRating = df_clean[['MaxRating_Class']]

In [13]:
# X_transcript.head()

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words = stopwords.words('english'), min_df = 2, max_df = 0.98)
vectorizer.fit(df_clean.clean_transcript)
X_transcript = vectorizer.transform(df_clean.clean_transcript)
# X_transcript = X_transcript.todense()

In [15]:
type(X_transcript)

scipy.sparse.csr.csr_matrix

In [16]:
X_transcript.shape

(2314, 32665)

In [17]:
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.98, max_features=None, min_df=2,
                ngram_range=(1, 1), preprocessor=None,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [18]:
# vectorizer.vocabulary_

### Train the Naive Bayes Model

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import balanced_accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X_transcript, (y_MaxRating), test_size = 0.2)
nb = MultinomialNB(alpha=100)
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

# Calculate accuracy score using balanced accuracy scores
print ('Balanced accuracy score (training)is ', balanced_accuracy_score(nb.predict(X_train), y_train))
print ('Balanced accuracy score (test)is ', balanced_accuracy_score(nb.predict(X_test), y_test))

# print ('Accuracy of model on training set is ', nb.score(X_train, y_train))
# print ('Accuracy of model on test set is ', nb.score(X_test, y_test))


Balanced accuracy score (training)is  0.6169049959546393
Balanced accuracy score (test)is  0.5805830459898795


  y = column_or_1d(y, warn=True)


This model has large difference between training and test because the classes are severely unbalanced. However, we will not optimize it further at this point. Instead we will continue to use it for predicting words indicative of each rating


In [20]:
# Train model with entire dataset
nb = MultinomialNB(alpha = 100)
nb.fit(X_transcript, y_MaxRating)
y_pred = nb.predict(X_test)

print ('Balanced accuracy score (test)is ', balanced_accuracy_score(nb.predict(X_transcript), y_MaxRating))

Balanced accuracy score (test)is  0.6118445746434003


  y = column_or_1d(y, warn=True)


## Training the model with TF-IDF vectorizer instaed of CountVectorizer

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer(stop_words = stopwords.words('english'), min_df = 2, max_df = 0.98, use_idf = True)
tfidf_vec.fit(df_clean.clean_transcript)
X_transcript_tfidf = tfidf_vec.transform(df_clean.clean_transcript)

In [22]:
# Train Naive Bayes on this now
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_transcript_tfidf, (y_MaxRating), test_size = 0.2)
nb2 = MultinomialNB(alpha=100)
nb2.fit(X_train2, y_train2)
y_pred2 = nb.predict(X_test2)

# Calculate accuracy score using balanced accuracy scores
print ('Balanced accuracy score (training)is ', balanced_accuracy_score(nb2.predict(X_train2), y_train2))
print ('Balanced accuracy score (test)is ', balanced_accuracy_score(nb.predict(X_test2), y_test2))


Balanced accuracy score (training)is  0.6914316702819957
Balanced accuracy score (test)is  0.6406349206349207


  y = column_or_1d(y, warn=True)


We see that we can get much better results on Naive Bayes using TF-IDF vectorization instaed of count vectorization

### Find words indicative of each rating

In [23]:
words = np.array(vectorizer.get_feature_names())

In [24]:
words.shape

(32665,)

In [26]:
x = np.eye(X_transcript.shape[1])


In [34]:
class_dict

{'Fascinating': 1, 'BadTalk': 2, 'Beautiful': 3, 'Informative': 4, 'Funny': 5}

In [35]:
nb.classes_

array([1, 2, 3, 4, 5], dtype=int64)

Now, for each rating, we can find the words with highest probability and lowest probability for a particular rating. 
So, we will look at best and worst predictors of three ratings : Inspiring, Funny and Informative

In [34]:
# For 'Fascinating'
probs_fascinating = nb.predict_proba(x)[:, 0]
ind = np.argsort(probs_fascinating)

good_words = words[ind[-15:]]
bad_words = words[ind[:15]]

good_prob = probs_fascinating[ind[-15:]]
bad_prob = probs_fascinating[ind[:15]]

print("Good words\t     P(Fascinating | word)")
for w, p in zip(good_words, good_prob):
    print("{:>20}".format(w), "{:.2f}".format(p))
    
print("Bad words\t     P(Fascinating | word)")
for w, p in zip(bad_words, bad_prob):
    print("{:>20}".format(w), "{:.2f}".format(p))

Good words	     P(Fascinating | word)
              device 0.61
               space 0.62
             objects 0.62
                cell 0.62
               brain 0.62
           structure 0.63
              object 0.63
                 fly 0.63
            computer 0.64
                 dna 0.64
               light 0.65
              robots 0.65
            universe 0.68
               cells 0.68
               robot 0.74
Bad words	     P(Fascinating | word)
               women 0.18
         governments 0.23
              rights 0.23
              sector 0.24
            refugees 0.25
           democracy 0.25
               girls 0.25
           societies 0.25
                 men 0.26
           political 0.26
              global 0.26
          inequality 0.26
               civil 0.26
             country 0.26
        institutions 0.26


In [33]:
# Write a loop to get best and worset predictive words for all classes
probs_dict = {}
for rating in class_dict.keys():
    probs_dict[rating+'_probs'] = nb.predict_proba(x)[:, class_dict[rating]-1]
    ind = np.argsort(probs_dict[rating+'_probs'])
    # Get good and bad words
    good_words = words[ind[-5:]]
    bad_words = words[ind[:5]]
    # Get corresponding probabilities
    good_prob = probs_dict[rating+'_probs'][ind[-5:]]
    bad_prob = probs_dict[rating+'_probs'][ind[:5]]
    
    # Print Best and Worst predictive words
    print("Good words\t     P(%s | word)" %rating)
    for w, p in zip(good_words, good_prob):
        print("{:>20}".format(w), "{:.2f}".format(p))
    
    print("Bad words\t     P(%s | word)" %rating)
    for w, p in zip(bad_words, bad_prob):
        print("{:>20}".format(w), "{:.2f}".format(p))


Good words	     P(Fascinating | word)
               light 0.65
              robots 0.65
            universe 0.68
               cells 0.68
               robot 0.74
Bad words	     P(Fascinating | word)
               women 0.18
         governments 0.23
              rights 0.23
              sector 0.24
            refugees 0.25
Good words	     P(BadTalk | word)
                 god 0.10
              tapirs 0.10
          concussion 0.11
           glamorous 0.11
             glamour 0.12
Bad words	     P(BadTalk | word)
                data 0.02
               brain 0.02
                 two 0.02
               cells 0.02
                 see 0.02
Good words	     P(Beautiful | word)
                song 0.21
                girl 0.21
              poetry 0.22
          compassion 0.22
                 hum 0.22
Bad words	     P(Beautiful | word)
                data 0.02
         information 0.03
             percent 0.03
               brain 0.03
          technology 0.03
Good wo

## Conclusion

As can be seen, we can reasonably predict the class of the popular rating based on the words appearing in the transcript. 
In the next notebook on modelling, we will attempt to develop machine learning models to predict the most popular rating for each talk based on the metadata and the transcripts