In [191]:
import pandas as pd

In [192]:
# Load the podcast transcript from CSV and preview its first rows.
df = pd.read_csv("data/jre_elon_musk.csv")
df.head()

Unnamed: 0,Timestamp,Speaker,Text
0,[00:00:00],Joe Rogan,"Ah, ha, ha, ha. Four, three, two, one, boom. T..."
1,[00:00:09],Elon Musk,You're welcome.
2,[00:00:10],Joe Rogan,It's very good to meet you.
3,[00:00:11],Elon Musk,Nice to meet you too.
4,[00:00:12],Joe Rogan,And thanks for not lighting this place on fire.


In [193]:
# Show the first ten utterances, numbered from 1
first_ten = df["Text"].head(10).to_numpy()
for position, utterance in enumerate(first_ten, start=1):
    print(f'{position} - {utterance}')

1 - Ah, ha, ha, ha. Four, three, two, one, boom. Thank you. Thanks for doing this, man. Really appreciate it.
2 - You're welcome.
3 - It's very good to meet you.
4 - Nice to meet you too.
5 - And thanks for not lighting this place on fire.
6 - You're welcome. That's coming later.
7 - How does one, just in the middle of doing all the things you do, create cars, rockets, all the stuff you're doing,constantly innovating, decide to just make a flamethrower? Where do you have the time for that?
8 - Well, the flame, we didn't put a lot of time into the flamethrower. This was an off-the-cuff thing. It's sort of a hobbycompany called the Boring Company, which started out as a joke, and we decided to make a real, and dig a tunnelunder LA. And then, other people asked us to dig tunnels. And so, we said yes in a few cases.
9 - Now, who-
10 - And then, we have a merchandise section that only has one piece of merchandise at a time. And we started off witha cap. And there was only one thing on, whic

In [194]:
# Create a function to convert timestamp into seconds

def convert_timestamp_into_seconds(timestamp):
    """Convert a bracketed clock string such as ``[00:01:30]`` into seconds.

    Every ``[`` and ``]`` character is removed, the remainder is parsed with
    ``pd.Timedelta``, and the total number of seconds is returned as a float.
    """
    without_open = timestamp.replace('[', '')
    clock_text = without_open.replace(']', '')
    elapsed = pd.Timedelta(clock_text)
    return elapsed.total_seconds()

In [195]:
# Replace the bracketed timestamp strings with elapsed seconds.
# NOTE: this overwrites the column, so the cell cannot be run twice: the
# converted values are floats, which have no .replace() method.

df["Timestamp"] = df["Timestamp"].map(convert_timestamp_into_seconds)

In [208]:
# NOTE(review): this cell's execution count (208) is out of sequence; its output
# already shows the Interval, TextNoPunct and TextNoStopWords columns, which are
# only created by cells further down.
df

Unnamed: 0,Timestamp,Speaker,Text,Interval,TextNoPunct,TextNoStopWords
0,0.0,Joe Rogan,"Ah, ha, ha, ha. Four, three, two, one, boom. T...",9.0,Ah ha ha ha Four three two one boom Thank you ...,Ah ha ha ha boom Thank Thanks man appreciate
1,9.0,Elon Musk,You're welcome.,1.0,You 're welcome,welcome
2,10.0,Joe Rogan,It's very good to meet you.,1.0,It 's very good to meet you,good meet
3,11.0,Elon Musk,Nice to meet you too.,1.0,Nice to meet you too,Nice meet
4,12.0,Joe Rogan,And thanks for not lighting this place on fire.,1.0,And thanks for not lighting this place on fire,thanks lighting place fire
...,...,...,...,...,...,...
1826,9401.0,Joe Rogan,"I believe it's true too. So, thank you.",2.0,I believe it 's true too So thank you,believe true thank
1827,9403.0,Elon Musk,You're welcome.,1.0,You 're welcome,welcome
1828,9404.0,Joe Rogan,"All you assholes out there, be nice. Be nice, ...",6.0,All you assholes out there be nice Be nice bit...,assholes nice nice bitch right Thank everybody...
1829,9410.0,Elon Musk,"All right, thank you.",1.0,All right thank you,right thank


#### **getting duration of each speech**

In [196]:
# Duration of each utterance: the next row's start time minus this row's start time
df["Interval"] = df["Timestamp"].shift(periods=-1).sub(df["Timestamp"])

#### Some utterances have a duration of 0; we set them to 1, since that is considered the minimum duration

In [197]:
# Zero-length utterances are raised to the 1-second minimum
df["Interval"] = df["Interval"].mask(df["Interval"].eq(0), 1)

#### The shift leaves a NaN interval in the last row (it has no following timestamp); fill it with 1

In [198]:
# Give the row(s) left as NaN by the shift the 1-second minimum duration.
# The result is assigned back rather than calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated and leaves `df`
# unchanged when pandas Copy-on-Write is enabled.
df["Interval"] = df["Interval"].fillna(1)

In [199]:
# Inspect the frame now that the Interval column has been added
df

Unnamed: 0,Timestamp,Speaker,Text,Interval
0,0.0,Joe Rogan,"Ah, ha, ha, ha. Four, three, two, one, boom. T...",9.0
1,9.0,Elon Musk,You're welcome.,1.0
2,10.0,Joe Rogan,It's very good to meet you.,1.0
3,11.0,Elon Musk,Nice to meet you too.,1.0
4,12.0,Joe Rogan,And thanks for not lighting this place on fire.,1.0
...,...,...,...,...
1826,9401.0,Joe Rogan,"I believe it's true too. So, thank you.",2.0
1827,9403.0,Elon Musk,You're welcome.,1.0
1828,9404.0,Joe Rogan,"All you assholes out there, be nice. Be nice, ...",6.0
1829,9410.0,Elon Musk,"All right, thank you.",1.0


In [200]:
# List the distinct speakers present in the transcript
df["Speaker"].unique()

array(['Joe Rogan', 'Elon Musk', 'Jaime'], dtype=object)

#### speech duration of unique speakers

In [201]:
# Total seconds spoken by each speaker (Joe Rogan, Elon Musk, Jaime)
df.pivot_table(values='Interval', index='Speaker', aggfunc='sum')

Unnamed: 0_level_0,Interval
Speaker,Unnamed: 1_level_1
Elon Musk,5075.0
Jaime,45.0
Joe Rogan,4637.0


In [202]:
# Mean length, in seconds, of one uninterrupted utterance per speaker
df.pivot_table(values='Interval', index='Speaker', aggfunc='mean')


Unnamed: 0_level_0,Interval
Speaker,Unnamed: 1_level_1
Elon Musk,5.583058
Jaime,2.647059
Joe Rogan,5.123757


Who speaks faster?

Tokenize the text, preprocess the tokens so that you have only words (excluding punctuation), and compute the velocity of each speaker as: number of words in the interval / length of the interval. Store the result in a column named Velocity and compute the average for each speaker.

In [203]:
# Preprocess the data
import spacy
nlp = spacy.load("en_core_web_sm")
# Create a function to remove punctuation from text

def remove_punctuation(text):
    """Return `text` with the tokens spaCy tags as punctuation removed.

    The text is run through the module-level `nlp` pipeline; every token whose
    `pos_` is not 'PUNCT' is kept, and the kept tokens are joined with single
    spaces.
    """
    kept = [token.text for token in nlp(text) if token.pos_ != 'PUNCT']
    return ' '.join(kept)


# Create a function to count the tokens of a text (pass punctuation-free text to count only words)
def count_tokens(text):
    """Return the number of tokens spaCy's tokenizer finds in `text`.

    Every token is counted, punctuation included, so pass text that has already
    had its punctuation removed to obtain a word count.
    """
    # A count only needs tokenization. make_doc runs the tokenizer alone and
    # skips the remaining pipeline components that nlp(text) would execute,
    # which makes applying this to every row far cheaper.
    # NOTE(review): assumes no pipeline component merges or splits tokens, so
    # the count equals len(nlp(text)).
    return len(nlp.make_doc(text))

# # Create a function to remove stop words from text
def remove_stopwords(text):
    """Return `text` with the tokens spaCy flags as stop words removed.

    The text is run through the module-level `nlp` pipeline; tokens whose
    `is_stop` flag is set are dropped, and the rest are joined with single
    spaces.
    """
    kept = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept.append(token.text)
    return ' '.join(kept)



### punctuation removal

In [204]:
# Punctuation-free version of every utterance
df["TextNoPunct"] = df["Text"].map(remove_punctuation)

In [205]:
# Inspect the frame now that the TextNoPunct column has been added
df

Unnamed: 0,Timestamp,Speaker,Text,Interval,TextNoPunct
0,0.0,Joe Rogan,"Ah, ha, ha, ha. Four, three, two, one, boom. T...",9.0,Ah ha ha ha Four three two one boom Thank you ...
1,9.0,Elon Musk,You're welcome.,1.0,You 're welcome
2,10.0,Joe Rogan,It's very good to meet you.,1.0,It 's very good to meet you
3,11.0,Elon Musk,Nice to meet you too.,1.0,Nice to meet you too
4,12.0,Joe Rogan,And thanks for not lighting this place on fire.,1.0,And thanks for not lighting this place on fire
...,...,...,...,...,...
1826,9401.0,Joe Rogan,"I believe it's true too. So, thank you.",2.0,I believe it 's true too So thank you
1827,9403.0,Elon Musk,You're welcome.,1.0,You 're welcome
1828,9404.0,Joe Rogan,"All you assholes out there, be nice. Be nice, ...",6.0,All you assholes out there be nice Be nice bit...
1829,9410.0,Elon Musk,"All right, thank you.",1.0,All right thank you


### clearing stop words

In [206]:
# Drop stop words from the punctuation-free text
df["TextNoStopWords"] = df["TextNoPunct"].map(remove_stopwords)

### counting tokens

In [207]:
# Number of tokens in each punctuation-free utterance
df["n_tokens"] = df["TextNoPunct"].map(count_tokens)

KeyboardInterrupt: 

In [None]:
# Number of tokens left once stop words are removed as well
df["noTokensWOStopWords"] = df["TextNoStopWords"].map(count_tokens)

In [None]:
# Inspect the frame now that the token-count columns have been added
df

Unnamed: 0,Timestamp,Speaker,Text,Interval,TextNoPunct,TextNoStopWords,n_tokens,noTokensWOStopWords
0,0.0,Joe Rogan,"Ah, ha, ha, ha. Four, three, two, one, boom. T...",9.0,Ah ha ha ha Four three two one boom Thank you ...,Ah ha ha ha boom Thank Thanks man appreciate,19,9
1,9.0,Elon Musk,You're welcome.,1.0,You 're welcome,welcome,3,1
2,10.0,Joe Rogan,It's very good to meet you.,1.0,It 's very good to meet you,good meet,7,2
3,11.0,Elon Musk,Nice to meet you too.,1.0,Nice to meet you too,Nice meet,5,2
4,12.0,Joe Rogan,And thanks for not lighting this place on fire.,1.0,And thanks for not lighting this place on fire,thanks lighting place fire,9,4
...,...,...,...,...,...,...,...,...
1826,9401.0,Joe Rogan,"I believe it's true too. So, thank you.",2.0,I believe it 's true too So thank you,believe true thank,9,3
1827,9403.0,Elon Musk,You're welcome.,1.0,You 're welcome,welcome,3,1
1828,9404.0,Joe Rogan,"All you assholes out there, be nice. Be nice, ...",6.0,All you assholes out there be nice Be nice bit...,assholes nice nice bitch right Thank everybody...,18,9
1829,9410.0,Elon Musk,"All right, thank you.",1.0,All right thank you,right thank,4,2


In [None]:
# Speaking velocity of each utterance: tokens spoken per second of its interval

df["Velocity"] = df["n_tokens"].div(df["Interval"])

In [None]:
# Inspect the frame now that the Velocity column has been added
df

Unnamed: 0,Timestamp,Speaker,Text,Interval,TextNoPunct,TextNoStopWords,n_tokens,noTokensWOStopWords,Velocity
0,0.0,Joe Rogan,"Ah, ha, ha, ha. Four, three, two, one, boom. T...",9.0,Ah ha ha ha Four three two one boom Thank you ...,Ah ha ha ha boom Thank Thanks man appreciate,19,9,2.111111
1,9.0,Elon Musk,You're welcome.,1.0,You 're welcome,welcome,3,1,3.000000
2,10.0,Joe Rogan,It's very good to meet you.,1.0,It 's very good to meet you,good meet,7,2,7.000000
3,11.0,Elon Musk,Nice to meet you too.,1.0,Nice to meet you too,Nice meet,5,2,5.000000
4,12.0,Joe Rogan,And thanks for not lighting this place on fire.,1.0,And thanks for not lighting this place on fire,thanks lighting place fire,9,4,9.000000
...,...,...,...,...,...,...,...,...,...
1826,9401.0,Joe Rogan,"I believe it's true too. So, thank you.",2.0,I believe it 's true too So thank you,believe true thank,9,3,4.500000
1827,9403.0,Elon Musk,You're welcome.,1.0,You 're welcome,welcome,3,1,3.000000
1828,9404.0,Joe Rogan,"All you assholes out there, be nice. Be nice, ...",6.0,All you assholes out there be nice Be nice bit...,assholes nice nice bitch right Thank everybody...,18,9,3.000000
1829,9410.0,Elon Musk,"All right, thank you.",1.0,All right thank you,right thank,4,2,4.000000


In [None]:
# Average speaking velocity per speaker.
# Select the Velocity column before aggregating: averaging the whole frame
# first would also try to average the string columns (Text, TextNoPunct, ...),
# which raises a TypeError in pandas 2.x and is wasted work in older versions.
df.groupby('Speaker')['Velocity'].mean()

Speaker
Elon Musk    2.887164
Jaime        3.630031
Joe Rogan    3.064403
Name: Velocity, dtype: float64

After all, Elon was high...

![image](https://bsmedia.business-standard.com/_media/bs/img/article/2018-09/09/full/1536463138-6668.jpg)

In [None]:
# 