In [8]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from textblob import TextBlob

In [11]:
# Read Elon_musk.csv into a DataFrame, decoding the file as Latin-1.
data = pd.read_csv(
    "Elon_musk.csv",
    encoding="latin-1",
)


### 2. Number of Words


In [12]:
# Preview the first five rows (head() defaults to n=5).
data.head()


Unnamed: 0.1,Unnamed: 0,Text
0,1,@kunalb11 Im an alien
1,2,@ID_AA_Carmack Ray tracing on Cyberpunk with H...
2,3,@joerogan @Spotify Great interview!
3,4,@gtera27 Doge is underestimated
4,5,@teslacn Congratulations Tesla China for amazi...


In [13]:
# Number of rows in the frame.
data.shape[0]

1999

In [14]:
# Words per row of 'Text'. split() with no argument splits on any run of
# whitespace, so repeated, leading or trailing spaces (and tabs/newlines) do not
# produce empty "words" the way split(" ") does. str() keeps non-string cells
# from raising.
data['word_count'] = data['Text'].apply(lambda text: len(str(text).split()))
data[['Text','word_count']].head()

Unnamed: 0,Text,word_count
0,@kunalb11 Im an alien,4
1,@ID_AA_Carmack Ray tracing on Cyberpunk with H...,13
2,@joerogan @Spotify Great interview!,4
3,@gtera27 Doge is underestimated,4
4,@teslacn Congratulations Tesla China for amazi...,17



### 3. Number of Characters


In [16]:
# Length of each 'Text' value in characters; whitespace is counted as well.
data['char_count'] = data.Text.str.len()
data.loc[:, ['Text', 'char_count']].head()

Unnamed: 0,Text,char_count
0,@kunalb11 Im an alien,22
1,@ID_AA_Carmack Ray tracing on Cyberpunk with H...,82
2,@joerogan @Spotify Great interview!,35
3,@gtera27 Doge is underestimated,31
4,@teslacn Congratulations Tesla China for amazi...,104


In [17]:
# Display the frame as it currently stands.
data

Unnamed: 0.1,Unnamed: 0,Text,word_count,char_count
0,1,@kunalb11 Im an alien,4,22
1,2,@ID_AA_Carmack Ray tracing on Cyberpunk with H...,13,82
2,3,@joerogan @Spotify Great interview!,4,35
3,4,@gtera27 Doge is underestimated,4,31
4,5,@teslacn Congratulations Tesla China for amazi...,17,104
...,...,...,...,...
1994,1995,"@flcnhvy True, it sounds so surreal, but the n...",23,144
1995,1996,@PPathole Make sure to read ur terms &amp; con...,12,77
1996,1997,@TeslaGong @PPathole Samwise Gamgee,4,35
1997,1998,@PPathole Altho Dumb and Dumber is <U+0001F525...,7,59



### 4. Average Word Length


In [18]:
def avg_word(sentence):
    """Return the mean length, in characters, of the words in ``sentence``.

    Words are the pieces obtained by splitting on runs of whitespace.

    Args:
        sentence: Text to measure.

    Returns:
        The mean word length as a float, or 0.0 when ``sentence`` contains no
        words (empty or whitespace-only text).
    """
    words = sentence.split()
    if not words:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    return sum(len(word) for word in words) / len(words)

# Mean word length for every row of 'Text'; the helper is passed directly as
# the callback.
data['avg_word'] = data['Text'].apply(avg_word)
data.loc[:, ['Text', 'avg_word']].head()

Unnamed: 0,Text,avg_word
0,@kunalb11 Im an alien,4.75
1,@ID_AA_Carmack Ray tracing on Cyberpunk with H...,5.384615
2,@joerogan @Spotify Great interview!,8.0
3,@gtera27 Doge is underestimated,7.0
4,@teslacn Congratulations Tesla China for amazi...,5.176471



### 5. Number of stopwords


In [19]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

stop = stopwords.words('english')
# Set copy for constant-time membership tests; `stop` itself stays a list so
# any other cell that uses it is unaffected.
stop_lookup = set(stop)

# Count the whitespace-separated tokens that exactly match a stop word.
# Matching is case-sensitive: the text is used as-is here.
data['stopwords'] = data['Text'].apply(
    lambda text: sum(token in stop_lookup for token in text.split())
)
data[['Text','stopwords']].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Text,stopwords
0,@kunalb11 Im an alien,1
1,@ID_AA_Carmack Ray tracing on Cyberpunk with H...,4
2,@joerogan @Spotify Great interview!,0
3,@gtera27 Doge is underestimated,1
4,@teslacn Congratulations Tesla China for amazi...,5



### 6. Number of Special Characters (@-prefixed tokens)


In [20]:
# Per-row count of tokens that begin with '@'. The column keeps the name
# 'hastags' even though what is counted are '@'-prefixed tokens.
data['hastags'] = data['Text'].apply(
    lambda text: sum(1 for token in text.split() if token.startswith('@'))
)
data.loc[:, ['Text', 'hastags']].head()

Unnamed: 0,Text,hastags
0,@kunalb11 Im an alien,1
1,@ID_AA_Carmack Ray tracing on Cyberpunk with H...,1
2,@joerogan @Spotify Great interview!,2
3,@gtera27 Doge is underestimated,1
4,@teslacn Congratulations Tesla China for amazi...,1


### 7. Number of Numerics


In [22]:

# Per-row count of tokens made up entirely of digits.
data['numerics'] = data['Text'].apply(
    lambda text: sum(1 for token in text.split() if token.isdigit())
)
data.loc[:, ['Text', 'numerics']].head()

Unnamed: 0,Text,numerics
0,@kunalb11 Im an alien,0
1,@ID_AA_Carmack Ray tracing on Cyberpunk with H...,0
2,@joerogan @Spotify Great interview!,0
3,@gtera27 Doge is underestimated,0
4,@teslacn Congratulations Tesla China for amazi...,0



### 8. Number of Upper Case Words


In [23]:
# Per-row count of tokens for which str.isupper() is true.
data['upper'] = data['Text'].apply(
    lambda text: sum(1 for token in text.split() if token.isupper())
)
data.loc[:, ['Text', 'upper']].head()

Unnamed: 0,Text,upper
0,@kunalb11 Im an alien,0
1,@ID_AA_Carmack Ray tracing on Cyberpunk with H...,1
2,@joerogan @Spotify Great interview!,0
3,@gtera27 Doge is underestimated,0
4,@teslacn Congratulations Tesla China for amazi...,0


### **Pre-processing**

### 2.1 Lower Case

In [24]:
# Lower-case every token and rejoin with single spaces, so runs of whitespace
# collapse to one space. This overwrites the 'Text' column.
data['Text'] = data['Text'].apply(
    lambda text: " ".join([token.lower() for token in text.split()])
)
data['Text'].head()

0                               @kunalb11 im an alien
1    @id_aa_carmack ray tracing on cyberpunk with h...
2                  @joerogan @spotify great interview!
3                      @gtera27 doge is underestimated
4    @teslacn congratulations tesla china for amazi...
Name: Text, dtype: object