# Aim:
Perform sentimental analysis on the Elon-musk tweets (Elon-musk.csv)

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize, TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob
import re

In [17]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
     -------------------------------------- 636.8/636.8 kB 1.5 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.17.1


In [2]:
df = pd.read_csv('Elon_musk.csv',encoding='latin-1')
df

Unnamed: 0.1,Unnamed: 0,Text
0,1,@kunalb11 Im an alien
1,2,@ID_AA_Carmack Ray tracing on Cyberpunk with H...
2,3,@joerogan @Spotify Great interview!
3,4,@gtera27 Doge is underestimated
4,5,@teslacn Congratulations Tesla China for amazi...
...,...,...
1994,1995,"@flcnhvy True, it sounds so surreal, but the n..."
1995,1996,@PPathole Make sure to read ur terms &amp; con...
1996,1997,@TeslaGong @PPathole Samwise Gamgee
1997,1998,@PPathole Altho Dumb and Dumber is <U+0001F525...


## EDA

In [20]:
df.drop(columns='Unnamed: 0', inplace=True)
df

Unnamed: 0,Text
0,@kunalb11 Im an alien
1,@ID_AA_Carmack Ray tracing on Cyberpunk with H...
2,@joerogan @Spotify Great interview!
3,@gtera27 Doge is underestimated
4,@teslacn Congratulations Tesla China for amazi...
...,...
1994,"@flcnhvy True, it sounds so surreal, but the n..."
1995,@PPathole Make sure to read ur terms &amp; con...
1996,@TeslaGong @PPathole Samwise Gamgee
1997,@PPathole Altho Dumb and Dumber is <U+0001F525...


## Text pre-processing

In [21]:
# text normalization
txt = df['Text'].apply(lambda x:' '.join(x.lower() for x in x.split()))
txt

0                                  @kunalb11 im an alien
1       @id_aa_carmack ray tracing on cyberpunk with h...
2                     @joerogan @spotify great interview!
3                         @gtera27 doge is underestimated
4       @teslacn congratulations tesla china for amazi...
                              ...                        
1994    @flcnhvy true, it sounds so surreal, but the n...
1995    @ppathole make sure to read ur terms &amp; con...
1996                  @teslagong @ppathole samwise gamgee
1997    @ppathole altho dumb and dumber is <u+0001f525...
1998                            progress update august 28
Name: Text, Length: 1999, dtype: object

In [22]:
# removal of account name/ id
tweettoken = TweetTokenizer(strip_handles=True)
txt = txt.apply(lambda x:' '.join(x for x in tweettoken.tokenize(x)))
txt

0                                          i  m an alien
1       ray tracing on cyberpunk with hdr is next-leve...
2                                       great interview !
3                                  doge is underestimated
4       congratulations tesla china for amazing execut...
                              ...                        
1994    true , it sounds so surreal , but the negative...
1995    make sure to read ur terms & conditions before...
1996                                       samwise gamgee
1997    altho dumb and dumber is <u+0001f525> <u+0001f...
1998                            progress update august 28
Name: Text, Length: 1999, dtype: object

In [23]:
# removal of punctuations
txt = txt.str.replace('[^\w\s]','')
txt = txt.str.replace('u0001f525','')
txt

  txt = txt.str.replace('[^\w\s]','')


0                                           i  m an alien
1       ray tracing on cyberpunk with hdr is nextlevel...
2                                        great interview 
3                                  doge is underestimated
4       congratulations tesla china for amazing execut...
                              ...                        
1994    true  it sounds so surreal  but the negative p...
1995    make sure to read ur terms  conditions before ...
1996                                       samwise gamgee
1997                           altho dumb and dumber is  
1998                            progress update august 28
Name: Text, Length: 1999, dtype: object

In [44]:
# removal of stopwords
stop = stopwords.words('english')
txt = txt.apply(lambda x: ' '.join(x for x in x.split() if x not in stop))
txt

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\hp/nltk_data'
    - 'C:\\Users\\hp\\New folder\\nltk_data'
    - 'C:\\Users\\hp\\New folder\\share\\nltk_data'
    - 'C:\\Users\\hp\\New folder\\lib\\nltk_data'
    - 'C:\\Users\\hp\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [26]:
# removal of high frequency words
freq = pd.Series(' '.join(txt).split()).value_counts()[:10]
highfreq = list(freq.index)
print(freq)

the    487
to     400
is     356
a      356
of     315
in     219
for    194
it     194
we     172
s      167
dtype: int64


In [27]:
txt = txt.apply(lambda x: " ".join(x for x in x.split() if x not in highfreq))
txt

0                                            i m an alien
1       ray tracing on cyberpunk with hdr nextlevel ha...
2                                         great interview
3                                     doge underestimated
4       congratulations tesla china amazing execution ...
                              ...                        
1994    true sounds so surreal but negative propaganda...
1995    make sure read ur terms conditions before clic...
1996                                       samwise gamgee
1997                                altho dumb and dumber
1998                            progress update august 28
Name: Text, Length: 1999, dtype: object

In [28]:
# removal of low frequency words
freq = pd.Series(' '.join(txt).split()).value_counts()[-10:]
lowfreq = list(freq.index)
print(freq)

fever                 1
nyquil                1
elon                  1
musk                  1
negati                1
httpstco6ohta09s5l    1
carousel              1
typical               1
unusual               1
altho                 1
dtype: int64


In [29]:
txt = txt.apply(lambda x: " ".join(x for x in x.split() if x not in lowfreq))
txt

0                                            i m an alien
1       ray tracing on cyberpunk with hdr nextlevel ha...
2                                         great interview
3                                     doge underestimated
4       congratulations tesla china amazing execution ...
                              ...                        
1994    true sounds so surreal but negative propaganda...
1995    make sure read ur terms conditions before clic...
1996                                       samwise gamgee
1997                                      dumb and dumber
1998                            progress update august 28
Name: Text, Length: 1999, dtype: object

In [30]:
st = PorterStemmer()
txt.apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0                                            i m an alien
1       ray trace on cyberpunk with hdr nextlevel have...
2                                         great interview
3                                         doge underestim
4       congratul tesla china amaz execut last year no...
                              ...                        
1994    true sound so surreal but neg propaganda still...
1995     make sure read ur term condit befor click accept
1996                                         samwis gamge
1997                                      dumb and dumber
1998                             progress updat august 28
Name: Text, Length: 1999, dtype: object

In [33]:
lemma = WordNetLemmatizer()
txt = txt.apply(lambda x: " ".join([lemma.lemmatize(word) for word in x.split()]))
txt

LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - 'C:\\Users\\hp/nltk_data'
    - 'C:\\Users\\hp\\New folder\\nltk_data'
    - 'C:\\Users\\hp\\New folder\\share\\nltk_data'
    - 'C:\\Users\\hp\\New folder\\lib\\nltk_data'
    - 'C:\\Users\\hp\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


## Feature Extaction

### 1. Using CountVectorizer

In [34]:
cv = CountVectorizer()

x = cv.fit_transform(txt)
x.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
print(cv.get_feature_names())

['000', '005', '10', '100', '1000', '10000', '100m', '107', '10pm', '10x', '11', '12', '122m', '13', '130', '14', '140', '149', '15', '150', '150m', '155mph', '15km', '16', '162', '17', '18', '1880s', '1b', '1m', '1st', '20', '2007', '2008', '2009', '200mph', '2012', '2017', '2020', '2021', '2024', '2026', '2030', '20k', '20km', '20m', '210', '218', '21st', '22', '225', '23', '230', '24', '240', '25', '250', '250kw', '25gunsteslacom', '25k', '25th', '28', '28th', '2k', '2m', '2nd', '2v', '30', '300', '300km', '301', '304', '304l', '30km', '30m', '31', '33', '330', '360', '37', '3800', '394', '39a', '3d', '3pm', '3rd', '40', '400', '42', '420', '4227', '430', '43rd', '445', '448', '4530', '4680', '479000', '48', '4d', '4th', '50', '50000', '50k', '60', '60000', '65', '69420', '6f', '6k', '6pm', '727', '74', '78', '7th', '90', '9007mm', '922', '948', '95', '99', 'aber', 'able', 'abo', 'aboard', 'abort', 'absence', 'absolute', 'absolutely', 'absorb', 'absorption', 'absurd', 'absurdly', 'a



In [35]:
print(cv.vocabulary_)

{'an': 254, 'alien': 220, 'ray': 3397, 'tracing': 4184, 'on': 3026, 'cyberpunk': 891, 'with': 4585, 'hdr': 1594, 'nextlevel': 2950, 'have': 1588, 'you': 4632, 'tried': 4205, 'great': 1526, 'interview': 2473, 'doge': 1035, 'underestimated': 4362, 'congratulations': 785, 'tesla': 4070, 'china': 680, 'amazing': 246, 'execution': 1226, 'last': 2571, 'year': 4622, 'now': 2981, 'next': 2949, 'even': 1199, 'more': 2878, 'happy': 1571, 'new': 2947, 'ox': 3090, 'httpstco9wfkmyu2oj': 1789, 'frodo': 1409, 'was': 4500, 'underdoge': 4361, 'all': 223, 'thought': 4116, 'he': 1595, 'would': 4606, 'fail': 1264, 'himself': 1645, 'most': 2882, 'httpstcozgxjfdzzrm': 2322, 'haha': 1554, 'thanks': 4085, 'indeed': 2408, 'tweets': 4244, 'definitely': 934, 'do': 1027, 'not': 2973, 'represent': 3474, 'realworld': 3416, 'time': 4134, 'allocation': 225, 'entertaining': 1168, 'outcome': 3067, 'likely': 2638, 'just': 2522, 'sent': 3653, 'some': 3805, 'agree': 201, 'clubhouse': 713, 'httpstco3rwe9uhsts': 1728, 'gett

### 2. Using TF-IDF

In [36]:
tf = TfidfVectorizer()

y = tf.fit_transform(txt)
y.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [37]:
print(tf.get_feature_names())

['000', '005', '10', '100', '1000', '10000', '100m', '107', '10pm', '10x', '11', '12', '122m', '13', '130', '14', '140', '149', '15', '150', '150m', '155mph', '15km', '16', '162', '17', '18', '1880s', '1b', '1m', '1st', '20', '2007', '2008', '2009', '200mph', '2012', '2017', '2020', '2021', '2024', '2026', '2030', '20k', '20km', '20m', '210', '218', '21st', '22', '225', '23', '230', '24', '240', '25', '250', '250kw', '25gunsteslacom', '25k', '25th', '28', '28th', '29', '2k', '2m', '2nd', '2v', '30', '300', '300km', '301', '304', '304l', '30km', '30m', '31', '33', '330', '360', '37', '3800', '394', '39a', '3d', '3pm', '3rd', '40', '400', '42', '420', '4227', '430', '43rd', '445', '448', '4530', '4680', '479000', '48', '4d', '4th', '50', '50000', '50k', '60', '60000', '65', '69420', '6f', '6k', '6pm', '727', '74', '78', '7th', '90', '9007mm', '922', '948', '95', '99', 'aber', 'able', 'abo', 'aboard', 'abort', 'about', 'above', 'absence', 'absolute', 'absolutely', 'absorb', 'absorption', 



In [38]:
print(tf.vocabulary_)

{'an': 254, 'alien': 220, 'ray': 3397, 'tracing': 4184, 'on': 3026, 'cyberpunk': 891, 'with': 4585, 'hdr': 1594, 'nextlevel': 2950, 'have': 1588, 'you': 4632, 'tried': 4205, 'great': 1526, 'interview': 2473, 'doge': 1035, 'underestimated': 4362, 'congratulations': 785, 'tesla': 4070, 'china': 680, 'amazing': 246, 'execution': 1226, 'last': 2571, 'year': 4622, 'now': 2981, 'next': 2949, 'even': 1199, 'more': 2878, 'happy': 1571, 'new': 2947, 'ox': 3090, 'httpstco9wfkmyu2oj': 1789, 'frodo': 1409, 'was': 4500, 'underdoge': 4361, 'all': 223, 'thought': 4116, 'he': 1595, 'would': 4606, 'fail': 1264, 'himself': 1645, 'most': 2882, 'httpstcozgxjfdzzrm': 2322, 'haha': 1554, 'thanks': 4085, 'indeed': 2408, 'tweets': 4244, 'definitely': 934, 'do': 1027, 'not': 2973, 'represent': 3474, 'realworld': 3416, 'time': 4134, 'allocation': 225, 'entertaining': 1168, 'outcome': 3067, 'likely': 2638, 'just': 2522, 'sent': 3653, 'some': 3805, 'agree': 201, 'clubhouse': 713, 'httpstco3rwe9uhsts': 1728, 'gett

In [39]:
data = pd.DataFrame(y.toarray(), columns=tf.get_feature_names_out())
data

Unnamed: 0,000,005,10,100,1000,10000,100m,107,10pm,10x,...,yup,zealand,zenit,zero,zeroes,zeros,zip,zon,zone,ðogecoin
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3. N-gram

In [40]:
TextBlob(txt[1]).ngrams(2)


**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\hp/nltk_data'
    - 'C:\\Users\\hp\\New folder\\nltk_data'
    - 'C:\\Users\\hp\\New folder\\share\\nltk_data'
    - 'C:\\Users\\hp\\New folder\\lib\\nltk_data'
    - 'C:\\Users\\hp\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************



MissingCorpusError: 
Looks like you are missing some required data for this feature.

To download the necessary data, simply run

    python -m textblob.download_corpora

or use the NLTK downloader to download the missing data: http://nltk.org/data.html
If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.


### 4. Sentiment Analysis

In [41]:
txt.apply(lambda x: TextBlob(x).sentiment)

0                                  (-0.25, 0.75)
1                                     (0.0, 0.0)
2                                    (0.8, 0.75)
3                                     (0.0, 0.0)
4                    (0.275, 0.3666666666666667)
                          ...                   
1994    (0.1111111111111111, 0.5083333333333334)
1995                   (0.5, 0.8888888888888888)
1996                                  (0.0, 0.0)
1997                               (-0.375, 0.5)
1998                                  (0.0, 0.0)
Name: Text, Length: 1999, dtype: object

In [42]:
data = pd.DataFrame(txt, columns=['Text'])
data

Unnamed: 0,Text
0,i m an alien
1,ray tracing on cyberpunk with hdr nextlevel ha...
2,great interview
3,doge underestimated
4,congratulations tesla china amazing execution ...
...,...
1994,true sounds so surreal but negative propaganda...
1995,make sure read ur terms conditions before clic...
1996,samwise gamgee
1997,dumb and dumber


In [43]:
data['sentiment- Polarity'] = data['Text'].apply(lambda x: TextBlob(x).sentiment.polarity)
data['sentiment- Subjectivity'] = data['Text'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
data

Unnamed: 0,Text,sentiment- Polarity,sentiment- Subjectivity
0,i m an alien,-0.250000,0.750000
1,ray tracing on cyberpunk with hdr nextlevel ha...,0.000000,0.000000
2,great interview,0.800000,0.750000
3,doge underestimated,0.000000,0.000000
4,congratulations tesla china amazing execution ...,0.275000,0.366667
...,...,...,...
1994,true sounds so surreal but negative propaganda...,0.111111,0.508333
1995,make sure read ur terms conditions before clic...,0.500000,0.888889
1996,samwise gamgee,0.000000,0.000000
1997,dumb and dumber,-0.375000,0.500000
