## 1) Perform sentiment analysis on the Elon Musk tweets (Elon_musk.csv)

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize, TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob
import re

In [2]:
# Read the tweet export into a DataFrame; the file is decoded as latin-1.
csv_path = r"C:\Users\rohit\Downloads\Assignments\data science assignment\15-Text Mining\Elon_musk.csv"
df = pd.read_csv(csv_path, encoding='latin-1')
df

Unnamed: 0.1,Unnamed: 0,Text
0,1,@kunalb11 Im an alien
1,2,@ID_AA_Carmack Ray tracing on Cyberpunk with H...
2,3,@joerogan @Spotify Great interview!
3,4,@gtera27 Doge is underestimated
4,5,@teslacn Congratulations Tesla China for amazi...
...,...,...
1994,1995,"@flcnhvy True, it sounds so surreal, but the n..."
1995,1996,@PPathole Make sure to read ur terms &amp; con...
1996,1997,@TeslaGong @PPathole Samwise Gamgee
1997,1998,@PPathole Altho Dumb and Dumber is <U+0001F525...


## EDA

In [3]:
# Remove the 'Unnamed: 0' column so that only the tweet text remains.
# The frame is reassigned (rather than mutated with inplace=True) and
# errors='ignore' is passed so that re-running this cell after the column
# is already gone does not raise KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df

Unnamed: 0,Text
0,@kunalb11 Im an alien
1,@ID_AA_Carmack Ray tracing on Cyberpunk with H...
2,@joerogan @Spotify Great interview!
3,@gtera27 Doge is underestimated
4,@teslacn Congratulations Tesla China for amazi...
...,...
1994,"@flcnhvy True, it sounds so surreal, but the n..."
1995,@PPathole Make sure to read ur terms &amp; con...
1996,@TeslaGong @PPathole Samwise Gamgee
1997,@PPathole Altho Dumb and Dumber is <U+0001F525...


## Text pre-processing

In [4]:
# text normalization: lower-case every whitespace-separated token and
# re-join the tokens with single spaces
def _normalize(tweet):
    """Return `tweet` with each token lower-cased and tokens joined by one space."""
    return ' '.join(map(str.lower, tweet.split()))

txt = df['Text'].apply(_normalize)
txt

0                                  @kunalb11 im an alien
1       @id_aa_carmack ray tracing on cyberpunk with h...
2                     @joerogan @spotify great interview!
3                         @gtera27 doge is underestimated
4       @teslacn congratulations tesla china for amazi...
                              ...                        
1994    @flcnhvy true, it sounds so surreal, but the n...
1995    @ppathole make sure to read ur terms &amp; con...
1996                  @teslagong @ppathole samwise gamgee
1997    @ppathole altho dumb and dumber is <u+0001f525...
1998                            progress update august 28
Name: Text, Length: 1999, dtype: object

In [5]:
# removal of account names / ids: tokenize each tweet with a TweetTokenizer
# configured with strip_handles=True, then re-join the tokens with spaces
tweettoken = TweetTokenizer(strip_handles=True)
txt = txt.apply(lambda tweet: ' '.join(tweettoken.tokenize(tweet)))
txt

0                                          i  m an alien
1       ray tracing on cyberpunk with hdr is next-leve...
2                                       great interview !
3                                  doge is underestimated
4       congratulations tesla china for amazing execut...
                              ...                        
1994    true , it sounds so surreal , but the negative...
1995    make sure to read ur terms & conditions before...
1996                                       samwise gamgee
1997    altho dumb and dumber is <u+0001f525> <u+0001f...
1998                            progress update august 28
Name: Text, Length: 1999, dtype: object

In [6]:
# removal of punctuation
# The pattern is a regular expression, so regex=True is passed explicitly:
# relying on the default triggers a FutureWarning, and once the default
# becomes literal matching the pattern would silently stop matching anything.
# The raw string avoids the invalid '\w' / '\s' escape sequences.
txt = txt.str.replace(r'[^\w\s]', '', regex=True)
# Leftover text of one escaped emoji code; this is a plain literal match.
txt = txt.str.replace('u0001f525', '', regex=False)
txt

  txt = txt.str.replace('[^\w\s]','')


0                                           i  m an alien
1       ray tracing on cyberpunk with hdr is nextlevel...
2                                        great interview 
3                                  doge is underestimated
4       congratulations tesla china for amazing execut...
                              ...                        
1994    true  it sounds so surreal  but the negative p...
1995    make sure to read ur terms  conditions before ...
1996                                       samwise gamgee
1997                           altho dumb and dumber is  
1998                            progress update august 28
Name: Text, Length: 1999, dtype: object

In [7]:
# removal of stopwords
stop = stopwords.words('english')
# `stop` stays a list; membership is tested against a set built from it so
# each token lookup is a hash probe instead of a scan of the whole list.
stop_lookup = set(stop)
txt = txt.apply(
    lambda tweet: ' '.join(word for word in tweet.split() if word not in stop_lookup)
)
txt

0                                                   alien
1               ray tracing cyberpunk hdr nextlevel tried
2                                         great interview
3                                     doge underestimated
4       congratulations tesla china amazing execution ...
                              ...                        
1994    true sounds surreal negative propaganda still ...
1995    make sure read ur terms conditions clicking ac...
1996                                       samwise gamgee
1997                                    altho dumb dumber
1998                            progress update august 28
Name: Text, Length: 1999, dtype: object

In [8]:
# removal of high frequency words: count every token in the corpus and keep
# the ten most frequent ones as the removal list
all_words = ' '.join(txt).split()
freq = pd.Series(all_words).value_counts().head(10)
highfreq = freq.index.tolist()
print(freq)

rt           127
tesla         96
yes           86
u0001f923     77
great         76
haha          55
good          51
launch        49
sure          43
would         41
dtype: int64


In [9]:
# keep only the tokens that are not in the high-frequency list
txt = txt.apply(
    lambda tweet: " ".join([word for word in tweet.split() if word not in highfreq])
)
txt

0                                                   alien
1               ray tracing cyberpunk hdr nextlevel tried
2                                               interview
3                                     doge underestimated
4       congratulations china amazing execution last y...
                              ...                        
1994    true sounds surreal negative propaganda still ...
1995        make read ur terms conditions clicking accept
1996                                       samwise gamgee
1997                                    altho dumb dumber
1998                            progress update august 28
Name: Text, Length: 1999, dtype: object

In [10]:
# removal of low frequency words: recount the tokens and take the last ten
# entries of the frequency table as the removal list
remaining_words = ' '.join(txt).split()
freq = pd.Series(remaining_words).value_counts().tail(10)
lowfreq = freq.index.tolist()
print(freq)

bandwidth             1
noticeably            1
interventions         1
httpstcos5xouoedjt    1
steadily              1
29                    1
httpstcob0lygotobi    1
fewer                 1
httpstcou90sqjkn8u    1
altho                 1
dtype: int64


In [11]:
# keep only the tokens that are not in the low-frequency list
txt = txt.apply(
    lambda tweet: " ".join([word for word in tweet.split() if word not in lowfreq])
)
txt

0                                                   alien
1               ray tracing cyberpunk hdr nextlevel tried
2                                               interview
3                                     doge underestimated
4       congratulations china amazing execution last y...
                              ...                        
1994    true sounds surreal negative propaganda still ...
1995        make read ur terms conditions clicking accept
1996                                       samwise gamgee
1997                                          dumb dumber
1998                            progress update august 28
Name: Text, Length: 1999, dtype: object

In [12]:
# Porter stemming of every token; the stemmed Series is only displayed here,
# it is not assigned back to `txt`
st = PorterStemmer()
txt.apply(lambda tweet: " ".join(map(st.stem, tweet.split())))

0                                                   alien
1                   ray trace cyberpunk hdr nextlevel tri
2                                               interview
3                                         doge underestim
4         congratul china amaz execut last year next even
                              ...                        
1994    true sound surreal neg propaganda still easi f...
1995                make read ur term condit click accept
1996                                         samwis gamge
1997                                          dumb dumber
1998                             progress updat august 28
Name: Text, Length: 1999, dtype: object

In [13]:
# WordNet lemmatization of every token; the result replaces `txt`
lemma = WordNetLemmatizer()
txt = txt.apply(lambda tweet: " ".join(map(lemma.lemmatize, tweet.split())))
txt

0                                                   alien
1               ray tracing cyberpunk hdr nextlevel tried
2                                               interview
3                                     doge underestimated
4       congratulation china amazing execution last ye...
                              ...                        
1994    true sound surreal negative propaganda still e...
1995          make read ur term condition clicking accept
1996                                       samwise gamgee
1997                                          dumb dumber
1998                            progress update august 28
Name: Text, Length: 1999, dtype: object

## Feature Extraction

### 1. Using CountVectorizer

In [14]:
# Learn the vocabulary from the cleaned tweets and build the term-count
# matrix, then show it in dense form.
cv = CountVectorizer()
x = cv.fit_transform(raw_documents=txt)
x.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
# List the learned vocabulary terms in column order.
# get_feature_names_out() is used (as in the TF-IDF DataFrame cell) because
# get_feature_names() is no longer available in current scikit-learn;
# .tolist() keeps the printed output a plain Python list of strings.
print(cv.get_feature_names_out().tolist())

['000', '005', '10', '100', '1000', '10000', '100m', '107', '10pm', '10x', '11', '12', '122m', '13', '130', '14', '140', '149', '15', '150', '150m', '155mph', '15km', '16', '162', '17', '18', '1880s', '1b', '1m', '1st', '20', '2007', '2008', '2009', '200mph', '2012', '2017', '2020', '2021', '2024', '2026', '2030', '20k', '20km', '20m', '210', '218', '21st', '22', '225', '23', '230', '24', '240', '25', '250', '250kw', '25gunsteslacom', '25k', '25th', '28', '28th', '2k', '2m', '2nd', '2v', '30', '300', '300km', '301', '304', '304l', '30km', '30m', '31', '33', '330', '360', '37', '3800', '394', '39a', '3d', '3pm', '3rd', '40', '400', '42', '420', '4227', '430', '43rd', '445', '448', '4530', '4680', '479000', '48', '4d', '4th', '50', '50000', '50k', '60', '60000', '65', '69420', '6f', '6k', '6pm', '727', '74', '78', '7th', '90', '9007mm', '922', '948', '95', '99', 'aber', 'able', 'abo', 'aboard', 'abort', 'absence', 'absolute', 'absolutely', 'absorb', 'absorption', 'absurd', 'absurdly', 'a



In [16]:
# term -> column index mapping learned by the CountVectorizer
count_vocabulary = cv.vocabulary_
print(count_vocabulary)

{'alien': 212, 'ray': 3132, 'tracing': 3832, 'cyberpunk': 820, 'hdr': 1459, 'nextlevel': 2734, 'tried': 3853, 'interview': 2312, 'doge': 948, 'underestimated': 4003, 'congratulation': 718, 'china': 621, 'amazing': 235, 'execution': 1125, 'last': 2397, 'year': 4229, 'next': 2733, 'even': 1100, 'happy': 1441, 'new': 2731, 'ox': 2853, 'httpstco9wfkmyu2oj': 1640, 'frodo': 1297, 'underdoge': 4002, 'thought': 3774, 'fail': 1163, 'httpstcozgxjfdzzrm': 2170, 'thanks': 3756, 'indeed': 2251, 'tweet': 3885, 'definitely': 859, 'represent': 3206, 'realworld': 3149, 'time': 3790, 'allocation': 215, 'entertaining': 1070, 'outcome': 2833, 'likely': 2457, 'sent': 3365, 'agree': 195, 'clubhouse': 653, 'httpstco3rwe9uhsts': 1578, 'getting': 1358, 'real': 3144, 'bought': 483, 'dogecoin': 950, 'lil': 2458, 'toddler': 3800, 'hodler': 1508, 'issue': 2331, 'sentencing': 3366, 'seems': 3351, 'bit': 443, 'high': 1496, 'fixing': 1237, 'please': 2953, 'unlock': 4029, 'account': 150, 'true': 3861, 'power': 2988, '

### 2. Using TF-IDF

In [17]:
# Learn the vocabulary from the cleaned tweets and build the TF-IDF matrix,
# then show it in dense form.
tf = TfidfVectorizer()
y = tf.fit_transform(raw_documents=txt)
y.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
# List the learned vocabulary terms in column order.
# get_feature_names_out() is used (as in the TF-IDF DataFrame cell) because
# get_feature_names() is no longer available in current scikit-learn;
# .tolist() keeps the printed output a plain Python list of strings.
print(tf.get_feature_names_out().tolist())

['000', '005', '10', '100', '1000', '10000', '100m', '107', '10pm', '10x', '11', '12', '122m', '13', '130', '14', '140', '149', '15', '150', '150m', '155mph', '15km', '16', '162', '17', '18', '1880s', '1b', '1m', '1st', '20', '2007', '2008', '2009', '200mph', '2012', '2017', '2020', '2021', '2024', '2026', '2030', '20k', '20km', '20m', '210', '218', '21st', '22', '225', '23', '230', '24', '240', '25', '250', '250kw', '25gunsteslacom', '25k', '25th', '28', '28th', '2k', '2m', '2nd', '2v', '30', '300', '300km', '301', '304', '304l', '30km', '30m', '31', '33', '330', '360', '37', '3800', '394', '39a', '3d', '3pm', '3rd', '40', '400', '42', '420', '4227', '430', '43rd', '445', '448', '4530', '4680', '479000', '48', '4d', '4th', '50', '50000', '50k', '60', '60000', '65', '69420', '6f', '6k', '6pm', '727', '74', '78', '7th', '90', '9007mm', '922', '948', '95', '99', 'aber', 'able', 'abo', 'aboard', 'abort', 'absence', 'absolute', 'absolutely', 'absorb', 'absorption', 'absurd', 'absurdly', 'a

In [24]:
# term -> column index mapping learned by the TfidfVectorizer
tfidf_vocabulary = tf.vocabulary_
print(tfidf_vocabulary)

{'alien': 212, 'ray': 3132, 'tracing': 3832, 'cyberpunk': 820, 'hdr': 1459, 'nextlevel': 2734, 'tried': 3853, 'interview': 2312, 'doge': 948, 'underestimated': 4003, 'congratulation': 718, 'china': 621, 'amazing': 235, 'execution': 1125, 'last': 2397, 'year': 4229, 'next': 2733, 'even': 1100, 'happy': 1441, 'new': 2731, 'ox': 2853, 'httpstco9wfkmyu2oj': 1640, 'frodo': 1297, 'underdoge': 4002, 'thought': 3774, 'fail': 1163, 'httpstcozgxjfdzzrm': 2170, 'thanks': 3756, 'indeed': 2251, 'tweet': 3885, 'definitely': 859, 'represent': 3206, 'realworld': 3149, 'time': 3790, 'allocation': 215, 'entertaining': 1070, 'outcome': 2833, 'likely': 2457, 'sent': 3365, 'agree': 195, 'clubhouse': 653, 'httpstco3rwe9uhsts': 1578, 'getting': 1358, 'real': 3144, 'bought': 483, 'dogecoin': 950, 'lil': 2458, 'toddler': 3800, 'hodler': 1508, 'issue': 2331, 'sentencing': 3366, 'seems': 3351, 'bit': 443, 'high': 1496, 'fixing': 1237, 'please': 2953, 'unlock': 4029, 'account': 150, 'true': 3861, 'power': 2988, '

In [19]:
# Dense TF-IDF matrix as a DataFrame, one column per vocabulary term
tfidf_columns = tf.get_feature_names_out()
data = pd.DataFrame(data=y.toarray(), columns=tfidf_columns)
data

Unnamed: 0,000,005,10,100,1000,10000,100m,107,10pm,10x,...,york,young,yup,zealand,zenit,zero,zip,zon,zone,ðogecoin
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3. N-gram

In [25]:
# bigrams (n=2) of the cleaned tweet stored under index label 1
TextBlob(txt.loc[1]).ngrams(n=2)

[WordList(['ray', 'tracing']),
 WordList(['tracing', 'cyberpunk']),
 WordList(['cyberpunk', 'hdr']),
 WordList(['hdr', 'nextlevel']),
 WordList(['nextlevel', 'tried'])]

### 4. Sentiment Analysis

In [21]:
# TextBlob sentiment (polarity, subjectivity) for every cleaned tweet
txt.apply(lambda tweet: TextBlob(tweet).sentiment)

0                                    (-0.25, 0.75)
1                                       (0.0, 0.0)
2                                       (0.0, 0.0)
3                                       (0.0, 0.0)
4       (0.20000000000000004, 0.32222222222222224)
                           ...                    
1994     (0.15238095238095237, 0.4928571428571429)
1995                                    (0.0, 0.0)
1996                                    (0.0, 0.0)
1997                                 (-0.375, 0.5)
1998                                    (0.0, 0.0)
Name: Text, Length: 1999, dtype: object

In [22]:
# Wrap the cleaned tweets in a single-column DataFrame that will receive
# the sentiment scores.
column_names = ['Text']
data = pd.DataFrame(data=txt, columns=column_names)
data

Unnamed: 0,Text
0,alien
1,ray tracing cyberpunk hdr nextlevel tried
2,interview
3,doge underestimated
4,congratulation china amazing execution last ye...
...,...
1994,true sound surreal negative propaganda still e...
1995,make read ur term condition clicking accept
1996,samwise gamgee
1997,dumb dumber


In [23]:
# Score every tweet once with TextBlob, then split the resulting sentiment
# into its polarity and subjectivity components. Scoring once avoids running
# TextBlob over each tweet twice.
sentiment_scores = data['Text'].apply(lambda tweet: TextBlob(tweet).sentiment)
data['sentiment- Polarity'] = sentiment_scores.apply(lambda score: score.polarity)
data['sentiment- Subjectivity'] = sentiment_scores.apply(lambda score: score.subjectivity)
data

Unnamed: 0,Text,sentiment- Polarity,sentiment- Subjectivity
0,alien,-0.250000,0.750000
1,ray tracing cyberpunk hdr nextlevel tried,0.000000,0.000000
2,interview,0.000000,0.000000
3,doge underestimated,0.000000,0.000000
4,congratulation china amazing execution last ye...,0.200000,0.322222
...,...,...,...
1994,true sound surreal negative propaganda still e...,0.152381,0.492857
1995,make read ur term condition clicking accept,0.000000,0.000000
1996,samwise gamgee,0.000000,0.000000
1997,dumb dumber,-0.375000,0.500000
