In [177]:
!pip install -U textblob



In [178]:
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.


In [179]:
!pip install vaderSentiment



In [180]:
import pandas as pd
import string
import re
import nltk
#importing the Stemming function from nltk library
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [181]:
data = pd.read_csv("Headlines_5000.csv", usecols = ['headline'])
data.head()

Unnamed: 0,headline
0,Russia-Ukraine war LIVE updates: NATO leaders ...
1,CNBC-TV18 Classroom: What should be your optio...
2,Ukraine-Russia conflict: From sunflower oil to...
3,IOC to be dropped from Nifty 50 from March 31
4,CBI says NSE Himalayan yogi none other than An...


In [182]:
data.head()

Unnamed: 0,headline
0,Russia-Ukraine war LIVE updates: NATO leaders ...
1,CNBC-TV18 Classroom: What should be your optio...
2,Ukraine-Russia conflict: From sunflower oil to...
3,IOC to be dropped from Nifty 50 from March 31
4,CBI says NSE Himalayan yogi none other than An...


In [183]:
#library that contains punctuation
string.punctuation


#defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
#storing the puntuation free text
data['clean_msg']= data['headline'].apply(lambda x:remove_punctuation(x))
data.head()

Unnamed: 0,headline,clean_msg
0,Russia-Ukraine war LIVE updates: NATO leaders ...,RussiaUkraine war LIVE updates NATO leaders me...
1,CNBC-TV18 Classroom: What should be your optio...,CNBCTV18 Classroom What should be your options...
2,Ukraine-Russia conflict: From sunflower oil to...,UkraineRussia conflict From sunflower oil to b...
3,IOC to be dropped from Nifty 50 from March 31,IOC to be dropped from Nifty 50 from March 31
4,CBI says NSE Himalayan yogi none other than An...,CBI says NSE Himalayan yogi none other than An...


In [184]:
data['msg_lower']= data['clean_msg'].apply(lambda x: x.lower())

In [185]:
#defining function for tokenization

def tokenization(text):
    tokens = re.split('W+',text)
    return tokens
#applying function to the column
data['msg_tokenied']= data['msg_lower'].apply(lambda x: tokenization(x))

In [186]:
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output
#applying the function
data['no_stopwords']= data['msg_tokenied'].apply(lambda x:remove_stopwords(x))

In [187]:
#defining the object for stemming
porter_stemmer = PorterStemmer()
#defining a function for stemming
def stemming(text):
    stem_text = [porter_stemmer.stem(word) for word in text]
    return stem_text
data['msg_stemmed']=data['no_stopwords'].apply(lambda x: stemming(x))

In [188]:
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text
data['msg_lemmatized']=data['no_stopwords'].apply(lambda x:lemmatizer(x))

In [189]:
data['msg_lemmatized'][1][0]

'cnbctv18 classroom what should be your options trading exit strategy heres an explainer'

In [190]:
sid_obj= SentimentIntensityAnalyzer()

In [191]:
data['sentiment'] = ""

In [192]:
data = data.drop(['clean_msg',	'msg_lower',	'msg_tokenied',	'no_stopwords'], axis = 1)

In [193]:
data.head()

Unnamed: 0,headline,msg_stemmed,msg_lemmatized,sentiment
0,Russia-Ukraine war LIVE updates: NATO leaders ...,[russiaukraine war live updates nato leaders m...,[russiaukraine war live updates nato leaders m...,
1,CNBC-TV18 Classroom: What should be your optio...,[cnbctv18 classroom what should be your option...,[cnbctv18 classroom what should be your option...,
2,Ukraine-Russia conflict: From sunflower oil to...,[ukrainerussia conflict from sunflower oil to ...,[ukrainerussia conflict from sunflower oil to ...,
3,IOC to be dropped from Nifty 50 from March 31,[ioc to be dropped from nifty 50 from march 31],[ioc to be dropped from nifty 50 from march 31],
4,CBI says NSE Himalayan yogi none other than An...,[cbi says nse himalayan yogi none other than a...,[cbi says nse himalayan yogi none other than a...,


In [194]:
x = list(data.shape)
b = []
for i in range(x[0]):
  # print(data['headline'][i])
  d = sid_obj.polarity_scores(data['msg_lemmatized'][i][0])
  new_value = max(d, key=d.get)
  data['sentiment'] = new_value
  b.append(d.get('compound'))

b = pd.DataFrame(b)
b.columns = ['score']
print(b.head())
result = pd.concat([data, b], axis=1, join='inner')
result.head(5)

    score
0 -0.3612
1  0.0000
2 -0.3182
3  0.4019
4  0.0000


Unnamed: 0,headline,msg_stemmed,msg_lemmatized,sentiment,score
0,Russia-Ukraine war LIVE updates: NATO leaders ...,[russiaukraine war live updates nato leaders m...,[russiaukraine war live updates nato leaders m...,neu,-0.3612
1,CNBC-TV18 Classroom: What should be your optio...,[cnbctv18 classroom what should be your option...,[cnbctv18 classroom what should be your option...,neu,0.0
2,Ukraine-Russia conflict: From sunflower oil to...,[ukrainerussia conflict from sunflower oil to ...,[ukrainerussia conflict from sunflower oil to ...,neu,-0.3182
3,IOC to be dropped from Nifty 50 from March 31,[ioc to be dropped from nifty 50 from march 31],[ioc to be dropped from nifty 50 from march 31],neu,0.4019
4,CBI says NSE Himalayan yogi none other than An...,[cbi says nse himalayan yogi none other than a...,[cbi says nse himalayan yogi none other than a...,neu,0.0
