<a href="https://colab.research.google.com/github/Shivesh21122000/SentimentAnalysis/blob/main/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install vaderSentiment

Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/76/fc/310e16254683c1ed35eeb97386986d6c00bc29df17ce280aed64d55537e9/vaderSentiment-3.3.2-py2.py3-none-any.whl (125kB)
[K     |████████████████████████████████| 133kB 4.2MB/s 
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [3]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
vs = SentimentIntensityAnalyzer()

In [4]:
text="The book is really fantastic"
vs.polarity_scores(text)

{'compound': 0.5984, 'neg': 0.0, 'neu': 0.507, 'pos': 0.493}

#Now we are going to start web scrapping.... we will take data from "Inshorts" website. 

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [7]:
urls=['https://inshorts.com/en/read/sports',
      'https://inshorts.com/en/read/world',
      'https://inshorts.com/en/read/politics']
def build_dataset(urls):
  news_data = []
  for url in urls:
    news_category = url.split('/')[-1]
    data=requests.get(url)
    soup = BeautifulSoup(data.content)

    news_articles = [{'news_headline':headline.find('span',attrs={"itemprop":"headline"}).string,
                      'news_article':article.find('div',attrs={"itemprop":"articleBody"}).string,
                      'news_category':news_category}
                    
                     for headline,article in zip(soup.find_all('div',class_=["news-card-title news-right-box"]),
                                                soup.find_all('div',class_=["news-card-content news-right-box"]))
                     ]
    news_articles = news_articles[0:20]
    news_data.extend(news_articles)

  df=pd.DataFrame(news_data)
  df=df[['news_headline','news_article','news_category']]
  return df

In [8]:
df = build_dataset(urls)
df.head()

Unnamed: 0,news_headline,news_article,news_category
0,Concussion substitute Chahal takes 3 wickets a...,India defeated Australia by 11 runs in the fir...,sports
1,Australia question if Chahal was 'like-for-lik...,Australia have questioned whether Yuzvendra Ch...,sports
2,Langer seemingly objects to Jadeja's concussio...,Australia head coach Justin Langer seemingly o...,sports
3,Looks like your listening is partial: Kartik t...,A fan took to Twitter to accuse Murali Kartik ...,sports
4,Jadeja ruled out of remaining T20Is against Au...,Team India all-rounder Ravindra Jadeja has bee...,sports


In [9]:
 import nltk
 nltk.download('stopwords')
 stopword_list=nltk.corpus.stopwords.words('english')
 stopword_list.remove('not')
 stopword_list.remove('no')
 len(stopword_list)
 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


177

In [10]:
# Let remove html tags
def html_tag(text):
  soup = BeautifulSoup(text,'html.parser')
  new_text = soup.get_text()
  return new_text
html_tag('<html><h1>hello how are you?</h1></html>')  

'hello how are you?'

In [11]:
# next step is to expand contraction
!pip install contractions

Collecting contractions
  Downloading https://files.pythonhosted.org/packages/00/92/a05b76a692ac08d470ae5c23873cf1c9a041532f1ee065e74b374f218306/contractions-0.0.25-py2.py3-none-any.whl
Collecting textsearch
  Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl
Collecting Unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 4.0MB/s 
[?25hCollecting pyahocorasick
[?25l  Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)
[K     |████████████████████████████████| 317kB 7.3MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
  C

In [12]:
import contractions
def contraction(text):
  expand = contractions.fix(text)
  return expand
contraction("I'll do it, I'd think about this")

'I will do it, I would think about this'

In [13]:
# removing special characters
import re
def remove_sp(text):
  pattern = r'[^A-Za-z0-9\s]'
  text = re.sub(pattern,'',text)
  return text

remove_sp("+#hello")

'hello'

In [14]:
#removing stopwords
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()

In [15]:
def remove_stopwords(text):
  tokens = tokenizer.tokenize(text)
  tokens = [token.strip() for token in tokens]
  filtered_tokens = [token for token in tokens if token not in stopword_list]
  filtered_text = ' '.join(filtered_tokens)
  return filtered_text


In [16]:
remove_stopwords("we are only ones left alive")

'ones left alive'

In [17]:
#finally every preprocessing we have done.
#1)Lower case...2)HTML tag...3)Contractions...4)Special Characters..5)Stop words

df.news_headline = df.news_headline.apply(lambda x:x.lower())
df.news_article = df.news_article.apply(lambda x:x.lower())

df.news_headline = df.news_headline.apply(html_tag)
df.news_article = df.news_article.apply(html_tag)

df.news_headline = df.news_headline.apply(contraction)
df.news_article = df.news_article.apply(contraction)

df.news_headline = df.news_headline.apply(remove_sp)
df.news_article = df.news_article.apply(remove_sp)

df.news_headline = df.news_headline.apply(remove_stopwords)
df.news_article = df.news_article.apply(remove_stopwords)

df.head()

Unnamed: 0,news_headline,news_article,news_category
0,concussion substitute chahal takes 3 wickets i...,india defeated australia 11 runs first t20i ca...,sports
1,australia question chahal likeforlike concussi...,australia questioned whether yuzvendra chahal ...,sports
2,langer seemingly objects jadejas concussion su...,australia head coach justin langer seemingly o...,sports
3,looks like listening partial kartik fan critic...,fan took twitter accuse murali kartik lack res...,sports
4,jadeja ruled remaining t20is australia due con...,team india allrounder ravindra jadeja ruled re...,sports


In [23]:
df['compound'] = df['news_article'].apply(lambda x: vs.polarity_scores(x)['compound'])

In [24]:
df.head()

Unnamed: 0,news_headline,news_article,news_category,compound
0,concussion substitute chahal takes 3 wickets i...,india defeated australia 11 runs first t20i ca...,sports,-0.5707
1,australia question chahal likeforlike concussi...,australia questioned whether yuzvendra chahal ...,sports,-0.1027
2,langer seemingly objects jadejas concussion su...,australia head coach justin langer seemingly o...,sports,0.4215
3,looks like listening partial kartik fan critic...,fan took twitter accuse murali kartik lack res...,sports,0.7096
4,jadeja ruled remaining t20is australia due con...,team india allrounder ravindra jadeja ruled re...,sports,0.0


In [32]:
df['sentiment'] = df['compound'].apply(lambda x: 'Positive' if(x>0.05) else ('Neutral' if(x>-0.5 and x<0.05) else 'Negative'))

In [33]:
df.head()

Unnamed: 0,news_headline,news_article,news_category,compound,sentiment
0,concussion substitute chahal takes 3 wickets i...,india defeated australia 11 runs first t20i ca...,sports,-0.5707,Negative
1,australia question chahal likeforlike concussi...,australia questioned whether yuzvendra chahal ...,sports,-0.1027,Neutral
2,langer seemingly objects jadejas concussion su...,australia head coach justin langer seemingly o...,sports,0.4215,Positive
3,looks like listening partial kartik fan critic...,fan took twitter accuse murali kartik lack res...,sports,0.7096,Positive
4,jadeja ruled remaining t20is australia due con...,team india allrounder ravindra jadeja ruled re...,sports,0.0,Neutral


# In this project we have done -
# Web Scrapping
# Data Preproccessing
## Analysis and Prediction
