## Unsupervised Learning and predicting Sentiment Analyses for News Headlines

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

Scrapping the Text from inshort website

In [4]:
seed_urls = ['https://inshorts.com/en/read/technology',
             'https://inshorts.com/en/read/sports',
             'https://inshorts.com/en/read/world']

def build_dataset(seed_urls):
    news_data = []
    for url in seed_urls:
        news_category = url.split('/')[-1]
        data = requests.get(url)
        soup = BeautifulSoup(data.content, 'html.parser')
        
        news_articles = [{'news_headline': headline.find('span', 
                                                         attrs={"itemprop": "headline"}).string,
                          'news_article': article.find('div', 
                                                       attrs={"itemprop": "articleBody"}).string,
                          'news_category': news_category}
                         
                            for headline, article in 
                             zip(soup.find_all('div', 
                                               class_=["news-card-title news-right-box"]),
                                 soup.find_all('div', 
                                               class_=["news-card-content news-right-box"]))
                        ]
        news_data.extend(news_articles)
        
    df =  pd.DataFrame(news_data)
    df = df[['news_headline', 'news_article', 'news_category']]
    return df

In [7]:
news_df = build_dataset(seed_urls)
news_df.head(10)

Unnamed: 0,news_headline,news_article,news_category
0,"Uber to fire 3,700 employees worldwide, CEO no...",Ride-hailing company Uber on Wednesday said it...,technology
1,"Airbnb fires 1,900 employees making 25% of its...","Airbnb, the US-based startup that connects tra...",technology
2,Grimes explains the name of her and Elon Musk'...,A day after Tesla CEO Elon Musk said his baby ...,technology
3,Aarogya Setu app won't be needed after COVID-1...,MyGov's CEO Abhishek Singh in an interview wit...,technology
4,Wishing good vibes for all in second half 2020...,"Tesla's billionaire CEO Elon Musk, who recentl...",technology
5,French hacker says Aarogya Setu putting 'priva...,French hacker Robert Baptiste on Tuesday alert...,technology
6,India's Sudhir Krishnaswamy in FB's oversight ...,Facebook has introduced the first 20 members o...,technology
7,It's my fault: Samsung heir apologises over co...,Samsung Group heir Jay Y Lee has apologised fo...,technology
8,Billionaire Elon Musk qualifies for $706 milli...,Tesla CEO Elon Musk has qualified for stock op...,technology
9,"Apple, Google show sample of their COVID-19 ex...",Apple and Google have published sample user in...,technology


In [4]:
headline=news_df['news_headline']

In [5]:
headline

0     Airbnb fires 1,900 employees making 25% of its...
1     Grimes explains the name of her and Elon Musk'...
2     Indians evacuated from abroad will have to dow...
3     Amazon VP who quit over staff firings says Goo...
4     Wishing good vibes for all in second half 2020...
                            ...                        
70    Skies in Niger's capital turn red during sand ...
71    Hong Kong will never be calm unless violent pr...
72    Trump denies US role in 'mercenary incursion' ...
73    Trump made 'stupid mistake' by exiting from nu...
74    3 rockets hit near Baghdad International Airpo...
Name: news_headline, Length: 75, dtype: object

Importing the libraries

In [5]:
import re
from bs4 import BeautifulSoup
import unicodedata
import spacy
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import LancasterStemmer
ps = nltk.porter.PorterStemmer()
ls =nltk.stem.LancasterStemmer()
import requests 

import re
contractions_dict = {
    'didn\'t': 'did not',
    'don\'t': 'do not',
    "aren't": "are not",
    "can't": "cannot",
    "cant": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "didnt": "did not",
    "doesn't": "does not",
    "doesnt": "does not",
    "don't": "do not",
    "dont" : "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i had",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'm": "i am",
    "im": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had",
    "she'd've": "she would have",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they had",
    "they'd've": "they would have",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who's": "who is",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
    }

contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))

def expand_contractions(s, contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, s)



def strip_html_tags():
    soup = BeautifulSoup(content,"html.parser")
    [s.extract for s in soup(['iframe','script'])]
    stripped_text = soup.get_text() 
    stripped_text=re.sub('[\r|\n|\r\n]+','\n',stripped_text)
    return stripped_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD',text).encode('ascii','ignore').decode('utf-8','ignore')
    return text

def remove_special_characters(text, remove_digits = False):
    patterns = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern,"",text)
    return text

def simple_stemmers(text,stemmer = ps):
    text = " ".join([stemmer.stem(word)for word in text.split()])
    return text

def expand_contraction(text):
    return contraction.fix(text)

def spacy_lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ !='-PRON-' else word.text for word in text])

def remove_stopwords(text, is_lower_case = False, stopwords = None):
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    tokens = nltk.word_tokenize(text)
    tokens=[token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [tokens for token in tokens if token not in stopwords]
    else:
         filtered_tokens = [tokens for token in tokens if token.lower() not in stopwords]
    filtered_tokens = ' '.join(filtered_tokens)
    return filtered_tokens    

In [6]:
import tqdm

def text_pre_processor(text,html_strip=True, accented_char=True,contraction_expansion=True,text_lower_case=True,
                       text_stemming=False, text_lemmatization=True,special_char_removal=True, remove_digits=True,
                       stopword_removal=True, stopword_list=None): 
    #strip HTML
    if html_strip:
        text=strip_html_tags(text)
    
    #remove extra newlines(often might be present in really noisy text)
    text = text.translate(text.maketrans("\n\t\r"," "))

    #remove accented character
    if accented_char_removal:
        text = removal_accented_chars(text)
    
   #expand contraction
    if contraction_expansion:
         text = spacy_lemmatize_text(text)
    
   #Lemmatize text
    if text_lemmatization:
         text = spacy_lemmatize_text(text)
    
   #remove special characters and \or digits
    if special_char_removal:
   #insert space between special characters to isolate them
        special_char_pattern = re.compile(r'([{.(-)!}])')
        text = special_char_pattern.sub("\\1 ", text)
        text = remove_special_characters(text, remove_digits = remove_digits)
    
   #stem text
    if text_stemming and not text_lemmatization:
         text = simple_stemming(text)
    
   #lowercase the text
    if text_lower_case:
         text = text.lower()

   #remove stopwords
    if stopword_removal:
         text = remove_stopwords(text,is_lower_case = text_lower_case,stopwords=stopword_list)
    
   #remove extra whitespace
    text = re.sub(' +', ' ',text)
    text = text.strip()

    return text



In [7]:
from nltk.corpus import stopwords
import textblob
stop = stopwords.words('english')
from nltk.corpus import opinion_lexicon
pos_list=set(opinion_lexicon.positive())
neg_list=set(opinion_lexicon.negative())
from nltk.tokenize import treebank
tokenizer = treebank.TreebankWordTokenizer()
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import sentiwordnet as swn
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from afinn import Afinn

In [8]:
def corpus_pre_processor(corpus):
    norm_corpus = []
    for doc in tqdm.tqdm(corpus):
        norm_corpus.append(text_pre_processor(doc))
    return norm_corpus

def normalize_document(doc):
    #Lowercase, remove special char \whitespace
    #remove stopwords
    #expand contraction
    words= word_tokenize(doc)
    doc = " ".join([word.lower() for word in words if word not in stop])
    doc = re.sub(r'[^a-zA-Z0-9\s]','',doc,re.I|re.A)
    doc =doc.strip()
    doc = expand_contractions(doc)
    return doc

In [9]:
news_df.reset_index(inplace = True)

In [10]:
#split the review into sentence
news=news_df[['news_headline','index']]
news.rename(columns ={'index':'INDEX'},inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [22]:
news.head()

Unnamed: 0,news_headline,INDEX
0,Xiaomi Mi 10 5G with 108MP camera to launch in...,0
1,"Airbnb fires 1,900 employees making 25% of its...",1
2,"Musk shares 1st pic of son with tattoo filter,...",2
3,₹1k fine or up to 6-months jail in Noida for n...,3
4,"Google, Apple ban location tracking in their j...",4


In [11]:
from nltk.tokenize import sent_tokenize
news['split'] = news['news_headline'].apply(sent_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [13]:
news.head()

Unnamed: 0,news_headline,INDEX,split
0,"I'm selling almost all physical possessions, w...",0,"[I'm selling almost all physical possessions, ..."
1,Man tries to track 2 packets of snacks bought ...,1,[Man tries to track 2 packets of snacks bought...
2,Security lapse at Jio exposed COVID-19 symptom...,2,[Security lapse at Jio exposed COVID-19 sympto...
3,YouTube deletes conspiracy theorist David Icke...,3,[YouTube deletes conspiracy theorist David Ick...
4,Aarogya Setu adds 'Mitr' portal for free COVID...,4,[Aarogya Setu adds 'Mitr' portal for free COVI...


In [12]:
news_split=news.set_index('INDEX').split.apply(pd.Series).stack().reset_index(level=0).rename(columns={0:'news_headline'})
news_split.reset_index(level=0,inplace=True)
news_split.rename(columns={'INDEX':'headlines_no','index':'sentence'},inplace=True)

In [15]:
news_split.head(15)

Unnamed: 0,sentence,headlines_no,news_headline
0,0,0,"I'm selling almost all physical possessions, w..."
1,0,1,Man tries to track 2 packets of snacks bought ...
2,0,2,Security lapse at Jio exposed COVID-19 symptom...
3,0,3,YouTube deletes conspiracy theorist David Icke...
4,0,4,Aarogya Setu adds 'Mitr' portal for free COVID...
5,0,5,We may not make money this year but won't fire...
6,0,6,"Google, Apple ban location tracking in their j..."
7,0,7,"My girlfriend Grimes is mad at me, baby due on..."
8,0,8,"As always, I am optimistic about Tesla long-te..."
9,0,9,"US-based Silver Lake to invest ₹5,655.75 crore..."


In [13]:
#then normalizing the data
news_split['news_headline']=news_split['news_headline'].apply(normalize_document)

In [14]:
news_split.head()

Unnamed: 0,sentence,headlines_no,news_headline
0,0,0,xiaomi mi 10 5g 108mp camera launch india may 8
1,0,1,musk shares 1st pic son tattoo filter says ne...
2,0,2,apple reopen stores australia austria week
3,0,3,indians evacuated abroad download aarogya setu...
4,0,4,google apple ban location tracking joint covi...


In [14]:
def score(news_headline):
    from textblob import TextBlob
    return TextBlob(news_headline).sentiment.polarity
def predict(news_headline):
    news_split['score']=news_split['news_headline'].apply(score)
    return(news_split)

In [15]:
news_split.head()

Unnamed: 0,sentence,headlines_no,news_headline
0,0,0,airbnb fires 1900 employees making 25 global ...
1,0,1,gri ames explains name elon musk s first child...
2,0,2,indians evacuated abroad download aarogya setu...
3,0,3,amazon vp quit staff firings says google huaw...
4,0,4,wishing good vibes second half 2020 musk amid...


In [15]:
senti=predict(news_split)

In [23]:
senti

Unnamed: 0,sentence,headlines_no,news_headline,score
0,0,0,i m selling almost physical possessions wo nt...,0.000
1,0,1,man tries track 2 packets snacks bought online...,-0.200
2,0,2,security lapse jio exposed covid19 symptom che...,0.000
3,0,3,youtube deletes conspiracy theorist david icke...,0.000
4,0,4,aarogya setu adds mitr portal free covid19 co...,0.400
...,...,...,...,...
67,0,67,bangladesh reports biggest rise daily covid19 ...,0.000
68,0,68,vietnam reports first covid19 case 9 days tot...,0.125
69,0,69,bangladesh extends lockdown till may 16 covid...,0.000
70,0,70,8 mercenaries killed foiled invasion venezuel...,-0.200


## sentiment analysis with Text Blob

In [16]:
import textblob
textblob.TextBlob('I hate this file its not good').sentiment

Sentiment(polarity=-0.575, subjectivity=0.75)

In [17]:
senti['Sentiment']=['positive' if score >=0 else 'negative' for score in senti['score']]


In [18]:
news_headline=np.array(senti['news_headline'])
sentiment =np.array(senti['Sentiment'])
sample=[13,44,69]


In [31]:
news_headline

array(['i m selling almost physical possessions  wo nt house  billionaire musk',
       'man tries track 2 packets snacks bought online 400  loses 2l',
       'security lapse jio exposed covid19 symptom checker results online',
       'youtube deletes conspiracy theorist david icke s account covid19 clai ams',
       'aarogya setu adds mitr  portal free covid19 consultations  booking home lab tests',
       'blood covid19 survivors  sold vaccine darknet  researchers',
       'maskwearing robots greet covid19 patients japan hotels  wish recovery',
       'we may make money year wo nt fire anyone  us software firm s ceo',
       'report says xiaomi tracks users  private phone  web activity  company denies',
       'australian govt website reportedly leaks data 774000 migrants  criticised',
       'my girlfriend gri ames mad  baby due monday  billionaire elon musk',
       'usbased silver lake invest 565575 crore jio platforms 115  stake',
       'as always  i opti amistic tesla longterm 

In [137]:
sentiment

array(['positive', 'positive', 'negative', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'negative', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'negative', 'positive', 'positive', 'negative',
       'positive', 'positive', 'positive', 'positive', 'negative',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'negative', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'negative', 'negative', 'positive',
       'positive', 'positive', 'positive', 'negative', 'positive',
       'positive', 'positive', 'negative', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'negative',
       'positive', 'positive', 'positive', 'positive', 'positi

In [19]:
for news_headline,sentiment in zip (news_headline[sample],sentiment[sample]):
    print('NEWS_HEADLINE',news_headline)
    print('SENTIMENT',sentiment)
    print('Predicted Sentiment polarity',textblob.TextBlob(news_headline).sentiment.polarity)
    print('-'*60)

NEWS_HEADLINE france accuses apple refusing help covid19 tracking app
SENTIMENT positive
Predicted Sentiment polarity 0.0
------------------------------------------------------------
NEWS_HEADLINE extremely motivated play three formats  faf du plessis
SENTIMENT negative
Predicted Sentiment polarity -0.125
------------------------------------------------------------
NEWS_HEADLINE new zealand safe advantage  welcome investments  pm
SENTIMENT positive
Predicted Sentiment polarity 0.4787878787878788
------------------------------------------------------------


In [20]:
sentiment_polarity=[textblob.TextBlob(news_headline).sentiment.polarity for news_headline in news_headline]
predicted_sentiments=['positive'if score >= 0.1 else 'negative' for score in sentiment_polarity]

## sentiment analysis with AFINN

In [21]:
afn = Afinn(emoticons=True)

In [22]:
afn.score('I love it')

3.0

In [23]:
afnn=pd.DataFrame(news_df)

In [32]:
afnn

Unnamed: 0,index,news_headline,news_article,news_category
0,0,"Airbnb fires 1,900 employees making 25% of its...","Airbnb, the US-based startup that connects tra...",technology
1,1,Grimes explains the name of her and Elon Musk'...,A day after Tesla CEO Elon Musk said his baby ...,technology
2,2,Indians evacuated from abroad will have to dow...,"Indians stranded abroad, who will be brought b...",technology
3,3,Amazon VP who quit over staff firings says Goo...,The Amazon VP who quit over the firm's decisio...,technology
4,4,Wishing good vibes for all in second half 2020...,"Tesla's billionaire CEO Elon Musk, who recentl...",technology
...,...,...,...,...
70,70,Skies in Niger's capital turn red during sand ...,"Skies in Niamey, the capital city of Niger, tu...",world
71,71,Hong Kong will never be calm unless violent pr...,China's Hong Kong affairs office on Wednesday ...,world
72,72,Trump denies US role in 'mercenary incursion' ...,US President Donald Trump has said that his go...,world
73,73,Trump made 'stupid mistake' by exiting from nu...,"US President Donald Trump ""made a stupid mista...",world


In [24]:
afnn.loc[:,"news_headline"]

0     Uber to fire 3,700 employees worldwide, CEO no...
1     Airbnb fires 1,900 employees making 25% of its...
2     Grimes explains the name of her and Elon Musk'...
3     Indians evacuated from abroad will have to dow...
4     Amazon VP who quit over staff firings says Goo...
                            ...                        
70     Iran's state broadcaster influenced 2014 Scot...
71    3 rockets hit near Baghdad International Airpo...
72    Trump made 'stupid mistake' by exiting from nu...
73    Hong Kong will never be calm unless violent pr...
74    Skies in Niger's capital turn red during sand ...
Name: news_headline, Length: 75, dtype: object

In [25]:
def score(news_headline):
    from afinn import Afinn
    return afn.score(news_headline)
def predict(news_headline):
    afnn['score']=afnn['news_headline'].apply(score)
    return(afnn)

In [26]:
afnn_senti=predict(afnn)

In [27]:
afnn_senti['sentiment']=['positive' if score >=0 else 'negative' for score in afnn_senti['score']]

In [28]:
news_headline=np.array(afnn_senti['news_headline'])
sentiment =np.array(afnn_senti['sentiment'])
sample=[]

In [29]:
for news_headline,sentiment in zip (news_headline[sample],sentiment[sample]):
    print("NEWS_HEADLINE",news_headline)
    print("SENTIMENT",sentiment)
    print("Predicted Sentiment polarity",afn.score(news_headline).sentiment.polarity)
    print('-'*60)
   

In [30]:
sentiment_polarity=[textblob.TextBlob(news_headline).sentiment.polarity for news_headline in news_headline]
predicted_sentiments=['positive'if score >= 0.2 else 'negative' for score in sentiment_polarity]

## Sentiment Analyses using Vader

In [31]:
vader=pd.DataFrame(news_split)


In [32]:
vader.loc[:,"news_headline"]

0     uber fire 3700 employees worldwide  ceo take b...
1     airbnb fires 1900 employees making 25  global ...
2     gri ames explains name elon musk s first child...
3     indians evacuated abroad download aarogya setu...
4     amazon vp quit staff firings says google  huaw...
                            ...                        
71    iran s state broadcaster influenced 2014 scott...
72    3 rockets hit near baghdad international airpo...
73    trump made stupid mistake  exiting nuclear dea...
74    hong kong never calm unless violent protesters...
75    skies niger s capital turn red sand storm  vid...
Name: news_headline, Length: 76, dtype: object

In [33]:
def score(news_headline):
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    vader = SentimentIntensityAnalyzer()
    return vader.polarity_scores(news_headline)['compound']
def predict(news_headline):
    vader['score']=vader['news_headline'].apply(score)
    return(vader)

In [34]:
vader_senti=predict(vader)

In [35]:
vader_senti['sentiment']=['positive' if score >=0 else 'negative' for score in vader_senti['score']]

In [36]:
news_headline=np.array(vader_senti['news_headline'])
sentiment =np.array(vader_senti['score'])
sample=[]

In [37]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [38]:
def analyze_sentiment_vader_lexicon(news_headline,
                                    threshold=0.1,
                                    verbose=False):
    #analyse the sentiment for review
    analyzer=SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(news_headline)
    #get aggregrate scores and final sentiment
    agg_score = scores['compound']
    final_sentiment = 'positive' if agg_score >= threshold\
                                    else 'negative'
    if verbose:
        #display detailed sentiment statistics
        positive = str(round(scores['pos'], 2)*100)+'%'
        final = round(agg_score, 2)
        negative = str(round(scores['neg'], 2)*100)+'%'
        neutral = str(round(scores['neu'], 2)*100)+'%'
        sentiment_frame = pd.DataFrame([[final_sentiment,final,positive,
                                        negative,neutral]],
                                        columns=pd.MultiIndex(levels=[['SENTIMENT STATS:'],
                                                                      ['Predicted Sentiment','Polarity Score',
                                                                        'Positive', 'Negative' 'Neutral']],
                                                              codes=[[0,0,0,0,0],[0,1,2,3,4]]))
        print(sentiment_frame)
        
    return final_statement

In [39]:
for news_headline,sentiment in zip (news_headline[sample],sentiment[sample]):
    print('NEWS_HEADLINE',news_headline)
    print('SENTIMENT',sentiment)
    pred = analyze_sentiment_vader_lexicon (news_headline,threshold=0.4, verbose =True)
    print('-'*60)

In [40]:
sentiment_polarity=[textblob.TextBlob(news_headline).sentiment.polarity for news_headline in news_headline]
predicted_sentiments=['positive'if score >= 0.2 else 'negative' for score in sentiment_polarity]

# Supervised learning

Building a sentimental analysis

In [3]:
news_df = build_dataset(seed_urls)
news_df.info()

NameError: name 'requests' is not defined

In [73]:
news_df.head()

Unnamed: 0,news_headline,news_article,news_category
0,"Uber to fire 3,700 employees worldwide, CEO no...",Ride-hailing company Uber on Wednesday said it...,technology
1,"Airbnb fires 1,900 employees making 25% of its...","Airbnb, the US-based startup that connects tra...",technology
2,Grimes explains the name of her and Elon Musk'...,A day after Tesla CEO Elon Musk said his baby ...,technology
3,Amazon VP who quit over staff firings says Goo...,The Amazon VP who quit over the firm's decisio...,technology
4,Indians evacuated from abroad will have to dow...,"Indians stranded abroad, who will be brought b...",technology


In [8]:

from bs4 import BeautifulSoup
import numpy as np
import re
import tqdm
import unicodedata


def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    [s.extract() for s in soup(['iframe', 'script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
    return stripped_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
import re
contractions_dict = {
    'didn\'t': 'did not',
    'don\'t': 'do not',
    "aren't": "are not",
    "can't": "cannot",
    "cant": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "didnt": "did not",
    "doesn't": "does not",
    "doesnt": "does not",
    "don't": "do not",
    "dont" : "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i had",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'm": "i am",
    "im": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had",
    "she'd've": "she would have",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they had",
    "they'd've": "they would have",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who's": "who is",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
    }

contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))

def expand_contractions(s, contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, s)


def pre_process_corpus(docs):
    norm_docs = []
    for doc in tqdm.tqdm(docs):
        doc = strip_html_tags(doc)
        doc = doc.translate(doc.maketrans("\n\t\r", "   "))
        doc = doc.lower()
        doc = remove_accented_chars(doc)
        doc = expand_contractions(doc)
        # lower case and remove special characters\whitespaces
        doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
        doc = re.sub(' +', ' ', doc)
        doc = doc.strip()  
        norm_docs.append(doc)
  
    return norm_docs


import tqdm
def corpus_pre_processor(corpus):
    norm_corpus = []
    for doc in tqdm.tqdm(corpus):
        norm_corpus.append(normalize_document(doc))
    return norm_corpus
def normalize_corpus(doc):
    #Lowercase, remove special char \whitespace
    #remove stopwords
    #expand contraction
    words= word_tokenize(doc)
    doc = " ".join([word.lower() for word in words if word not in stop])
    doc = re.sub(r'[^a-zA-Z0-9\s]','',doc,re.I|re.A)
    doc =doc.strip()
    doc = expand_contractions(doc)
    return doc


In [7]:
#dividing ito train and test
headline= news_df['news_headline'].values
category = news_df['news_category'].values

In [8]:
train_headline=headline[:60]
train_category = category[:60]

test_headline = headline[60:]
test_category = category[60:]

In [9]:
%%time
norm_train_headlines = pre_process_corpus(train_headline)
norm_test_headlines = pre_process_corpus(test_headline)

100%|██████████| 60/60 [00:00<00:00, 3117.48it/s]
100%|██████████| 15/15 [00:00<00:00, 2233.55it/s]

Wall time: 78.3 ms





In [10]:
%%time
from sklearn.feature_extraction.text import CountVectorizer

#biuld BOW
cv = CountVectorizer(binary = False, min_df = 5, max_df = 1.0, ngram_range=(1,2))

cv_train_features = cv.fit_transform(norm_train_headlines)


Wall time: 3.02 s


In [11]:
cv_test_features = cv.transform(norm_test_headlines)

## Logistic Regression

In [12]:
%%time

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l2',max_iter=500,C=1,solver='lbfgs')
lr.fit (cv_train_features,train_category)

Wall time: 1.16 s




LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
lr_predictions = lr.predict(cv_test_features)

In [15]:
from sklearn.metrics import confusion_matrix,classification_report
labels = ['sports','technology','world']
print(classification_report(test_category,lr_predictions))

              precision    recall  f1-score   support

      sports       0.00      0.00      0.00       0.0
  technology       0.00      0.00      0.00       0.0
       world       0.00      0.00      0.00      15.0

    accuracy                           0.00      15.0
   macro avg       0.00      0.00      0.00      15.0
weighted avg       0.00      0.00      0.00      15.0



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [None]:
labels = ['sports','technology','world']
pd.DataFrame(confusion_matrix(test_category,lr_predictions),index = labels,columns = labels)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(test_category,lr_predictions)

For TFID

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
tv = TfidfVectorizer(use_idf =True, min_df=5,max_df=1.0,ngram_range=(1,2))

In [19]:
%%time
norm_train_reviews = pre_process_corpus(train_headline)
norm_test_reviews = pre_process_corpus(test_headline)

100%|██████████| 60/60 [00:00<00:00, 3198.83it/s]
100%|██████████| 15/15 [00:00<00:00, 3364.24it/s]

Wall time: 33.1 ms





In [20]:
tv_train_features = tv.fit_transform(norm_train_headlines)
tv_test_features = tv.transform(norm_test_headlines)

In [21]:
lr.fit(tv_train_features,train_category)



LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
lr_predictions = lr.predict(tv_test_features)

In [34]:

from sklearn.metrics import confusion_matrix, classification_report

labels = ['sports','technology','world']
print(classification_report(test_category,lr_predictions))

              precision    recall  f1-score   support

      sports       0.00      0.00      0.00       0.0
  technology       0.00      0.00      0.00       0.0
       world       0.00      0.00      0.00      15.0

    accuracy                           0.00      15.0
   macro avg       0.00      0.00      0.00      15.0
weighted avg       0.00      0.00      0.00      15.0



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [35]:
labels = ['sports','technology','world']
pd.DataFrame(confusion_matrix(test_category,lr_predictions),index = labels,columns = labels)

Unnamed: 0,sports,technology,world
sports,0,0,0
technology,0,0,0
world,6,9,0


In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(test_category,lr_predictions)

## Naive bayes

In [9]:
#dividing ito train and test
headline_N= news_df['news_headline'].values
category_N = news_df['news_category'].values

In [10]:
train_headline_N=headline_N[:60]
train_category_N = category_N[:60]

test_headline_N = headline_N[60:]
test_category_N = category_N[60:]

In [11]:
%%time
norm_train_headlines_N = pre_process_corpus(train_headline_N)
norm_test_headlines_N = pre_process_corpus(test_headline_N)

100%|██████████| 60/60 [00:00<00:00, 626.68it/s]
100%|██████████| 15/15 [00:00<00:00, 1880.12it/s]

Wall time: 130 ms





In [24]:
%%time
from sklearn.feature_extraction.text import CountVectorizer

#biuld BOW
cv = CountVectorizer(binary = False, min_df = 5, max_df = 1.0, ngram_range=(1,2))

cv_train_features_N = cv.fit_transform(norm_train_headlines_N)

Wall time: 6.98 ms


In [25]:
cv_test_features_N = cv.transform(norm_test_headlines_N)

In [37]:
from sklearn.naive_bayes import GaussianNB
lr_N=GaussianNB()

In [38]:
lr_N.fit(cv_train_features_N,train_category_N)

GaussianNB(priors=None, var_smoothing=1e-09)

In [40]:
lr_predictions_N = lr.predict(cv_test_features_N)

In [42]:
from sklearn.metrics import accuracy_score
accuracy_score(test_category_N,lr_predictions_N)

0.9333333333333333

In [43]:
from sklearn.metrics import confusion_matrix, classification_report

labels = ['sports','technology','world']
print(classification_report(test_category_N,lr_predictions_N))

              precision    recall  f1-score   support

      sports       0.00      0.00      0.00         0
       world       1.00      0.93      0.97        15

    accuracy                           0.93        15
   macro avg       0.50      0.47      0.48        15
weighted avg       1.00      0.93      0.97        15



  'recall', 'true', average, warn_for)


In [45]:
labels = ['sports','world']
pd.DataFrame(confusion_matrix(test_category_N,lr_predictions_N),index = labels,columns = labels)

Unnamed: 0,sports,world
sports,0,0
world,1,14


For tfifd

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(use_idf =True, min_df=5,max_df=1.0,ngram_range=(1,2))

In [47]:
%%time
norm_train_reviews_N = pre_process_corpus(train_headline_N)
norm_test_reviews_N = pre_process_corpus(test_headline_N)

100%|██████████| 60/60 [00:00<00:00, 2148.59it/s]
100%|██████████| 15/15 [00:00<00:00, 1501.18it/s]

Wall time: 95.7 ms





In [52]:

tv_train_features_N = tv.fit_transform(norm_train_headlines_N)
tv_test_features_N = tv.transform(norm_test_headlines_N)

In [57]:

lr_N.fit(tv_train_features_N,train_category_N)


GaussianNB(priors=None, var_smoothing=1e-09)

In [62]:
lr_predictions_Nt = lr.predict(tv_test_features_N)

In [63]:
from sklearn.metrics import accuracy_score
accuracy_score(test_category_N,lr_predictions_Nt)

0.9333333333333333

In [64]:
from sklearn.metrics import confusion_matrix, classification_report

labels = ['sports','technology','world']
print(classification_report(test_category_N,lr_predictions_Nt))

              precision    recall  f1-score   support

      sports       0.00      0.00      0.00         0
       world       1.00      0.93      0.97        15

    accuracy                           0.93        15
   macro avg       0.50      0.47      0.48        15
weighted avg       1.00      0.93      0.97        15



  'recall', 'true', average, warn_for)


In [65]:
labels = ['sports','world']
pd.DataFrame(confusion_matrix(test_category_N,lr_predictions_Nt),index = labels,columns = labels)

Unnamed: 0,sports,world
sports,0,0
world,1,14


## Decision Tree

In [66]:
#dividing ito train and test
headline_D= news_df['news_headline'].values
category_D = news_df['news_category'].values

In [67]:
train_headline_D=headline_D[:60]
train_category_D = category_D[:60]

test_headline_D = headline_D[60:]
test_category_D = category_D[60:]

In [68]:
%%time
norm_train_headlines_D = pre_process_corpus(train_headline_D)
norm_test_headlines_D = pre_process_corpus(test_headline_D)

100%|██████████| 60/60 [00:00<00:00, 5468.57it/s]
100%|██████████| 15/15 [00:00<00:00, 4998.38it/s]

Wall time: 20 ms





In [69]:
%%time
from sklearn.feature_extraction.text import CountVectorizer

#biuld BOW
cv = CountVectorizer(binary = False, min_df = 5, max_df = 1.0, ngram_range=(1,2))

cv_train_features_D = cv.fit_transform(norm_train_headlines_D)

Wall time: 4.01 ms


In [70]:
cv_test_features_D = cv.transform(norm_test_headlines_D)

In [71]:
from sklearn.tree import DecisionTreeClassifier
lr_D=DecisionTreeClassifier(criterion='entropy')

In [72]:
lr_D.fit(cv_train_features_D,train_headline_D)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [74]:
cv_test_features_D=cv_test_features_D.toarray()
lr_predictions_D = lr.predict(cv_test_features_D)

In [75]:
from sklearn.metrics import accuracy_score
accuracy_score(test_category_D,lr_predictions_D)

0.9333333333333333

In [77]:
from sklearn.metrics import confusion_matrix, classification_report

labels = ['sports','technology','world']
print(classification_report(test_category_D,lr_predictions_D))

              precision    recall  f1-score   support

      sports       0.00      0.00      0.00         0
       world       1.00      0.93      0.97        15

    accuracy                           0.93        15
   macro avg       0.50      0.47      0.48        15
weighted avg       1.00      0.93      0.97        15



  'recall', 'true', average, warn_for)


In [79]:
labels = ['sports','world']
pd.DataFrame(confusion_matrix(test_category_D,lr_predictions_D),index = labels,columns = labels)

Unnamed: 0,sports,world
sports,0,0
world,1,14


For tfifd

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(use_idf =True, min_df=5,max_df=1.0,ngram_range=(1,2))

In [81]:
%%time
norm_train_reviews_DT = pre_process_corpus(train_headline_D)
norm_test_reviews_DT = pre_process_corpus(test_headline_D)

100%|██████████| 60/60 [00:00<00:00, 5468.10it/s]
100%|██████████| 15/15 [00:00<00:00, 5012.31it/s]

Wall time: 20.9 ms





In [82]:

tv_train_features_DT = tv.fit_transform(norm_train_headlines_D)
tv_test_features_DT = tv.transform(norm_test_headlines_D)

In [88]:
tv_train_features_DT=tv_train_features_DT.toarray()
lr_N.fit(tv_train_features_DT,train_category_D)

GaussianNB(priors=None, var_smoothing=1e-09)

In [90]:
tv_test_features_DT=tv_test_features_DT.toarray()
lr_predictions_Dt = lr.predict(tv_test_features_DT)

In [91]:
from sklearn.metrics import accuracy_score
accuracy_score(test_category_D,lr_predictions_Dt)

0.9333333333333333

In [92]:
from sklearn.metrics import confusion_matrix, classification_report

labels = ['sports','technology','world']
print(classification_report(test_category_D,lr_predictions_Dt))

              precision    recall  f1-score   support

      sports       0.00      0.00      0.00         0
       world       1.00      0.93      0.97        15

    accuracy                           0.93        15
   macro avg       0.50      0.47      0.48        15
weighted avg       1.00      0.93      0.97        15



  'recall', 'true', average, warn_for)


In [93]:
labels = ['sports','world']
pd.DataFrame(confusion_matrix(test_category_D,lr_predictions_Dt),index = labels,columns = labels)

Unnamed: 0,sports,world
sports,0,0
world,1,14
