##**Bipolar Factory Internship Assessment.**
##Date of Submission: 27rd April 2020
#### Tamil Sudarvan M - IIT Madras

In [0]:
#Importing Required Libraries
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

# **Scrapping Viral and Latest Headlines From NDTV Website**

In [0]:
# Function to scrap headlines from the given URL
def webScrapper(pageNo,url,query=''):
  data = []
  for i in range (pageNo):
    if len(query) > 0: 
      urlFinal = url + str(i) + "&query=" + query
    else:
      urlFinal = url + str(i)
    r = requests.get(urlFinal)
    soup = BeautifulSoup(r.content, 'html5lib') 
    table = soup.findAll('p', attrs = {'class':'header fbld'})
    for row in table:
      data.append(row.get_text())
  return data

In [0]:
pageNo = 100
url = "https://www.ndtv.com/page/topic-load-more?%20type=news&page="
headLinesViral  = webScrapper(pageNo,url,'viral') #Viral Headlines
headLinesLatest = webScrapper(pageNo,url)         #Latest Headlines
headLinesTest   = webScrapper(25,url,'business')  #Test Headlines from Business

In [22]:
print(len(headLinesViral),len(headLinesLatest),len(headLinesTest))
Y =  np.asarray(len(headLinesViral)*[1] + len(headLinesLatest)*[0]) #

1015 1015 375


In [23]:
headLinesViral[:10]

['Viral: A Petition To #UninstallWhatsApp For Amitabh Bachchan, Anand Mahindra',
 'Viral Video Of Dolphins In Meerut Stuns Internet. Watch',
 'Watch: Stuntmen Choreograph Hilarious Fight Sequence While Social Distancing',
 "E-Rickshaw Driver's Social Distancing Innovation Wins Anand Mahindra's Praise",
 "Watch: RCB Director Mike Hesson Mesmerised By 6-Year-Old Indian Girl's Batting Skills",
 "Ankita Lokhande's Mushy Post With Boyfriend Vicky Jain Is Just Too Cute",
 "Viral: When Shah Rukh Khan Attended Kajol's Mehndi With Gauri And Aryan",
 'Sobhita Dhulipala, Accused Of Faking A "Self-Timed" Photoshoot, Writes About "Unkind Conclusions"',
 'Pet Owner Puts Dog On Car Roof, Drives Around City. Watch',
 'Viral Pic Shows Cops Sleeping On Ground, Twitter Thanks #CoronaWarriors']

In [24]:
headLinesLatest[:10]

['Ronaldinho Says Arrest And Confinement Has Been "Hard"',
 'Coronavirus - Mumbai Schools To Become Quarantine Centres As COVID-19 Cases Cross 5,500',
 'Lucknow Police Sends Notice To Kanika Kapoor For Questioning',
 'Advance Summer Vacation To May 1: Ghaziabad Administration To School Education Director',
 'New Symptoms Of COVID-19 Identified By Top US Medical Watchdog',
 'Indian Couple In UAE Die 5-Days Apart, Tested Negative For COVID-19: Report',
 'Indian-American Congresswoman Endorses Joe Biden For US President',
 'After Row Over Tax Hike Advice, Action Against 3 Senior Officers',
 'Tamil Nadu To Return 24,000 Rapid Testing Kits To China',
 'Sunil Lahri And Anuradha Patel In A Rare Pic From 1985 Film Phir Aayee Barsat']

In [25]:
headLinesTest[:10]

["Why Government's DA Decision Will Hurt, Not Help, Economy",
 'India Weighs Plan to Guarantee $39 Billion of Small-Business Loans: Report',
 'Lockdown Will Stay In Red Zones; Push "Do Gaz Doori", Says PM: 10 Points',
 'Sensex, Nifty Likely To Open Higher On Positive Global Cues',
 "E-Rickshaw Driver's Social Distancing Innovation Wins Anand Mahindra's Praise",
 'Daimler Says China Business Picks Up Again: Report',
 'Indian Car Makers Assure Employees About No Salary Cut Or Job Loss Post Lockdown',
 'SAARC Nations Roll Out Stimulus Packages To Tackle COVID-19 Economic Fallout',
 '"Knock Knock. Who\'s There?" Not Sara Ali Khan And Ibrahim. They Are Busy Working Out',
 "Sonia Gandhi Wants Credit Of Centre's Efforts For Small Businesses: Prakash Javadekar"]

In [0]:
newsData = pd.DataFrame(headLinesViral + headLinesLatest,columns =['News_Data'])
newsData_test = pd.DataFrame(headLinesTest,columns =['News_Data'])

In [27]:
newsData.head()

Unnamed: 0,News_Data
0,Viral: A Petition To #UninstallWhatsApp For Am...
1,Viral Video Of Dolphins In Meerut Stuns Intern...
2,Watch: Stuntmen Choreograph Hilarious Fight Se...
3,E-Rickshaw Driver's Social Distancing Innovati...
4,Watch: RCB Director Mike Hesson Mesmerised By ...


In [28]:
newsData_test.head()

Unnamed: 0,News_Data
0,"Why Government's DA Decision Will Hurt, Not He..."
1,India Weighs Plan to Guarantee $39 Billion of ...
2,"Lockdown Will Stay In Red Zones; Push ""Do Gaz ..."
3,"Sensex, Nifty Likely To Open Higher On Positiv..."
4,E-Rickshaw Driver's Social Distancing Innovati...


## **Pre-Processing the data we got**

In [29]:
import re
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
#Function to remove non-alphanumericals
def remove_special_characters(data):
  newData = []
  for text in data:
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern, '', text)
    newData.append(text)
  return newData

In [0]:
# Lemmatizing
def get_lemmatized_text(corpus):
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word, pos="v") for word in review.split()]) for review in corpus]

In [0]:
def process_data(data):
  data_wsc = remove_special_characters(data)
  lemm_data = get_lemmatized_text(data_wsc)
  return(lemm_data)

In [0]:
# Getting processed Data
newsData['Processed_News'] = process_data(newsData['News_Data'])
newsData_test['Processed_News'] = process_data(newsData_test['News_Data'])

In [34]:
newsData.head()

Unnamed: 0,News_Data,Processed_News
0,Viral: A Petition To #UninstallWhatsApp For Am...,Viral A Petition To UninstallWhatsApp For Amit...
1,Viral Video Of Dolphins In Meerut Stuns Intern...,Viral Video Of Dolphins In Meerut Stuns Intern...
2,Watch: Stuntmen Choreograph Hilarious Fight Se...,Watch Stuntmen Choreograph Hilarious Fight Seq...
3,E-Rickshaw Driver's Social Distancing Innovati...,ERickshaw Drivers Social Distancing Innovation...
4,Watch: RCB Director Mike Hesson Mesmerised By ...,Watch RCB Director Mike Hesson Mesmerised By 6...


In [35]:
newsData_test.head()

Unnamed: 0,News_Data,Processed_News
0,"Why Government's DA Decision Will Hurt, Not He...",Why Governments DA Decision Will Hurt Not Help...
1,India Weighs Plan to Guarantee $39 Billion of ...,India Weighs Plan to Guarantee 39 Billion of S...
2,"Lockdown Will Stay In Red Zones; Push ""Do Gaz ...",Lockdown Will Stay In Red Zones Push Do Gaz Do...
3,"Sensex, Nifty Likely To Open Higher On Positiv...",Sensex Nifty Likely To Open Higher On Positive...
4,E-Rickshaw Driver's Social Distancing Innovati...,ERickshaw Drivers Social Distancing Innovation...


# **Training the Model And Predicting the result**

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [0]:
# Function for returning Ngram Model

def Ngram(review,review_test,maxRange=2):
  ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, maxRange))
  ngram_vectorizer.fit(review)
  X = ngram_vectorizer.transform(review)
  X_test = ngram_vectorizer.transform(review_test)
  return(X,X_test)

# Function for returning TF - IDF (Term Frequency — Inverse Document Frequency) vectoriser the given data

def TF_idf(review,review_test):
  tfidf_vectorizer = TfidfVectorizer()
  tfidf_vectorizer.fit(review)
  X = tfidf_vectorizer.transform(review)
  X_test = tfidf_vectorizer.transform(review_test)
  return(X,X_test)

In [0]:
#spiltting the datapoints into train and validation set

X,X_test = TF_idf(newsData['Processed_News'],newsData_test['Processed_News'])
#X,X_test = Ngram(newsData['Processed_News'],newsData_test['Processed_News'],2)
X_train,X_val,y_train,y_val = train_test_split(X, Y, train_size=0.7)

In [39]:
#Logistic regression model to train the data

lr_model = LogisticRegression(C=1)
lr_model.fit(X_train,y_train)
lr_predict = lr_model.predict(X_val)

#Validating the model's accuracy of Logistic Regression

print('Validation F1-score : ' + str(f1_score(lr_predict,y_val)))
print('Validation Accuracy : ' + str(accuracy_score(lr_predict,y_val)))

Validation F1-score : 0.8532883642495784
Validation Accuracy : 0.8571428571428571


In [40]:
# SVM model to train the data

svm = LinearSVC()
svm.fit(X_train, y_train)
pred_svm = svm.predict(X_val)

#Validating the model's accuracy of SVM Model

print('Validation F1-score : ' + str(f1_score(pred_svm,y_val)))
print('Validation Accuracy : ' + str(accuracy_score(pred_svm,y_val)))

Validation F1-score : 0.8685524126455907
Validation Accuracy : 0.8702791461412152


# **Predicting Virality of News using Logistic Regression**

In [42]:
# Saving the prediction to 
lr_model.fit(X,Y)
y_test_predict = lr_model.predict_proba(X_test)[:,1]
y_predict_df = pd.DataFrame({'News':headLinesTest,'Likelihood of Virality':y_test_predict*100})
y_predict_df.to_csv('Virality Predictions.csv')
y_predict_df.head()

Unnamed: 0,News,Likelihood of Virality
0,"Why Government's DA Decision Will Hurt, Not He...",32.094474
1,India Weighs Plan to Guarantee $39 Billion of ...,19.974552
2,"Lockdown Will Stay In Red Zones; Push ""Do Gaz ...",16.596383
3,"Sensex, Nifty Likely To Open Higher On Positiv...",17.557001
4,E-Rickshaw Driver's Social Distancing Innovati...,57.453482
