In [None]:
# Bipolar Factory Intership Problem Statement Solution 
# Date : 26/04/2020
# Time: 7:20pm

# News Scraping 

In [76]:
import requests  
r = requests.get('https://www.nytimes.com/interactive/2017/06/23/opinion/trumps-lies.html',timeout=20)

In [77]:
print(r.text[0:500])

<!DOCTYPE html>
<!--[if (gt IE 9)|!(IE)]> <!--><html lang="en" class="no-js page-interactive section-opinion page-theme-standard tone-opinion page-interactive-default limit-small layout-xlarge app-interactive" itemid="https://www.nytimes.com/interactive/2017/06/23/opinion/trumps-lies.html" itemtype="http://schema.org/NewsArticle" itemscope xmlns:og="http://opengraphprotocol.org/schema/"><!--<![endif]-->
<!--[if IE 9]> <html lang="en" class="no-js ie9 lt-ie10 page-interactive section-opinion page


In [78]:
from bs4 import BeautifulSoup  
soup = BeautifulSoup(r.text, 'html.parser')  

In [79]:
results = soup.find_all('span', attrs={'class':'short-desc'})

In [80]:
len(results)

180

In [81]:
results[0:3]

[<span class="short-desc"><strong>Jan. 21 </strong>“I wasn't a fan of Iraq. I didn't want to go into Iraq.” <span class="short-truth"><a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a></span></span>,
 <span class="short-desc"><strong>Jan. 21 </strong>“A reporter for Time magazine — and I have been on their cover 14 or 15 times. I think we have the all-time record in the history of Time magazine.” <span class="short-truth"><a href="http://nation.time.com/2013/11/06/10-things-you-didnt-know-about-time/" target="_blank">(Trump was on the cover 11 times and Nixon appeared 55 times.)</a></span></span>,
 <span class="short-desc"><strong>Jan. 23 </strong>“Between 3 million and 5 million illegal votes caused me to lose the popular vote.” <span class="short-truth"><a href="https://www.nytimes.com/2017/01/23/us/politics/donald-trump-congress-democrats.html" target="_

# Inserting the news headlines with date and other information into a CSV file

In [82]:
records = []  
for result in results:  
    date = result.find('strong').text[0:-1] + ', 2017'
    lie = result.contents[1][1:-2]
    explanation = result.find('a').text[1:-1]
    url = result.find('a')['href']
    records.append((date, lie, explanation, url))

import pandas as pd  
df = pd.DataFrame(records, columns=['date', 'lie', 'explanation', 'url'])  
df['date'] = pd.to_datetime(df['date'])  
df.to_csv('trump_lies.csv', index=False, encoding='utf-8') 

In [83]:
import csv
import re
from sklearn.naive_bayes import MultinomialNB
df = pd.read_csv('trump_lies.csv')
df.head()

Unnamed: 0,date,lie,explanation,url
0,2017-01-21,I wasn't a fan of Iraq. I didn't want to go in...,He was for an invasion before he was against it.,https://www.buzzfeed.com/andrewkaczynski/in-20...
1,2017-01-21,A reporter for Time magazine — and I have been...,Trump was on the cover 11 times and Nixon appe...,http://nation.time.com/2013/11/06/10-things-yo...
2,2017-01-23,Between 3 million and 5 million illegal votes ...,There's no evidence of illegal voting.,https://www.nytimes.com/2017/01/23/us/politics...
3,2017-01-25,"Now, the audience was the biggest ever. But th...",Official aerial photos show Obama's 2009 inaug...,https://www.nytimes.com/2017/01/21/us/politics...
4,2017-01-25,Take a look at the Pew reports (which show vot...,The report never mentioned voter fraud.,https://www.nytimes.com/2017/01/24/us/politics...


# As we need to know about the headlines that can go viral we will remove other columns from csv

In [84]:
records = []  
for result in results:  
    news = result.contents[1][1:-2]
    records.append((news))
import pandas as pd  
df = pd.DataFrame(records, columns=['news'])   
df.to_csv('trump_lies.csv', index=False, encoding='utf-8') 

# Let's classify our news whether they will get viral or not

In [85]:
from sklearn.svm import LinearSVC

In [86]:
records[:5]

["I wasn't a fan of Iraq. I didn't want to go into Iraq.",
 'A reporter for Time magazine — and I have been on their cover 14 or 15 times. I think we have the all-time record in the history of Time magazine.',
 'Between 3 million and 5 million illegal votes caused me to lose the popular vote.',
 'Now, the audience was the biggest ever. But this crowd was massive. Look how far back it goes. This crowd was massive.',
 'Take a look at the Pew reports (which show voter fraud.)']

In [87]:
len(records)

180

# We have taken other news headline dataset just to train our model and finally apply that training to our SCRAPED DATASET, So we have used 2 datasets one for training and other for testing

In [88]:
with open("viral.txt",encoding="utf-8") as f:
    lines = f.read().strip().split("\n")
    lines = [line.split("\t") for line in lines]
headlines, labels = zip(*lines)

In [89]:
headlines[:5]

("Egypt's top envoy in Iraq confirmed killed",
 'Carter: Race relations in Palestine are worse than apartheid',
 'After Years Of Dutiful Service, The Shiba Who Ran A Tobacco Shop Retires',
 'In Books on Two Powerbrokers, Hints of the Future',
 'These Horrifyingly Satisfying Photos Of "Baby Foot" Will Haunt You')

In [90]:
labels[:5]

('0', '0', '1', '0', '1')

In [91]:
# 1= Viral Headline
# 0 = Non Viral Headline

# Splitting into training and testing dataset

In [92]:
len(headlines)

10000

In [93]:
train_headlines = headlines[:8000]
test_headlines = records[1:180]
train_labels = labels[:8000]

In [94]:
# Create a vectorizer and classifier
vectorizer = TfidfVectorizer()
svm = LinearSVC()

In [95]:
# Transform our text data into numerical vectors
train_vectors = vectorizer.fit_transform(train_headlines)
test_vectors = vectorizer.transform(test_headlines)

In [96]:
# Train the classifier and predict on test set
svm.fit(train_vectors, train_labels)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [97]:
predictions = svm.predict(test_vectors)

In [98]:
test_headlines[:5]

['A reporter for Time magazine — and I have been on their cover 14 or 15 times. I think we have the all-time record in the history of Time magazine.',
 'Between 3 million and 5 million illegal votes caused me to lose the popular vote.',
 'Now, the audience was the biggest ever. But this crowd was massive. Look how far back it goes. This crowd was massive.',
 'Take a look at the Pew reports (which show voter fraud.)',
 "You had millions of people that now aren't insured anymore."]

# The final output showing whether the scraped headline will go viral or not.

In [99]:
predictions[:180]

array(['1', '0', '1', '1', '1', '1', '1', '0', '1', '1', '0', '1', '0',
       '1', '1', '1', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '0', '1', '1', '1', '1', '1',
       '0', '1', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0',
       '1', '1', '0', '0', '1', '1', '1', '1', '1', '1', '1', '0', '1',
       '0', '0', '1', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '0', '0', '1', '1', '1', '1', '1', '1'], dtype=

# As you can see there is a lot of '1' which shows that the news will get viral and '0' it won't. The lot of '1' indicate as our test dataset is from New York times and related to America's President Trump, So probably it is showing the correct result.

# But we can check the accuracy of our trained model using labelled data 

In [101]:
# Break viral dataset into test and split
train_headlines = headlines[:8000]
test_headlines = headlines[8000:]

train_labels = labels[:8000]
test_labels = labels[8000:]

In [102]:
train_vectors = vectorizer.fit_transform(train_headlines)
test_vectors = vectorizer.transform(test_headlines)

In [103]:
predictions = svm.predict(test_vectors)

In [104]:
test_headlines[0:5]

('The Earliest I\'ve Said "I Love You"',
 "Stop What You're Doing And Worship These Matt Bomer Pictures",
 '23 Of The Funniest "Nancy Drew" Game Memes',
 'Policeman killed in football-related violence in Italy',
 'Do You Remember Which Disney Star Sang These Lyrics')

In [105]:
predictions[:5]

array(['1', '1', '1', '0', '1'], dtype='<U1')

In [106]:
test_labels[:5]

('1', '1', '1', '0', '1')

In [107]:
accuracy_score(test_labels, predictions)

0.961

# So the accuracy of our model is 96% for labelled data and for unlabelled data we have provide label whether it will go viral or not just above. I hope this will somehow justify your problem statement. I thankyou for this opportunity.