In [25]:
import requests
from bs4 import BeautifulSoup
import json
import re
import os
import sys
import string
import nltk
import pandas as pd
import pickle

from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords

tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')

module_path = os.path.abspath(os.path.join('../../src'))
if module_path not in sys.path:
    sys.path.append(module_path)
# Custom modules 
from modules import preprocessing as pp
from modules import graph
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/TjH/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [26]:
stop_words = set(stopwords.words("english"))
# Extend stopwords (see analysis below)
extension = {
    'trumps',
    'trump',
    'obama',
    'donald',
    'new',
    'u',
    'tramp'
}
stop_words.update(extension)

In [27]:
# The first function just has to access a url and return a list of all headlines
url = 'https://twitter.com/home'
response = requests.get(url)

def get_headlines(response_text, tags=['h1', 'h2', 'h3', 'h4']):
    soup = BeautifulSoup(response_text, 'lxml')
    headers = soup.find_all(tags)
    return [header.text for header in headers]

def clean_headlines(title, length):
#     if len(title.split()) >= length:
#         return None
#     else:
        # strip newline characters
    title = title.replace("\n", "")
    title = title.replace("\t", "")
    title = pp.remove_non_ascii_chars(title)
    title = pp.lower_case(title)
    title = pp.remove_contractions(title)
    title = pp.lemmetise_series(title)
    title = "".join([char for char in title if char not in string.punctuation])
    # remove stopwords
    title = " ".join([char for char in tokenizer.tokenize(title) if char not in stop_words ])
    if len(title.split()) < length:
         return None

    return title
# Convert to ascii
# lower case 
# remove everything that is not printable.
    
def get_cleaned_headlines(url, length=3, tags=['h1', 'h2', 'h3']):
    text = requests.get(url).text
    return [clean_headlines(headline, length) for headline in get_headlines(text, tags=tags)]

def convert_list_to_X(cleaned_headlines, pipeline):
    # Convert list to a pandas sereios
    series = pd.Series(cleaned_headlines, name='title')
    X = pipeline.fit(X)
    return X


In [28]:
test = pd.Series(clean_headlines('clickbait is cancer', 2), name='title')

In [29]:
url = 'https://www.nytimes.com/'
cleaned_headlines = pd.Series(pp.get_cleaned_headlines(url), name='title')
series = cleaned_headlines.dropna()
series

1                                    neediest case fund
2                             got confidential news tip
3     electoral college track formalize bidens victo...
4     key swing state affirm biden win despite pressure
5     long honor serving electoral college voter als...
6     us virus death toll cross 300000 vaccination b...
7           day hope reminder viruss devastating impact
8                 company require employee take vaccine
9     political infighting haphazard planning turned...
10                       2020 year sport everybody lost
11              restaurant thought 2020 could get worse
12    one study pregnant woman covid 19 experienced ...
13            american stuck home trade china roar back
14    john le carre master spy novel real action wa ...
15          life defied alzheimers death brain may show
17                    people actually pretty great year
19                          get herd immunity fake news
20             autonomous vehicle take another b

In [20]:
f = open('./../../src/models/model1.pickle', 'rb')
clf = pickle.load(f)
f = open('./../../src/models/tfidf1.pickle', 'rb')
tfidf = pickle.load(f)

In [21]:
test = pd.Series(pp.clean_headlines('clickbait is cancer here are 25 what you can do to change it', 2), name='title')

In [22]:
X = series

X_tfidf = tfidf.transform(X)
predictions = clf.predict(X_tfidf)


In [23]:
predictions.sum()/predictions.shape[0]

0.3448275862068966

In [24]:
target = pd.Series(predictions, name='target')

df = pd.DataFrame(list((zip(series, target))), columns=['title', 'target'])
df[df.target == 1].title


1                             got confidential news tip
6           day hope reminder viruss devastating impact
9                        2020 year sport everybody lost
10              restaurant thought 2020 could get worse
11    one study pregnant woman covid 19 experienced ...
13    john le carre master spy novel real action wa ...
16                    people actually pretty great year
18                                book really easy wrap
19                          ha never believed democracy
24                      test lifetimes let stop failing
Name: title, dtype: object

In [5]:
clean_headlines("Timothée Chalamet’s “SNL” Impression Of Harry Styles Is Doing Weird Things To Me", length=3)

'timothee chalamets snl impression harry style weird thing'

In [6]:
get_headlines(response.text)

['Your Weekend Briefing',
 'Listen to ‘The Sunday Read’',
 'The Neediest Cases Fund',
 'White House Staff Will Be Among the First in the U.S. to Get Vaccinated',
 ' ',
 '\n\n\t\t\t\t\t\t\t\tNew Reported Cases in the U.S.\n\t\t\t\t\t\t\t\n',
 '\n\n\t\t\t\t\t\t\t\tHow New Cases Are Changing by Day\n\t\t\t\t\t\t\t\n',
 'After the Students Came Back, Deaths Rose in College Towns',
 'These health pass apps could help reopen businesses, but could also exclude people from travel and workplaces.',
 '2020 Was Especially Deadly. Covid Wasn’t the Only Culprit.',
 'Germany is going into lockdown ahead of Christmas, closing stores and schools and restricting meetings.',
 'Despite early worries, the threat of dueling flu and coronavirus outbreaks may be waning.',
 'Tracking the Coronavirus ›',
 'Where cases per capita are\n\t\thighest',
 'U.S. hot spots ›',
 'College cases ›',
 'Worldwide ›',
 'Other trackers:\nChoose your own places to track',
 'Other trackers:',
 'U.S. hot spots ›',
 'Worldwide ›'

In [7]:
pp.remove_contractions(pp.lower_case(pp.remove_non_ascii_chars('Things From Amazon That’ll Make Perfect Gifts')))

'things from amazon that will make perfect gifts'

True

In [9]:
def print_headlines(response_text):
    soup = BeautifulSoup(response_text, 'lxml')
    headlines = soup.find_all(attrs={"itemprop": "headline"})
    for headline in headlines:
        print(headline.text)
        
def print_text(response_text):
    soup = BeautifulSoup(response_text, 'lxml')
    text = soup.find_all("h1")
    print(text)

In [10]:
url = 'https://www.cnn.com/2020/12/13/health/us-coronavirus-sunday/index.html'
response = requests.get(url)
print_headlines(response.text)




In [11]:
print_text(response.text)

[<h1 class="pg-headline">CDC officially allows coronavirus vaccine to be administered as shipments begin in US</h1>]


In [12]:
def replace(string, replacement_message, html):
    response = requests.get(html)
    soup = BeautifulSoup(response.text)
    nodes_to_censor = soup.findAll(text=re.compile(string))
    for node in nodes_to_censor:
        node.replaceWith(replacement_message)
        print(node)

__file__ = 'buffer.html'
base = os.path.dirname(os.path.abspath(__file__))
html = open(os.path.join(base, 'buffer.html'))
soup = BeautifulSoup(html, 'html.parser')

for i in soup.find('div', {"id":None}).findChildren():
    i.replace_with('##')

with open("example_modified.html", "wb") as f_output:
    f_output.write(soup.prettify("utf-8"))  

AttributeError: 'NoneType' object has no attribute 'findChildren'

In [None]:
__file__ = 'buffer.html'
base = os.path.dirname(os.path.abspath(__file__))
base

In [None]:
os.path.abspath('models')