In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
import os
import sys
import string
import nltk
import pandas as pd
import pickle

from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords

tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')

module_path = os.path.abspath(os.path.join('../../src'))
if module_path not in sys.path:
    sys.path.append(module_path)
# Custom modules 
from modules import preprocessing as pp
from modules import graph
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tjh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
stop_words = set(stopwords.words("english"))
# Extend stopwords (see analysis below)
extension = {
    'trumps',
    'trump',
    'obama',
    'donald',
    'new',
    'u',
    'tramp'
}
stop_words.update(extension)

In [3]:
# The first function just has to access a url and return a list of all headlines
url = 'https://twitter.com/home'
response = requests.get(url)

def get_headlines(response_text, tags=['h1', 'h2', 'h3', 'h4']):
    soup = BeautifulSoup(response_text, 'lxml')
    headers = soup.find_all(tags)
    return [header.text for header in headers]

def clean_headlines(title, length):
#     if len(title.split()) >= length:
#         return None
#     else:
        # strip newline characters
    title = title.replace("\n", "")
    title = title.replace("\t", "")
    title = pp.remove_non_ascii_chars(title)
    title = pp.lower_case(title)
    title = pp.remove_contractions(title)
    title = pp.lemmetise_series(title)
    title = "".join([char for char in title if char not in string.punctuation])
    # remove stopwords
    title = " ".join([char for char in tokenizer.tokenize(title) if char not in stop_words ])
    if len(title.split()) < length:
         return None

    return title
# Convert to ascii
# lower case 
# remove everything that is not printable.
    
def get_cleaned_headlines(url, length=3, tags=['h1', 'h2', 'h3']):
    text = requests.get(url).text
    return [clean_headlines(headline, length) for headline in get_headlines(text, tags=tags)]

def convert_list_to_X(cleaned_headlines, pipeline):
    # Convert list to a pandas sereios
    series = pd.Series(cleaned_headlines, name='title')
    X = pipeline.fit(X)
    return X


In [4]:
test = pd.Series(clean_headlines('clickbait is cancer', 2), name='title')

In [5]:
url = 'https://www.nytimes.com/'
cleaned_headlines = pd.Series(pp.get_cleaned_headlines(url), name='title')
series = cleaned_headlines.dropna()
series

0                            wednesday evening briefing
1                                  listen american life
2                                   modern love podcast
3     staring deadline congress nears 900 billion st...
4     hospital discover surprise vaccine deliveries ...
6     pandemic yet official stress us vaccination begin
7     vaccination campaign nursing home face obstacl...
8                  effective mask wearing may know soon
9     health worker alaska serious allergic reaction...
11                            case per caput arehighest
12                                          us hot spot
15                           trackerschoose place track
17                                          us hot spot
22                                          us hot spot
27    billion spent us defense failed detect giant r...
28           bidens inaugural mostly virtual money real
29    biden introduces buttigieg pick lead transport...
30    president elect joe biden expected name br

In [6]:
f = open('./../../src/models/model1.1.pickle', 'rb')
clf = pickle.load(f)
f = open('./../../src/models/tfidf1.1.pickle', 'rb')
tfidf = pickle.load(f)

In [7]:
test = pd.Series(pp.clean_headlines('There’s A Sound That Apparently Only Teenagers Can Hear. Can You Hear It?', 2), name='title')

In [8]:
X = test

X_tfidf = tfidf.transform(X)
predictions = clf.predict(X_tfidf)


In [9]:
predictions.sum()/predictions.shape[0]

0.0

In [10]:
target = pd.Series(predictions, name='target')

df = pd.DataFrame(list((zip(series, target))), columns=['title', 'target'])
df


Unnamed: 0,title,target
0,wednesday evening briefing,0


In [11]:
clean_headlines("Timothée Chalamet’s “SNL” Impression Of Harry Styles Is Doing Weird Things To Me", length=3)

'timothee chalamets snl impression harry style weird thing'

In [12]:
get_headlines(response.text)

['This browser is no longer supported.']

In [13]:
pp.remove_contractions(pp.lower_case(pp.remove_non_ascii_chars('Things From Amazon That’ll Make Perfect Gifts')))

'things from amazon that will make perfect gifts'

In [14]:
def print_headlines(response_text):
    soup = BeautifulSoup(response_text, 'lxml')
    headlines = soup.find_all(attrs={"itemprop": "headline"})
    for headline in headlines:
        print(headline.text)
        
def print_text(response_text):
    soup = BeautifulSoup(response_text, 'lxml')
    text = soup.find_all("h1")
    print(text)

In [15]:
url = 'https://www.cnn.com/2020/12/13/health/us-coronavirus-sunday/index.html'
response = requests.get(url)
print_headlines(response.text)




In [16]:
print_text(response.text)

[<h1 class="pg-headline">Covid-19 vaccine en route to every state as health officials say they hope immunizations begin Monday</h1>]


In [17]:
def replace(string, replacement_message, html):
    response = requests.get(html)
    soup = BeautifulSoup(response.text)
    nodes_to_censor = soup.findAll(text=re.compile(string))
    for node in nodes_to_censor:
        node.replaceWith(replacement_message)
        print(node)

__file__ = 'buffer.html'
base = os.path.dirname(os.path.abspath(__file__))
html = open(os.path.join(base, 'buffer.html'))
soup = BeautifulSoup(html, 'html.parser')

for i in soup.find('div', {"id":None}).findChildren():
    i.replace_with('##')

with open("example_modified.html", "wb") as f_output:
    f_output.write(soup.prettify("utf-8"))  

FileNotFoundError: [Errno 2] No such file or directory: '/Users/tjh/Flatiron/capstone/bait-n-switch/notebooks/EDA/buffer.html'

In [None]:
__file__ = 'buffer.html'
base = os.path.dirname(os.path.abspath(__file__))
base

In [None]:
os.path.abspath('models')