In [18]:
import requests
from bs4 import BeautifulSoup
import json
import re
import os
import sys
import string
import nltk
import pandas as pd
import pickle

from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords

# Tokenizer that keeps only runs of ASCII letters and digits; everything
# else (whitespace, punctuation) acts as a separator.
tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')

# Put the project's `src` directory (two levels up from the current working
# directory) on sys.path so the custom `modules` package can be imported.
# This must run before the `from modules import ...` lines below.
module_path = os.path.abspath(os.path.join('../../src'))
if module_path not in sys.path:
    sys.path.append(module_path)
# Custom modules (project-local; resolvable only after the path tweak above)
from modules import preprocessing as pp
from modules import graph
# Fetch the NLTK POS-tagger data; presumably required by the lemmatisation
# helper in `pp` -- TODO confirm.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tjh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Note!

Not all websites use h tags for their headlines; some (a lot, actually) use href links instead! This is something to consider when it comes to designing the actual web evaluator.

In [19]:
# Extra tokens treated as stopwords on top of NLTK's English list
# (see analysis below).
extension = {'trumps', 'trump', 'obama', 'donald', 'new', 'u', 'tramp'}

# Union of the stock English stopwords and the custom extension.
stop_words = set(stopwords.words("english")) | extension

In [75]:
# The first function just has to access a url and return a list of all headlines
# Example article page used to exercise the helpers defined below.
url = 'https://www.upworthy.com/netflix-fan-shares-hack-code-finding-shows'
# Network call: `response` is reused by later cells (e.g. get_headlines(response.text)).
response = requests.get(url)

def get_headlines(response_text, tags=['h1', 'h2', 'h3', 'h4']):
    """Return the text of every heading element found in an HTML document.

    Parameters
    ----------
    response_text : str
        Raw HTML markup to parse (parsed with the 'lxml' parser).
    tags : list of str
        Tag names to collect; defaults to h1 through h4.

    Returns
    -------
    list of str
        The ``.text`` of each matching element, in the order ``find_all``
        yields them.
    """
    parsed = BeautifulSoup(response_text, 'lxml')
    headlines = []
    for element in parsed.find_all(tags):
        headlines.append(element.text)
    return headlines

def clean_headlines(title, length):
    """Normalise a raw headline and reject it if too few tokens remain.

    The steps run in this order: newlines and tabs become spaces, then
    ``pp.remove_non_ascii_chars``, ``pp.lower_case``,
    ``pp.remove_contractions`` and ``pp.lemmetise_series`` are applied,
    every character in ``string.punctuation`` is removed, and finally the
    text is split with the module-level ``tokenizer`` and tokens found in
    ``stop_words`` are dropped.

    Parameters
    ----------
    title : str
        Raw headline text, e.g. the ``.text`` of a heading element.
    length : int
        Minimum number of tokens the cleaned headline must contain.

    Returns
    -------
    str or None
        The remaining tokens joined by single spaces, or ``None`` when
        fewer than ``length`` tokens are left.
    """
    # Replace newlines/tabs with a space instead of deleting them: deleting
    # fused words whose only separator was a newline or tab ("Foo\nBar" ->
    # "foobar"). The extra whitespace is discarded by the tokenizer below.
    title = title.replace("\n", " ").replace("\t", " ")
    title = pp.remove_non_ascii_chars(title)
    title = pp.lower_case(title)
    title = pp.remove_contractions(title)
    title = pp.lemmetise_series(title)
    title = "".join(char for char in title if char not in string.punctuation)
    # Tokenise and drop stopwords.
    tokens = [token for token in tokenizer.tokenize(title) if token not in stop_words]
    if len(tokens) < length:
        return None

    return " ".join(tokens)
# Convert to ascii
# lower case 
# remove everything that is not printable.
    
def get_cleaned_headlines(url, length=3, tags=['h1', 'h2', 'h3'], timeout=10):
    """Download a page and return its cleaned headlines.

    Parameters
    ----------
    url : str
        Address of the page to fetch with ``requests.get``.
    length : int
        Minimum token count forwarded to ``clean_headlines``.
    tags : list of str
        Heading tag names forwarded to ``get_headlines``. The default list
        is only read, never mutated.
    timeout : float
        Seconds to wait for the server before ``requests`` raises a timeout
        exception. Without a timeout the request can block indefinitely.

    Returns
    -------
    list
        One entry per heading found, in document order. Each entry is the
        cleaned headline string, or ``None`` where ``clean_headlines``
        rejected it for being too short.
    """
    text = requests.get(url, timeout=timeout).text
    return [clean_headlines(headline, length) for headline in get_headlines(text, tags=tags)]

def convert_list_to_X(cleaned_headlines, pipeline):
    """Turn a list of cleaned headlines into model input.

    The original body called ``pipeline.fit(X)`` before ``X`` was assigned,
    so it always raised ``UnboundLocalError`` and never used the Series it
    built. This version passes the Series to ``pipeline.transform``.

    NOTE(review): ``transform`` (rather than ``fit``) is used on the
    assumption that ``pipeline`` is already fitted, mirroring how the
    notebook calls ``tfidf.transform`` elsewhere -- confirm.

    Parameters
    ----------
    cleaned_headlines : list
        Cleaned headline strings; ``None`` entries are dropped before the
        transform.
    pipeline : object
        Any object exposing ``transform(series)``.

    Returns
    -------
    Whatever ``pipeline.transform`` returns for the non-missing headlines.
    """
    # Convert the list to a pandas Series and discard missing entries,
    # which a text vectoriser cannot process.
    series = pd.Series(cleaned_headlines, name='title').dropna()
    X = pipeline.transform(series)
    return X


In [76]:
# Smoke test: clean a literal headline with the notebook-local
# clean_headlines (minimum length 2) and wrap the result in a Series named 'title'.
test = pd.Series(clean_headlines('clickbait is cancer', 2), name='title')

In [78]:
# Scrape and clean the example page. Note this calls the packaged
# pp.get_cleaned_headlines, not the function of the same name defined in this notebook.
url = 'https://www.upworthy.com/netflix-fan-shares-hack-code-finding-shows'
cleaned_headlines = pd.Series(pp.get_cleaned_headlines(url), name='title')
# Drop missing entries (presumably headlines rejected as too short -- verify
# against pp.get_cleaned_headlines).
series = cleaned_headlines.dropna()
series

0     netflix viewer cannot believe heard secret hac...
1     megan wa trafficked year fbi rescued found ref...
2     terminally ill washington elector openly wept ...
3     megan wa trafficked year fbi rescued found ref...
4     happens drinking 1 2 3 glass wine 19 viral pho...
5     christmas gargoyle spark epic decoration war n...
6     proud boy tore stomped set fire black churches...
7     author whose son died 14 year ago ha word hope...
8     woman found 4 yr olds fairy house spent 9 mont...
9     gift give back shop upworthiest place gift giving
10    uk change blood donation policy allowing gay b...
11    dr bidens response sexist op ed suggesting dro...
Name: title, dtype: object

In [79]:
# Load the persisted classifier and TF-IDF object.
# NOTE(security): pickle.load can execute arbitrary code; only load pickle
# files that come from a trusted source.
# `with` closes each file handle as soon as loading finishes. The original
# left both handles open and lost the first when `f` was rebound.
with open('./../../src/models/model1.1.pickle', 'rb') as f:
    clf = pickle.load(f)
with open('./../../src/models/tfidf1.1.pickle', 'rb') as f:
    tfidf = pickle.load(f)

In [80]:
# Same smoke test as before, but through the packaged pp.clean_headlines
# and with a headline containing a non-ASCII apostrophe.
test = pd.Series(pp.clean_headlines('There’s A Sound That Apparently Only Teenagers Can Hear. Can You Hear It?', 2), name='title')

In [81]:
# Model input: the cleaned, non-missing headlines from the scrape above.
X = series

# Vectorise with the unpickled `tfidf` object, then predict with the unpickled `clf`.
X_tfidf = tfidf.transform(X)
predictions = clf.predict(X_tfidf)


In [82]:
# Sum of the predictions divided by their count, i.e. the mean prediction.
# Presumably the share of headlines flagged as clickbait if labels are 0/1 -- confirm.
predictions.sum()/predictions.shape[0]

0.4166666666666667

In [83]:
# Wrap the predictions in a Series named 'target'.
target = pd.Series(predictions, name='target')

# Pair each headline with its prediction positionally (zip ignores the
# index of either Series) and show the result as a two-column table.
df = pd.DataFrame(list((zip(series, target))), columns=['title', 'target'])
df


Unnamed: 0,title,target
0,netflix viewer cannot believe heard secret hac...,1
1,megan wa trafficked year fbi rescued found ref...,0
2,terminally ill washington elector openly wept ...,0
3,megan wa trafficked year fbi rescued found ref...,0
4,happens drinking 1 2 3 glass wine 19 viral pho...,1
5,christmas gargoyle spark epic decoration war n...,0
6,proud boy tore stomped set fire black churches...,0
7,author whose son died 14 year ago ha word hope...,1
8,woman found 4 yr olds fairy house spent 9 mont...,1
9,gift give back shop upworthiest place gift giving,1


In [41]:
# Check the notebook-local clean_headlines on a headline containing accented
# letters and curly quotes.
clean_headlines("Timothée Chalamet’s “SNL” Impression Of Harry Styles Is Doing Weird Things To Me", length=3)

'timothee chalamets snl impression harry style weird thing'

In [42]:
# Extract headings from the previously fetched `response` using the default tag list.
get_headlines(response.text)

['This browser is no longer supported.']

In [13]:
# Run the first three preprocessing steps in isolation: strip non-ASCII
# characters, lower-case, then expand contractions.
pp.remove_contractions(pp.lower_case(pp.remove_non_ascii_chars('Things From Amazon That’ll Make Perfect Gifts')))

'things from amazon that will make perfect gifts'

In [14]:
def print_headlines(response_text):
    """Print the text of every element whose ``itemprop`` attribute is "headline".

    ``response_text`` is raw HTML, parsed with the 'lxml' parser. One line is
    printed per matching element; nothing is returned.
    """
    parsed = BeautifulSoup(response_text, 'lxml')
    for element in parsed.find_all(attrs={"itemprop": "headline"}):
        print(element.text)
        
def print_text(response_text):
    """Print the list of all ``<h1>`` elements found in the given HTML.

    ``response_text`` is raw HTML, parsed with the 'lxml' parser. The full
    result of ``find_all("h1")`` is printed as a single list (tags included);
    nothing is returned.
    """
    print(BeautifulSoup(response_text, 'lxml').find_all("h1"))

In [15]:
# Try a second site: fetch a CNN article (network call; rebinds `url` and
# `response`) and look for elements tagged itemprop="headline".
url = 'https://www.cnn.com/2020/12/13/health/us-coronavirus-sunday/index.html'
response = requests.get(url)
print_headlines(response.text)




In [16]:
# Same page, this time listing its <h1> elements.
print_text(response.text)

[<h1 class="pg-headline">Covid-19 vaccine en route to every state as health officials say they hope immunizations begin Monday</h1>]


In [17]:
def replace(string, replacement_message, html):
    """Fetch a page and replace every text node matching a pattern.

    Parameters
    ----------
    string : str
        Regular-expression pattern, compiled with ``re.compile``. Text nodes
        the pattern matches are replaced.
    replacement_message : str
        Text substituted for each matching node.
    html : str
        Despite the name, this is the URL passed to ``requests.get``, not
        HTML markup.

    Each replaced node is printed; after ``replace_with`` that is the
    original node, now detached from the tree. Nothing is returned and the
    modified tree is discarded when the function exits.
    """
    response = requests.get(html)
    # Name the parser explicitly, as every other BeautifulSoup call in this
    # notebook does. Leaving it out makes bs4 guess and emit a warning.
    soup = BeautifulSoup(response.text, 'lxml')
    # find_all / string= / replace_with are the current spellings of the
    # legacy findAll / text= / replaceWith.
    nodes_to_censor = soup.find_all(string=re.compile(string))
    for node in nodes_to_censor:
        node.replace_with(replacement_message)
        print(node)

# A notebook has no real __file__, so one is faked here. abspath() resolves
# it against the current working directory, so `base` ends up being that directory.
__file__ = 'buffer.html'
base = os.path.dirname(os.path.abspath(__file__))
# Read the input inside a `with` block so the handle is closed once parsing
# is done. The original left it open.
with open(os.path.join(base, 'buffer.html')) as html:
    soup = BeautifulSoup(html, 'html.parser')

# Replace every descendant of the first <div> that has no id attribute with '##'.
for i in soup.find('div', {"id":None}).findChildren():
    i.replace_with('##')

# prettify("utf-8") returns bytes, hence the binary write mode.
with open("example_modified.html", "wb") as f_output:
    f_output.write(soup.prettify("utf-8"))

FileNotFoundError: [Errno 2] No such file or directory: '/Users/tjh/Flatiron/capstone/bait-n-switch/notebooks/EDA/buffer.html'

In [None]:
# Debugging the FileNotFoundError above: show which directory `base` resolves to.
# abspath() resolves the faked __file__ against the current working directory.
__file__ = 'buffer.html'
base = os.path.dirname(os.path.abspath(__file__))
base

In [None]:
# Show how a relative path ('models') resolves against the current working directory.
os.path.abspath('models')