# Download Data

In [2]:
import gzip
import subprocess
import urllib
import urllib.request

import pandas as pd

CLICKBAIT_DATASET_URL = "https://github.com/bhargaviparanjape/clickbait/raw/master/dataset/"


def download_titles(name):
    """Download ``<name>.gz`` from the clickbait dataset and load its headlines.

    The archive is saved in the working directory as ``<name>.gz`` and read
    in-process with ``gzip``, so the cell needs no external ``gunzip`` binary
    and can be re-run safely (each run overwrites the archive and re-reads it).

    Parameters
    ----------
    name : str
        Base file name in the dataset folder, without the ``.gz`` suffix.

    Returns
    -------
    pandas.DataFrame
        A single column labelled ``0`` holding one headline per row;
        blank lines are skipped.
    """
    archive = name + ".gz"
    urllib.request.urlretrieve(CLICKBAIT_DATASET_URL + archive, archive)
    with gzip.open(archive, "rt", encoding="utf-8") as lines:
        titles = [line.rstrip("\r\n") for line in lines]
    # Build the frame from a dict so column 0 exists even if the file is empty.
    return pd.DataFrame({0: [title for title in titles if title.strip()]})


clickbait_titles = download_titles("clickbait_data")
print(clickbait_titles.head())

non_clickbait_titles = download_titles("non_clickbait_data")
print(non_clickbait_titles.head())

all_links = pd.read_json("http://fake-news-detector-api.herokuapp.com/links/all")
all_links[0:5]

                                                   0
0                                 Should I Get Bings
1      Which TV Female Friend Group Do You Belong In
2  The New "Star Wars: The Force Awakens" Trailer...
3  This Vine Of New York On "Celebrity Big Brothe...
4  A Couple Did A Stunning Photo Shoot With Their...
                                                   0
0  Bill Changing Credit Card Rules Is Sent to Oba...
1  In Hollywood, the Easy-Money Generation Toughe...
2  1700 runners still unaccounted for in UK's Lak...
3  Yankees Pitchers Trade Fielding Drills for Put...
4  Large earthquake rattles Indonesia; Seventh in...


Unnamed: 0,category_id,clickbait_title,content,count,id,title,url,verified_category_id,verified_clickbait_title
0,1,,"Last month, a white woman in Oakland called th...",1,2083,WATCH: Oakland locals swarm park where racist ...,https://www.rawstory.com/2018/05/watch-oakland...,,
1,2,1.0,Update: Clifton Hill just announced their pric...,1,2082,Niagara Falls' real-life Mario Kart track fina...,http://dailyhive.com/toronto/niagara-falls-rea...,,
2,1,,Note: This page is a reproduction of the Hilla...,1,2081,Poverty - The Office of Hillary Rodham Clinton,https://www.hillaryclinton.com/issues/poverty/,,
3,1,0.0,As mães que tiveram seus filhos assassinados p...,1,2080,As mães que tiveram seus filhos assassinados p...,https://theintercept.com/2018/05/13/maes-com-f...,,
4,1,0.0,No filme“Polícia Federal — A lei é para todos”...,1,2079,“Polícia Federal — A lei é para todos”. Só que...,https://interc.pt/2IEuLZQ,,


# Build DataFrame

In [117]:
from sklearn.model_selection import train_test_split
import numpy as np

# Work on a copy so re-running this cell never mutates the downloaded frame.
df = all_links.copy()

# Treat empty titles as missing, drop them, then keep only titles longer than
# 30 characters. Assigning the result back (instead of a chained
# `inplace=True` on the column) keeps this working under pandas copy-on-write.
df["title"] = df["title"].replace("", np.nan)
df = df.dropna(subset=["title"])
# `.copy()` detaches the filtered rows so the column assignments below write
# to a real frame rather than a view of `all_links`' copy.
df = df.loc[df["title"].str.len() > 30].copy()

# A verified label, when present, takes precedence over `clickbait_title`.
df["clickbait_title"] = df["verified_clickbait_title"].fillna(df["clickbait_title"])

# 1 -> click bait, 0 -> not click bait, anything else (e.g. missing) -> 0.5.
df["is_clickbait"] = [0 if c == 0 else 1 if c == 1 else 0.5 for c in df["clickbait_title"]]

df = df[["title", "is_clickbait"]]

print("Number of click bait samples", int((df["is_clickbait"] == 1).sum()))

# Show a preview only; dumping the whole frame bloats the notebook.
df.head(10)

Number of click bait samples 143


Unnamed: 0,title,is_clickbait
0,WATCH: Oakland locals swarm park where racist ...,0.5
1,Niagara Falls' real-life Mario Kart track fina...,1.0
2,Poverty - The Office of Hillary Rodham Clinton,0.5
3,As mães que tiveram seus filhos assassinados p...,0.0
4,“Polícia Federal — A lei é para todos”. Só que...,0.0
5,Wolf in Sheep’s Clothing (or a Scientist’s Lab...,0.0
6,"globo.com - Absolutamente tudo sobre notícias,...",0.0
7,White people keep calling the cops on black pe...,0.5
8,"Donald Trump, Bernie Sanders, and Jill Stein a...",0.5
9,John McCain: ‘Vladimir Putin Is an Evil Man’,0.0


# Default classification approach, ignoring "I don't know"

In [118]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.dummy import DummyClassifier

# Seeded generator so the balanced sample and the shuffle are reproducible.
rng = np.random.RandomState(42)

# Balance the classes: keep every positive row and draw (at most) the same
# number of negative rows. `sample` draws whole rows, whereas
# `apply(np.random.permutation)` shuffled each column independently and left
# the values detached from their original index labels.
positive_df = df[df["is_clickbait"] == 1]
negative_pool = df[df["is_clickbait"] == 0]
negative_df = negative_pool.sample(
    n=min(len(positive_df), len(negative_pool)), random_state=rng
)
# `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0;
# `sample(frac=1)` shuffles the rows.
balanced_df = pd.concat([positive_df, negative_df]).sample(frac=1, random_state=rng)

# The pipeline's first step selects the 'title' column, so the label column
# carried along in X is never seen by the vectorizer.
X = balanced_df
y = (balanced_df["is_clickbait"] != 0).tolist()

pipeline = Pipeline([
    ('selector', FunctionTransformer(lambda x: x['title'], validate=False)),
    ('tfidf', TfidfVectorizer(strip_accents='ascii', ngram_range=(1, 3), max_df=0.5, min_df=2, token_pattern='[A-Za-z0-9]+')),
    ('clf', MultinomialNB()),
])

clf = GridSearchCV(pipeline, verbose=1, scoring='f1', param_grid={
    'clf': [
        MultinomialNB(),
        RandomForestClassifier(),
        # Baseline that always predicts the positive class; `strategy` is
        # passed by keyword because newer scikit-learn rejects it positionally.
        DummyClassifier(strategy="constant", constant=1.0)
    ],
    'tfidf__max_df': [0.5, 0.1],
    'tfidf__min_df': [1, 2],
    'tfidf__token_pattern': ['[A-Za-z0-9]+', r"(?u)\b\w\w+\b"]
})
clf = clf.fit(X, y)

# Flatten the searched parameters into their own columns for a readable table.
results = pd.DataFrame(clf.cv_results_)
param_names = ['clf', 'tfidf__max_df', 'tfidf__min_df', 'tfidf__token_pattern']
for param_name in param_names:
    results[param_name] = [params[param_name] for params in results['params']]
results[param_names + ['mean_test_score']]

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:    8.7s finished


Unnamed: 0,clf,tfidf__max_df,tfidf__min_df,tfidf__token_pattern,mean_test_score
0,"MultinomialNB(alpha=1.0, class_prior=None, fit...",0.5,1,[A-Za-z0-9]+,0.656321
1,"MultinomialNB(alpha=1.0, class_prior=None, fit...",0.5,1,(?u)\b\w\w+\b,0.669863
2,"MultinomialNB(alpha=1.0, class_prior=None, fit...",0.5,2,[A-Za-z0-9]+,0.607994
3,"MultinomialNB(alpha=1.0, class_prior=None, fit...",0.5,2,(?u)\b\w\w+\b,0.600811
4,"MultinomialNB(alpha=1.0, class_prior=None, fit...",0.1,1,[A-Za-z0-9]+,0.636817
5,"MultinomialNB(alpha=1.0, class_prior=None, fit...",0.1,1,(?u)\b\w\w+\b,0.629193
6,"MultinomialNB(alpha=1.0, class_prior=None, fit...",0.1,2,[A-Za-z0-9]+,0.563861
7,"MultinomialNB(alpha=1.0, class_prior=None, fit...",0.1,2,(?u)\b\w\w+\b,0.550628
8,"(DecisionTreeClassifier(class_weight=None, cri...",0.5,1,[A-Za-z0-9]+,0.330697
9,"(DecisionTreeClassifier(class_weight=None, cri...",0.5,1,(?u)\b\w\w+\b,0.371471


# Regressor Approach, taking "I don't know" into account

In [77]:
# Seeded generator so the balanced sample and the shuffle are reproducible.
rng = np.random.RandomState(42)

# Keep every positive row and draw (at most) the same number of rows from the
# negative pool and from the "I don't know" pool (labels other than 0 and 1).
# `sample` draws whole rows; `apply(np.random.permutation)` shuffled each
# column independently, which only worked because the label was constant.
positive_df = df[df["is_clickbait"] == 1]
negative_pool = df[df["is_clickbait"] == 0]
idk_pool = df[(df["is_clickbait"] != 0) & (df["is_clickbait"] != 1)]

negative_df = negative_pool.sample(
    n=min(len(positive_df), len(negative_pool)), random_state=rng
)
idk_df = idk_pool.sample(n=min(len(positive_df), len(idk_pool)), random_state=rng)

# `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0;
# `sample(frac=1)` shuffles the rows.
balanced_df = pd.concat([positive_df, negative_df, idk_df]).sample(frac=1, random_state=rng)

X = balanced_df
y = balanced_df["is_clickbait"]

# Show a preview only; dumping the whole frame bloats the notebook.
balanced_df.head(10)

Unnamed: 0,title,is_clickbait
80,Har du Rh-negativt blod? Du kan vara en utomjo...,1.0
1939,QUASE MEIO BILHÃO DE EUROS!,1.0
240,Being Open and Connected on Your Own Terms wit...,0.0
377,Google já fornece internet por balões para 100...,0.5
71,"Hey Karen, do you remember when we were front ...",0.0
25,Uerrr,0.0
58,La langue de Marchand | Yves Boisvert | Yves B...,1.0
43,REPORT: Malcolm Butler Didn't Play Because He ...,0.5
2,See the Earth from 20 miles (32 km) above in t...,0.5
201,"PANICĂ în CAPITALĂ. Un șofer TERIBILIST, oprit...",1.0


In [80]:
from sklearn.metrics import classification_report, f1_score, recall_score, accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn import linear_model

class ModelTransformer(BaseEstimator, TransformerMixin):
    """Wrap a predictive model so it can sit in the middle of a pipeline.

    ``fit`` trains the wrapped model and ``transform`` returns its predictions,
    which lets a later pipeline step post-process them.
    """

    def __init__(self, model):
        # Stored under the constructor argument's name, unchanged.
        self.model = model

    def fit(self, *args, **kwargs):
        """Forward every argument to the wrapped model's ``fit``; return self."""
        wrapped = self.model
        wrapped.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        """Return the wrapped model's predictions for ``X`` as a DataFrame.

        ``transform_params`` is accepted but not used.
        """
        predictions = self.model.predict(X)
        return pd.DataFrame(predictions)

class RoundTransformer(BaseEstimator):
    """Final pipeline step that thresholds continuous predictions to 0.0 / 1.0.

    Parameters
    ----------
    limit : float, default 0.5
        Predictions greater than or equal to ``limit`` become ``1.0``;
        everything else becomes ``0.0``.
    """

    def __init__(self, limit=0.5):
        self.limit = limit

    def fit(self, *args, **kwargs):
        """Nothing to learn; accepts any arguments and returns self."""
        return self

    def predict(self, X):
        """Threshold column ``0`` of ``X`` and return a list of 0.0 / 1.0 values."""
        return [1.0 if x >= self.limit else 0.0 for x in X[0]]

    def score(self, X, y):
        """Accuracy over the rows whose target is not 0.5.

        Rows labelled 0.5 ("I don't know") are ignored: if humans are not
        sure about a title, it is not a problem for the machine to be wrong.

        Returns ``0.0`` when no row has a definite label.
        """
        y_true = pd.Series(y).reset_index(drop=True)
        # Boolean mask applied by position, so X and y stay aligned whatever
        # index labels X happens to carry.
        definite = (y_true != 0.5).values

        if not definite.any():
            return 0.0

        # accuracy_score expects (y_true, y_pred).
        return accuracy_score(y_true[definite], self.predict(X.iloc[definite]))
    
# Title text -> TF-IDF features -> regressor predictions -> thresholded label.
pipeline = Pipeline([
    ('selector', FunctionTransformer(lambda x: x['title'], validate=False)),
    ('tfidf', TfidfVectorizer(strip_accents='ascii', ngram_range=(1, 3), max_df=0.5, min_df=2)),
    ('clf', ModelTransformer(RandomForestRegressor())),
    ('round', RoundTransformer(limit=0.5))
])

# NOTE(review): an explicit `scoring='f1'` means GridSearchCV does not call
# RoundTransformer.score (the method that skips the 0.5 labels) — confirm
# which metric is intended here.
clf = GridSearchCV(pipeline, verbose=1, scoring='f1', param_grid={
    'round__limit': [0.5, 0.7, 0.3],
    'clf': [
            # Baseline that always predicts 1.0; `strategy` is passed by
            # keyword because newer scikit-learn rejects it positionally.
            ModelTransformer(DummyClassifier(strategy="constant", constant=1.0)),
            ModelTransformer(RandomForestRegressor()),
            ModelTransformer(linear_model.LinearRegression()),
            ModelTransformer(linear_model.Ridge()),
            ModelTransformer(linear_model.ElasticNet()),
            # Candidates currently disabled:
#             ModelTransformer(linear_model.LassoLars()),
#             ModelTransformer(linear_model.OrthogonalMatchingPursuit()),
#             ModelTransformer(linear_model.BayesianRidge()),
#             ModelTransformer(linear_model.ARDRegression()),
#             ModelTransformer(linear_model.LogisticRegression()),
#             ModelTransformer(linear_model.SGDRegressor()),
            ModelTransformer(linear_model.PassiveAggressiveRegressor()),
#             ModelTransformer(linear_model.TheilSenRegressor()),
            ModelTransformer(linear_model.HuberRegressor()),
#             ModelTransformer(linear_model.RANSACRegressor()),
            ModelTransformer(linear_model.Lasso())
           ]
})
clf = clf.fit(X, y)

# Flatten the searched parameters into their own columns for a readable table.
results = pd.DataFrame(clf.cv_results_)
param_names = ['clf', 'round__limit']
for param_name in param_names:
    results[param_name] = [params[param_name] for params in results['params']]
results[param_names + ['mean_test_score']]

Fitting 3 folds for each of 24 candidates, totalling 72 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:    9.6s finished


Unnamed: 0,clf,round__limit,mean_test_score
0,ModelTransformer(model=DummyClassifier(constan...,0.5,0.666095
1,ModelTransformer(model=DummyClassifier(constan...,0.7,0.666095
2,ModelTransformer(model=DummyClassifier(constan...,0.3,0.666095
3,ModelTransformer(model=RandomForestRegressor(b...,0.5,0.59109
4,ModelTransformer(model=RandomForestRegressor(b...,0.7,0.515931
5,ModelTransformer(model=RandomForestRegressor(b...,0.3,0.668684
6,ModelTransformer(model=LinearRegression(copy_X...,0.5,0.631445
7,ModelTransformer(model=LinearRegression(copy_X...,0.7,0.589927
8,ModelTransformer(model=LinearRegression(copy_X...,0.3,0.615012
9,"ModelTransformer(model=Ridge(alpha=1.0, copy_X...",0.5,0.665739


# Train on another dataset and score against ours

In [128]:
# Seeded generator so both shuffles are reproducible.
rng = np.random.RandomState(42)

# Training set: the downloaded headline files, labelled 1 (click bait) and
# 0 (not click bait). `pd.concat` replaces `DataFrame.append`, which was
# removed in pandas 2.0.
df2 = pd.concat([
    pd.DataFrame({
        "title": clickbait_titles[0],
        "is_clickbait": [1] * len(clickbait_titles[0])
    }),
    pd.DataFrame({
        "title": non_clickbait_titles[0],
        "is_clickbait": [0] * len(non_clickbait_titles[0])
    }),
], ignore_index=True)
df2 = df2.sample(frac=1, random_state=rng)

X_train = df2[["title"]]
y_train = df2["is_clickbait"]

# Test set: only the rows of our own data with a definite 0 / 1 label.
df3 = df[(df["is_clickbait"] == 1) | (df["is_clickbait"] == 0)]
df3 = df3.sample(frac=1, random_state=rng)

X_test = df3[["title"]]
y_test = df3["is_clickbait"]

pipeline = Pipeline([
    ('selector', FunctionTransformer(lambda x: x['title'], validate=False)),
    ('tfidf', TfidfVectorizer(strip_accents='ascii', ngram_range=(1, 3), max_df=0.5, min_df=5, token_pattern='[A-Za-z0-9]+')),
    ('clf', MultinomialNB()),
])

clf = pipeline.fit(X_train, y_train)

clf.score(X_test, y_test)

0.54166666666666663