In [1]:
import json
import numpy as np
import pandas as pd
import os

%matplotlib inline

In [2]:
filepath = '../../src/data/'

# Paths to the first data set. Can import immediately using pandas, 2016
dataset1_clickbait = '../../src/data/dataset1/clickbait_data'
dataset1_nonclickbait = '../../src/data/dataset1/non_clickbait_data'

# Paths to the second dataset, 2016
dataset2_json_files = '../../src/data/dataset2/webis-clickbait-16/problems/'
dataset2_label_csv = '../../src/data/dataset2/webis-clickbait-16/truth/majority.csv'

# Paths to the third dataset, 2017

dataset3_json = '../../src/data/dataset3/clickbait17-validation-170630/instances.jsonl'
dataset3_json_labels = '../../src/data/dataset3/clickbait17-validation-170630/truth.jsonl'

### Loading dataset 1

This data is also news headlines but this was not labelled using mechanical turks. I imported the lines form the text file, stripped the new line characters and then made sure there were no escape characters "\" in the line. I then saved the data as a list of tuples that with the target tag. once I had the the tuple list it was light work getting a data frame.

In [3]:
# Load the clickbait data and make a dataframe
df1_clickbait = []
with open(dataset1_clickbait) as f:
    for line in f:
        df1_clickbait.append(line)

# Odd numbers are newlines, I'll quickly remove those from the data set
df1_clickbait = [(line.rstrip('\n').replace('\'', ''), 1) for line in df1_clickbait if line != '\n' ]
index = np.arange(0,len(df1_clickbait))
df1_click = pd.DataFrame(df1_clickbait, index=index, columns=['text', 'target'])

In [5]:
df1_click.head()

Unnamed: 0,text,target
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1


In [7]:
# Load the non_clickbait data and make dataframe
df1_nonclickbait = []
with open(dataset1_nonclickbait) as f:
    for line in f:
        df1_nonclickbait.append(line)
# filter out newline characters escape characters and append the target tag
df1_nonclickbait = [(line.rstrip("\n").replace("\"", ""), 0) for line in df1_nonclickbait if line != '\n']

index = np.arange(0,len(df1_nonclickbait))
df1_non = pd.DataFrame(df1_nonclickbait, index=index, columns=['text', 'target'])
df1_nonclickbait

[('Bill Changing Credit Card Rules Is Sent to Obama With Gun Measure Included',
  0),
 ('In Hollywood, the Easy-Money Generation Toughens Up', 0),
 ("1700 runners still unaccounted for in UK's Lake District following flood",
  0),
 ('Yankees Pitchers Trade Fielding Drills for Putting Practice', 0),
 ('Large earthquake rattles Indonesia; Seventh in two days', 0),
 ("Coldplay's new album hits stores worldwide this week", 0),
 ('U.N. Leader Presses Sri Lanka on Speeding Relief to War Refugees in Camps',
  0),
 ('2 Somali-Americans Charged With Aiding Terror', 0),
 ("US Highway Administration releases interim report on Boston's Big Dig: press release claims tunnel safe, but report does not",
  0),
 ('White House Announces International Meetings to Address Energy and Climate Issues',
  0),
 ('With Troubled Coyotes, Gretzky Called On as Savior Again', 0),
 ('Door opens mid-Qantas flight; plane makes an emergency landing', 0),
 ('Gas prices surge in Northeast US', 0),
 ('Schapelle Corby found

In [5]:
# Concatenate the dataframes

df1 = pd.concat([df1_non, df1_click])
df1.to_csv(filepath + "dataframe1.csv", index=False)

### Loading dataset 2

Data set 2 comes from the wibst database from 2016. Manually labelled tweets containing the 

Data is stored in a series of files, each files name is the id of the tweets. There are only 3000 tweets. Each tweet needs to be cleaned and labelled.

In [6]:
dataset2_dir = os.listdir(dataset2_json_files)

# load the data
d2_json_ = []
for dir_ in dataset2_dir:
    PATH = dataset2_json_files + dir_ + '/' + dir_ + ".json"
    with open(PATH) as fp:
        d2_json_.append(json.load(fp))

d2_labels_df = pd.read_csv(dataset2_label_csv, names=['id', 'target'])
d2_labels_df.head(3)

Unnamed: 0,id,target
0,607668877594497024,clickbait
1,607671137062010881,no-clickbait
2,607672151638876160,no-clickbait


In [7]:
d2_labels_df.groupby(['target']).target.count()

target
clickbait        767
no-clickbait    2225
Name: target, dtype: int64

In [8]:
d2_json_[0]

{'extended_entities': {'media': [{'display_url': 'pic.twitter.com/esXFYDiAa3',
    'source_user_id': 1613648400,
    'type': 'photo',
    'media_url': 'http://pbs.twimg.com/media/CHUEawEUEAAjCsE.jpg',
    'source_status_id': 609398183278563328,
    'url': 'http://t.co/esXFYDiAa3',
    'indices': [105, 127],
    'sizes': {'small': {'w': 340, 'h': 251, 'resize': 'fit'},
     'large': {'w': 625, 'h': 463, 'resize': 'fit'},
     'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
     'medium': {'w': 600, 'h': 444, 'resize': 'fit'}},
    'id_str': '609398182204674048',
    'expanded_url': 'http://twitter.com/BuzzFeedBooks/status/609398183278563328/photo/1',
    'source_status_id_str': '609398183278563328',
    'media_url_https': 'https://pbs.twimg.com/media/CHUEawEUEAAjCsE.jpg',
    'id': 609398182204674048,
    'source_user_id_str': '1613648400'}]},
 'in_reply_to_status_id_str': None,
 'in_reply_to_status_id': None,
 'created_at': 'Fri Jun 12 16:34:11 +0000 2015',
 'in_reply_to_user_id_str':

In [9]:
# Parse out the relevant information from the tweet.
tweets = []
for tweet in d2_json_:
    id_ = tweet['id']
    text = tweet['text']
    name = tweet['user']['name']
    
    tweets.append((id_, text, name, 2))

In [10]:
# Create a tweet dataframe prior to merging
index = np.arange(0, len(tweets))
datset2 = pd.DataFrame(tweets, index=index, columns=['id', 'post_text', 'name', 'dataset'])

In [11]:
# Merge the labels on the data frame
df2 = datset2.merge(d2_labels_df, on = 'id')
df2.head()

Unnamed: 0,id,post_text,name,dataset,target
0,609398299926339584,RT @BuzzFeedBooks: John Green responded on Tum...,BuzzFeed,2,no-clickbait
1,609858047670923265,ISIS intercepts thousands of Syrian refugees t...,Daily Mail Online,2,no-clickbait
2,607934124813205505,RT @foxnewslatino: Carlos Santana (and his wif...,Fox News,2,no-clickbait
3,609361820944605185,Female scientists fire back at Nobel laureate’...,Yahoo,2,no-clickbait
4,610056503626698752,RT @davidshukmanbbc: Amazing news from deep sp...,BBC News (UK),2,no-clickbait


In [12]:
# relabel the 
true = df2.target.unique()[1]
print(true)
def is_clickbait(string):
    if string == true:
        return 1
    else:
        return 0
    
df2.target = df2.target.apply(is_clickbait)
df2.to_csv(filepath+'dataframe2.csv', index=False)

clickbait


### Loading dataset 3

Data set 3 comes from the wibst database from 2017. Manually labelled tweets and their news stories.

In [13]:
# Load dataset3
dataset3 = []
with open(dataset3_json) as f:
    for line in f:
        dataset3.append(json.loads(line))
        
dataset3_labels = []
with open(dataset3_json_labels) as f:
    for line in f:
        dataset3_labels.append(json.loads(line))

In [14]:
# id, test, title, title, timestamp.
dataset3[0]['id']
dataset3[0]['postText'],
dataset3[0]['targetTitle'],
dataset3[0]['postTimestamp']

'Sat Apr 29 23:25:41 +0000 2017'

In [15]:
dataset3_labels[8]

{'truthJudgments': [0.6666666666, 0.6666666666, 1.0, 0.6666666666, 0.0],
 'truthMean': 0.59999999996,
 'id': '858444379232624641',
 'truthClass': 'clickbait',
 'truthMedian': 0.6666666666,
 'truthMode': 0.6666666666}

In [16]:
# parse out the data and create a list of tuples
df3_tups = []

for json_ in dataset3:
    id_= json_['id']
    post_text = json_['postText'][0]
    title  = json_['targetTitle']
    timestamp = json_['postTimestamp']
    df3_tups.append((id_, post_text, title, timestamp , 3))

In [17]:
# Convert tuples to a dataframe
index = np.arange(0, len(df3_tups))
df3_X = pd.DataFrame(df3_tups, index = index, columns=['id', 'post_text', 'title', 'timestamp', 'dataset'])

In [18]:
# Parse out all of the label data
df3_y = []
for label in dataset3_labels:
    id_ = label['id']
    target = label['truthClass']
    df3_y.append((id_, target))
df3_y[0]

('858464162594172928', 'clickbait')

In [19]:
index = np.arange(0, len(df3_y))
df3_y = pd.DataFrame(df3_y, index = index, columns = ['id', 'target'])

In [20]:
df3 = df3_X.merge(df3_y, on='id')
df3.head()

Unnamed: 0,id,post_text,title,timestamp,dataset,target
0,858462320779026433,UK’s response to modern slavery leaving victim...,‘Inexcusable’ failures in UK’s response to mod...,Sat Apr 29 23:25:41 +0000 2017,3,no-clickbait
1,858421020331560960,this is good,Donald Trump Appoints Pro-Life Advocate as Ass...,Sat Apr 29 20:41:34 +0000 2017,3,clickbait
2,858368123753435136,"The ""forgotten"" Trump roast: Relive his brutal...",The ‘forgotten’ Trump roast: Relive his brutal...,Sat Apr 29 17:11:23 +0000 2017,3,no-clickbait
3,858323428260139008,Meet the happiest #dog in the world!,"Meet The Happiest Dog In The World, Maru The H...",Sat Apr 29 14:13:46 +0000 2017,3,clickbait
4,858283602626347008,Tokyo's subway is shut down amid fears over an...,Tokyo's subway is shut down amid fears over an...,Sat Apr 29 11:35:31 +0000 2017,3,no-clickbait


In [21]:
# Classify target into clickbait or not
true = df3.target.unique()[1]
df3.target = df3.target.apply(is_clickbait)

# Save the dataframe as a csv
df3.to_csv(filepath + "dataframe3.csv", index=False)

# Sub dataframe creation is done

My next step is to start exploring the data in the next notebook which will be called `exploration`. I want to know if the data set's are similar enough to join together. 

In [22]:
true

'clickbait'