In [159]:
import json
import numpy as np
import pandas as pd

import os
import matplotlib.pyplot as plt

%matplotlib inline

In [160]:
# Root of the raw data tree; every dataset path below is built from it and the
# combined CSVs are written back into it.
filepath = '../../src/data/'

# Dataset 1 (2016): two text files, one with clickbait and one with
# non-clickbait entries.
dataset1_clickbait = filepath + 'dataset1/clickbait_data'
dataset1_nonclickbait = filepath + 'dataset1/non_clickbait_data'

# Dataset 2 (2016): folder holding the per-tweet JSON files, plus the CSV with
# their labels.
dataset2_json_files = filepath + 'dataset2/webis-clickbait-16/problems/'
dataset2_label_csv = filepath + 'dataset2/webis-clickbait-16/truth/majority.csv'

# Dataset 3 (2017): JSON-lines files with the instances and their truth labels.
dataset3_json = filepath + 'dataset3/clickbait17-validation-170630/instances.jsonl'
dataset3_json_labels = filepath + 'dataset3/clickbait17-validation-170630/truth.jsonl'

### Loading dataset 1

This dataset also consists of news headlines, but it was not labelled using Mechanical Turk workers.

In [161]:
# Read the raw clickbait file line by line.
with open(dataset1_clickbait) as f:
    df1_clickbait = list(f)

# Every other line of the file is a bare newline: skip those, then strip the
# trailing newline and remove every apostrophe from the remaining lines.
# Each kept line is paired with the positive label 1.
df1_clickbait = [
    (headline.rstrip('\n').replace('\'', ''), 1)
    for headline in df1_clickbait
    if headline != '\n'
]

# Explicit 0..n-1 integer index for the clickbait frame.
index = np.arange(len(df1_clickbait))
df1_click = pd.DataFrame(df1_clickbait, index=index, columns=['text', 'target'])

In [162]:
# Read the raw non-clickbait file line by line.
with open(dataset1_nonclickbait) as f:
    df1_nonclickbait = list(f)

# Skip the bare-newline lines, strip the trailing newline and remove every
# double-quote character; each kept line is paired with the negative label 0.
df1_nonclickbait = [
    (headline.rstrip("\n").replace("\"", ""), 0)
    for headline in df1_nonclickbait
    if headline != '\n'
]

# Explicit 0..n-1 integer index for the non-clickbait frame.
index = np.arange(len(df1_nonclickbait))
df1_non = pd.DataFrame(df1_nonclickbait, index=index, columns=['text', 'target'])

In [163]:
# Stack the non-clickbait and clickbait frames into a single dataset.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# both halves keep their own 0-based labels and df1 carries duplicated index
# values, which makes label-based lookups such as df1.loc[0] ambiguous.
df1 = pd.concat([df1_non, df1_click], ignore_index=True)

# Persist without the index column: the CSV only holds 'text' and 'target'.
df1.to_csv(filepath + "dataframe1.csv", index=False)

### Loading dataset 2

Data set 2 comes from the Webis clickbait corpus from 2016 (Webis-Clickbait-16). It consists of manually labelled tweets.

The data is stored in a series of files; each file's name is the id of its tweet. There are only about 3000 tweets. Each tweet needs to be cleaned and labelled.

In [44]:
# List the entries of the problems/ folder; each entry name is used below both
# as a sub-folder and as the stem of the JSON file inside it.
dataset2_dir = os.listdir(dataset2_json_files)
# Report how many entries were found (the original printed an undefined name,
# `dataset2_id_`, which raises NameError on a fresh kernel).
print(len(dataset2_dir))

# Load the data: <problems>/<entry>/<entry>.json, one parsed document per entry.
d2_json_ = []
for dir_ in dataset2_dir:
    PATH = os.path.join(dataset2_json_files, dir_, dir_ + ".json")
    with open(PATH) as fp:
        d2_json_.append(json.load(fp))

# Column names are supplied explicitly, so the first CSV row is read as data.
d2_labels_df = pd.read_csv(dataset2_label_csv, names=['id', 'target'])
d2_labels_df.head(3)

2992


Unnamed: 0,id,target
0,607668877594497024,clickbait
1,607671137062010881,no-clickbait
2,607672151638876160,no-clickbait


In [94]:
# Keep only the id, the text and the author's name of every tweet. The trailing
# 2 fills the 'dataset' column and records that the row comes from dataset 2.
tweets = [
    (record['id'], record['text'], record['user']['name'], 2)
    for record in d2_json_
]

In [95]:
# Turn the tweet tuples into a dataframe so the labels can be merged onto it.
index = np.arange(len(tweets))
datset2 = pd.DataFrame(
    tweets,
    index=index,
    columns=['id', 'post_text', 'name', 'dataset'],
)

In [96]:
# Attach each tweet's label by joining the tweet frame and the label frame on
# their shared 'id' column (default inner join).
df2 = pd.merge(datset2, d2_labels_df, on='id')

In [107]:
# Relabel the string targets as integers. The positive class is pinned to its
# literal value: picking it positionally with df2.target.unique()[1] depends on
# which label happens to appear second in the merged frame, and that order
# follows the arbitrary os.listdir() order, so the 0/1 encoding could flip.
true = 'clickbait'

def is_clickbait(string):
    """Map a truth label to a binary target.

    Parameters
    ----------
    string : str
        Label as stored in the truth data, e.g. 'clickbait' or 'no-clickbait'.

    Returns
    -------
    int
        1 when the label is exactly 'clickbait', 0 for anything else.
    """
    # Compare against the literal label rather than a module-level variable,
    # so the result does not depend on which cell last rebound that variable.
    return 1 if string == 'clickbait' else 0
    
# Swap the string labels for their 0/1 encoding, then write dataset 2 to disk
# without the index column.
df2['target'] = df2['target'].map(is_clickbait)
df2.to_csv(filepath + 'dataframe2.csv', index=False)

### Loading dataset 3

Data set 3 comes from the Webis clickbait corpus from 2017. It consists of manually labelled tweets and their news stories.

In [16]:
# Load dataset 3. Both files are read line by line, and each line is parsed as
# its own JSON document.
with open(dataset3_json) as f:
    dataset3 = [json.loads(row) for row in f]

# Same format for the truth file that carries the labels.
with open(dataset3_json_labels) as f:
    dataset3_labels = [json.loads(row) for row in f]

In [27]:
# Fields of the first record that the next cell extracts: id, post text,
# target title and post timestamp. Each line below is a separate expression
# statement, so only the last one (the timestamp) is rendered as the cell
# output; the trailing commas merely wrap the two middle values in throwaway
# 1-tuples.
dataset3[0]['id']
dataset3[0]['postText'],
dataset3[0]['targetTitle'],
dataset3[0]['postTimestamp']

'Sat Apr 29 23:25:41 +0000 2017'

In [84]:
# Flatten every instance into (id, post text, target title, timestamp, 3).
# The trailing 3 fills the 'dataset' column and marks the row as dataset 3.
df3_tups = [
    (
        instance['id'],
        instance['postText'][0],  # postText is indexed; only its first element is kept
        instance['targetTitle'],
        instance['postTimestamp'],
        3,
    )
    for instance in dataset3
]

In [85]:
# Build the feature frame for dataset 3 from the parsed tuples.
index = np.arange(len(df3_tups))
df3_X = pd.DataFrame(
    df3_tups,
    index=index,
    columns=['id', 'post_text', 'title', 'timestamp', 'dataset'],
)

In [86]:
# Collect an (id, label) pair from every truth record, then show the first
# pair as a quick sanity check.
df3_y = [(truth['id'], truth['truthClass']) for truth in dataset3_labels]
df3_y[0]

('858464162594172928', 'clickbait')

In [89]:
# Convert the (id, label) pairs into a dataframe. Note that the name df3_y is
# rebound here from the list of pairs to the resulting DataFrame.
index = np.arange(len(df3_y))
df3_y = pd.DataFrame(
    df3_y,
    index=index,
    columns=['id', 'target'],
)

In [91]:
# Join the features and the labels of dataset 3 on their shared 'id' column
# (default inner join).
df3 = pd.merge(df3_X, df3_y, on='id')

In [112]:
# Classify target into clickbait (1) or not (0).
# The positive class is pinned to its literal value. Selecting it positionally
# with df3.target.unique()[1] depends on the order in which labels first appear
# in the merged frame; when the first row is labelled 'clickbait', index 1 is
# the negative class and the whole encoding is inverted.
true = 'clickbait'
df3.target = df3.target.apply(is_clickbait)

# Save the dataframe as a csv, without the index column.
df3.to_csv(filepath + "dataframe3.csv", index=False)

# Sub dataframe creation is done

My next step is to explore the data in the next notebook, which will be called `exploration`. I want to know whether the data sets are similar enough to join together.