In [1]:
import nltk 
from nltk.corpus import stopwords
import json
import arff # https://pypi.python.org/pypi/liac-arff
from nltk.sentiment.vader import SentimentIntensityAnalyzer



In [2]:
#object that holds a lot of our data

class NewsObject:
    id = '0'
    targetTitle = []
    targetDescription = ""
    targetKeywords = []
    targetParagraphs = []
    targetCaptions = []
    postText = []
    postMedia = []
    postTimestamp = ''
    #truthMedian = ''
    #truthMean = ''
    #truthMode = ''
    truthClass = ""
    #truthJudgments = []
    attributes = ()
    
    def __init__(self, line):
        
        self.id = line['id']
        self.targetTitle= line['targetTitle']
        self.targetKeywords = line['targetKeywords']
        self.targetParagraphs = line['targetParagraphs']
        self.targetCaptions = line['targetCaptions']
        self.postText = line['postText']
        self.postMedia = line['postMedia']
        self.postTimestamp = line['postTimestamp']
        
    def addTruth(self, line):
        #self.truthMedian = line['truthMedian']
        #self.truthMean = line['truthMean']
        #self.truthMode = line['truthMode']
        if line['truthClass'] == 'clickbait':
            self.truthClass = '1'
        else:
            self.truthClass = '0'
        #self.truthJudgments = line['truthJudgments']
        
def extractFeatures(newsObject):
    f1 = 0
    f2 = False
    f3 = False
    f4 = False
    f5 = False
    f6 = 0
    f7 = 0
    f8 = 0
    f9 = 0
    f10 = 0
    f11 = 0
    f12 = 0
    f100 = newsObject.truthClass

    words = newsObject.postText[0].split(' ')
    f1 = len(words)

    informal = ["you", "you're", "we", "our", "your", "my"]
    questionWords = ['who', 'what', 'where', 'how', 'why']
    stopWords = set(stopwords.words('english'))

    if words[0].isdigit():
        f4 = True
    if words[0].lower() in questionWords:
        f3 = True
    for word in words:
        if word.lower() in informal:
            f2 = True
        if word.lower() == 'this':
            f5 = True

    text = nltk.word_tokenize(newsObject.postText[0])
    tokenizedList = nltk.pos_tag(text)

    numProper = 0
    numStop = 0
    for partOfSpeech in tokenizedList:
        if partOfSpeech[1] == 'NNP':
            numProper += 1
        elif(partOfSpeech[0] in stopWords):
            numStop += 1

    f7 = round(numStop/f1, 2)
    f8 = round(numProper/f1, 2)

    sentence = newsObject.postText[0]
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(sentence)
    f6 = ss['pos']
    f9 = ss['neu']
    f10 = ss['neg']
    f11 = ss['compound']
    
    for item in newsObject.targetParagraphs:
        f12 += len(item.split())

    feat = (f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f100)
    newsObject.attributes = feat
    return newsObject
            

In [3]:
#import files
instances = []

with open('dataset/instances_train.jsonl') as file:
    for line in file:
        temp = NewsObject(json.loads(line))
        instances.append(temp)
        
with open('dataset/truth_train.jsonl') as file2:
    i = 0
    for line in file2:
        instances[i].addTruth(json.loads(line))
        i += 1
#print(instances[0].targetKeywords)

In [4]:
for item in instances:
    item = extractFeatures(item)

text = nltk.word_tokenize(instances[0].postText[0])
tokenizedList = nltk.pos_tag(text)
sentence = instances[0].postText[0]
sid = SentimentIntensityAnalyzer()
ss = sid.polarity_scores(sentence)
print(sentence)
print(ss)

Apple's iOS 9 'App thinning' feature will give your phone's storage a boost
{'neg': 0.0, 'neu': 0.787, 'pos': 0.213, 'compound': 0.4019}


In [5]:
text = nltk.word_tokenize(instances[0].postText[0])
tokenizedList = nltk.pos_tag(text)
print(tokenizedList)
sentence = instances[0].postText[0]
sid = SentimentIntensityAnalyzer()
ss = sid.polarity_scores(sentence)
print(sentence)
print(ss)

[('Apple', 'NNP'), ("'s", 'POS'), ('iOS', 'NN'), ('9', 'CD'), ("'App", 'CD'), ('thinning', 'VBG'), ("'", "''"), ('feature', 'NN'), ('will', 'MD'), ('give', 'VB'), ('your', 'PRP$'), ('phone', 'NN'), ("'s", 'POS'), ('storage', 'NN'), ('a', 'DT'), ('boost', 'NN')]
Apple's iOS 9 'App thinning' feature will give your phone's storage a boost
{'neg': 0.0, 'neu': 0.787, 'pos': 0.213, 'compound': 0.4019}


In [6]:
### dump to arff
features = [("word count", 'NUMERIC'),
            ("contains informal pronouns", ['True', 'False']),
            ("Begins w/ question word", ['True', 'False']),
            ("Begins w/ number", ['True', 'False']),
            ("Contains 'this'", ['True', 'False']),
            ("percent stop words", 'NUMERIC'),
            ("Percent proper nouns", 'NUMERIC'),
            ("Pos sent", 'NUMERIC'),
            ("Neu sent", 'NUMERIC'),
            ("Neg sent", 'NUMERIC'),
            ("Compound sent", 'NUMERIC'),
            ("Article Length", 'NUMERIC'),
            ("label", ['0', '1'])]
data = {}
data.setdefault('attributes', features)
data.setdefault('description', '')
data.setdefault('relation', 'clickbait_sample')
data.setdefault('data', [])
for item in instances:
    if(item.truthClass == '1'):
        print(item.postText[0])
        print(item.attributes)
    data['data'].append(item.attributes)

with open('sample_train.arff', 'w') as f:
    f.write(arff.dumps(data))

U.S. Soccer should start answering tough questions about Hope Solo, @eric_adelson writes.
(12, False, False, False, False, 0.201, 0.17, 0.42, 0.694, 0.104, 0.34, 695, '1')
(16, True, False, False, False, 0.0, 0.31, 0.0, 0.707, 0.293, -0.7003, 975, '1')
13 classic ’00s songs that were actually meant for other artists
(11, False, False, True, False, 0.0, 0.36, 0.0, 1.0, 0.0, 0.0, 651, '1')
Tourists detained in Malaysia for getting naked on sacred mountain:
(10, False, False, False, False, 0.0, 0.3, 0.1, 0.769, 0.231, -0.4019, 543, '1')
The brutal dictatorship the world keeps ignoring
(7, False, False, False, False, 0.0, 0.14, 0.0, 0.424, 0.576, -0.7783, 1197, '1')
RT @BBCWalesNews: Caerphilly farmer may get payout worth hundreds of thousands after 24 year wait
(15, False, False, False, False, 0.119, 0.13, 0.2, 0.881, 0.0, 0.2263, 272, '1')
Man dies when car plunges from parking garage
(8, False, False, False, False, 0.0, 0.25, 0.0, 1.0, 0.0, 0.0, 89, '1')
5 inconsistencies in 'Jurassic W

(19, False, False, False, True, 0.086, 0.53, 0.0, 0.914, 0.0, 0.1513, 585, '1')
Remy Ma on "shETHER": "I'm just not particularly proud of it."
(11, False, False, False, False, 0.0, 0.45, 0.18, 0.783, 0.217, -0.4158, 380, '1')
Alex Rodriguez officially joins Fox Sports as a full-time MLB analyst
(11, False, False, False, False, 0.0, 0.18, 0.45, 1.0, 0.0, 0.0, 342, '1')
Patrick Leonard, condemned Ohio man, kills himself on death row:
(10, False, False, False, False, 0.0, 0.2, 0.3, 0.405, 0.595, -0.8834, 133, '1')
Amber Tamblyn sums up the messiness of breastfeeding in one photo
(11, False, False, False, False, 0.0, 0.36, 0.18, 1.0, 0.0, 0.0, 197, '1')
Shohei Ohtani could have been a $200-million man in MLB...but then the rules changed
(14, False, False, False, False, 0.0, 0.5, 0.21, 1.0, 0.0, 0.0, 3966, '1')
Uh oh.
(2, False, False, False, False, 0.0, 0.0, 0.5, 1.0, 0.0, 0.0, 375, '1')
Breaking News: President Trump issued a more limited executive order on immigration
(12, False, False, 

(7, False, False, False, False, 0.206, 0.29, 0.0, 0.794, 0.0, 0.0772, 334, '1')
Here's your Super Bowl cheat sheet
(6, True, False, False, False, 0.358, 0.17, 0.33, 0.367, 0.275, 0.2263, 846, '1')
Chris Christie defends Trump, who he says believes "America is morally superior to Russia"
(14, False, False, False, False, 0.212, 0.29, 0.29, 0.788, 0.0, 0.5423, 409, '1')
The NFL's MVP wasn't the only award Falcons QB Matt Ryan walked away with last night.
(16, False, False, False, False, 0.0, 0.25, 0.38, 0.84, 0.16, -0.431, 723, '1')
These are the most polluted cities in the world
(9, False, False, False, False, 0.0, 0.56, 0.0, 0.708, 0.292, -0.5095, 117, '1')
This app connects refugees to volunteer translators via Facebook Messenger
(10, False, False, False, True, 0.0, 0.1, 0.2, 1.0, 0.0, 0.0, 945, '1')
Inside the hidden world of Indonesia's transgender women
(8, False, False, False, False, 0.0, 0.25, 0.12, 1.0, 0.0, 0.0, 528, '1')
&gt;Iran trying to use Twitter the same way Trump uses Tw

(10, False, False, False, False, 0.0, 0.1, 0.3, 0.657, 0.343, -0.6908, 316, '1')
Police in #Quebec now investigating a "lone wolf" after shooting at mosque
(12, False, False, False, False, 0.0, 0.42, 0.17, 0.826, 0.174, -0.2732, 971, '1')
Whoopsie!
(1, False, False, False, False, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 253, '1')
Second body found in search for missing Missouri woman
(9, False, False, False, False, 0.0, 0.22, 0.11, 0.784, 0.216, -0.296, 248, '1')
West Ham agree £25m fee with Marseille for Dimitri Payet  via @guardian_sport
(13, False, False, False, False, 0.185, 0.15, 0.54, 0.815, 0.0, 0.3612, 597, '1')
"This could be the match people use to decide who really is the greatest of all time."

Roger vs. Rafa, 3 a.m. ET:
(22, False, False, False, False, 0.175, 0.36, 0.09, 0.825, 0.0, 0.6666, 1059, '1')
#AbhinavBindra joined the re-constituted #TargetOlympicPodium (TOP) committee, as its chairman
(10, False, False, False, False, 0.0, 0.3, 0.3, 1.0, 0.0, 0.0, 171, '1')
The "pros" don't 

(14, False, False, False, True, 0.202, 0.36, 0.14, 0.798, 0.0, 0.5093, 157, '1')
This presidential election will be decided on a coin toss
(10, False, False, False, True, 0.0, 0.4, 0.0, 1.0, 0.0, 0.0, 176, '1')
The 100-day watch: Why every White House meeting seems to leak  via @HowardKurtz @MediaBuzzFNC
(15, False, False, False, False, 0.0, 0.07, 0.4, 0.844, 0.156, -0.34, 601, '1')
The man forcibly removed from a United flight went to the hospital for his injuries
(15, False, False, False, False, 0.177, 0.4, 0.07, 0.823, 0.0, 0.4215, 307, '1')
This doll-like dog's eyes will melt your heart
(8, True, False, False, True, 0.0, 0.25, 0.0, 1.0, 0.0, 0.0, 40, '1')
When #natashaexelby realises that she is on live #camera, she gets a #shock of her #life! 😂😂😂
(17, False, False, False, False, 0.0, 0.47, 0.0, 1.0, 0.0, 0.0, 142, '1')
Bernie Sanders has a podcast now, so you can "Feel the Bern" all year long.
(15, True, False, False, False, 0.0, 0.53, 0.2, 1.0, 0.0, 0.0, 269, '1')
Garden strimmer

7 mental tricks that US Olympic athletes use to perform under pressure
(12, False, False, True, False, 0.0, 0.25, 0.17, 0.709, 0.291, -0.4019, 140, '1')
In 1978, legendary 60 Minutes commentator Andy Rooney sounded off on Christmas cards:
(13, False, False, False, False, 0.0, 0.15, 0.23, 1.0, 0.0, 0.0, 494, '1')
Books for the Trump era  via @nytopinion
(8, False, False, False, False, 0.0, 0.25, 0.25, 1.0, 0.0, 0.0, 1099, '1')
Why are heart attacks striking healthy, young women?
(8, False, True, False, False, 0.233, 0.12, 0.0, 0.517, 0.25, -0.0516, 503, '1')
Stunning mural reading "My womanhood is not up for debate" appears in LA
(13, False, False, False, False, 0.178, 0.38, 0.08, 0.822, 0.0, 0.3818, 682, '1')
Graphic video shows the assassination of Russia's ambassador to Turkey by a man who shouted, "Don't forget Syria!"
(18, False, False, False, False, 0.08, 0.33, 0.22, 0.719, 0.201, -0.5463, 33, '1')
Smart people prefer to be alone!
(6, False, False, False, False, 0.333, 0.33, 0.0, 

Burger King is giving out 'adult toys' for Valentine's Day
(10, False, False, False, False, 0.211, 0.3, 0.4, 0.789, 0.0, 0.34, 354, '1')
Start your day with some #Entertainment news!
(7, True, False, False, False, 0.0, 0.43, 0.0, 1.0, 0.0, 0.0, 548, '1')
Two young girls critically wounded in separate weekend shootings on Chicago’s South Side:
(13, False, False, False, False, 0.0, 0.23, 0.31, 1.0, 0.0, 0.0, 200, '1')
Joy Villa's Trump dress at the #Grammys pays off with a boost in album sales
(15, False, False, False, False, 0.351, 0.4, 0.27, 0.649, 0.0, 0.7579, 159, '1')
Boom.
(1, False, False, False, False, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 178, '1')
Eight kinds of love to celebrate on #ValentinesDay
(8, False, False, False, False, 0.568, 0.38, 0.12, 0.432, 0.0, 0.836, 777, '1')
How Mongolia, once the world's fastest-growing economy, went bust  via @BV
(12, False, True, False, False, 0.0, 0.17, 0.17, 1.0, 0.0, 0.0, 524, '1')
'The Lego Batman Movie' tops the first strong weekend box office

Here's what everyone was watching at the movies this weekend
(10, False, False, False, True, 0.0, 0.5, 0.0, 1.0, 0.0, 0.0, 426, '1')
"F--- do I look like? For real. When would I ever say that?"

Russ Westbrook denies shouting "Thank you, Kyrie!"
(19, False, False, False, False, 0.241, 0.16, 0.21, 0.64, 0.119, 0.4314, 334, '1')
A&amp;E abruptly cancels KKK docu-series before it airs
(8, False, False, False, False, 0.0, 0.25, 0.25, 0.787, 0.213, -0.2263, 634, '1')
🎄🎄🎄 MERRY CHRISTMAS!!! 👶🐑🌟
(4, False, False, False, False, 0.63, 0.0, 0.5, 0.37, 0.0, 0.7277, 1260, '1')
Every year they try this line.
(6, False, False, False, True, 0.0, 0.33, 0.0, 1.0, 0.0, 0.0, 720, '1')
Not like he'll have much else to do, career-wise.
(9, False, False, False, False, 0.238, 0.44, 0.0, 0.762, 0.0, 0.3612, 396, '1')
Watch this guy dynamite a Christmas tree just in time for the holidays
(13, False, False, False, True, 0.301, 0.46, 0.08, 0.699, 0.0, 0.5106, 56, '1')
After Jo Cox: the unsung MPs quietly making 