In [1]:
import nltk 
from nltk.corpus import stopwords
import json
import arff # https://pypi.python.org/pypi/liac-arff
from nltk.sentiment.vader import SentimentIntensityAnalyzer



In [2]:
#object that holds a lot of our data

class NewsObject:
    id = '0'
    targetTitle = []
    targetDescription = ""
    targetKeywords = []
    targetParagraphs = []
    targetCaptions = []
    postText = []
    postMedia = []
    postTimestamp = ''
    #truthMedian = ''
    #truthMean = ''
    #truthMode = ''
    truthClass = ""
    #truthJudgments = []
    attributes = ()
    
    def __init__(self, line):
        
        self.id = line['id']
        self.targetTitle= line['targetTitle']
        self.targetKeywords = line['targetKeywords']
        self.targetParagraphs = line['targetParagraphs']
        self.targetCaptions = line['targetCaptions']
        self.postText = line['postText']
        self.postMedia = line['postMedia']
        self.postTimestamp = line['postTimestamp']
        
    def addTruth(self, line):
        #self.truthMedian = line['truthMedian']
        #self.truthMean = line['truthMean']
        #self.truthMode = line['truthMode']
        if line['truthClass'] == 'clickbait':
            self.truthClass = '1'
        else:
            self.truthClass = '0'
        #self.truthJudgments = line['truthJudgments']
        
    def extractFeatures(self):
        f1 = 0
        f2 = False
        f3 = False
        f4 = 0
        f5 = 0
        f6 = 0
        f7 = False
        f8 = False
        f9 = 0
        f10 = self.truthClass

        words = self.postText[0].split(' ')
        f1 = len(words)
        
        informal = ["you", "you're", "we", "our", "your", "my"]
        questionWords = ['who', 'what', 'where', 'how', 'why']
        stopWords = set(stopwords.words('english'))
        
        if words[0].isdigit():
            f4 = True
        if words[0].lower() in questionWords:
            f3 = True
        for word in words:
            if word.lower() in informal:
                f2 = True
            if word.lower() == 'this':
                f5 = True
        
        text = nltk.word_tokenize(item.postText[0])
        tokenizedList = nltk.pos_tag(text)
        
        numProper = 0
        numStop = 0
        for partOfSpeech in tokenizedList:
            if partOfSpeech[1] == 'NNP':
                numProper += 1
            elif(partOfSpeech[0] in stopWords):
                numStop += 1
        
        f7 = round(numStop/f1, 2)
        f8 = round(numProper/f1, 2)
        
        sentence = self.postText[0]
        sid = SentimentIntensityAnalyzer()
        ss = sid.polarity_scores(sentence)
        f6 = ss['neg']   
        
        feat = (f1, f2, f3, f4, f5, f6, f7, f8, f10)
        self.attributes = feat
        
            

In [3]:
#import files
instances = []

with open('dataset/instances_train.jsonl') as file:
    for line in file:
        temp = NewsObject(json.loads(line))
        instances.append(temp)
        
with open('dataset/truth_train.jsonl') as file2:
    i = 0
    for line in file2:
        instances[i].addTruth(json.loads(line))
        i += 1
#print(instances[0].targetKeywords)

In [4]:
for item in instances:
    item.extractFeatures()

text = nltk.word_tokenize(instances[0].postText[0])
tokenizedList = nltk.pos_tag(text)
sentence = instances[0].postText[0]
sid = SentimentIntensityAnalyzer()
ss = sid.polarity_scores(sentence)
print(sentence)
print(ss)

Apple's iOS 9 'App thinning' feature will give your phone's storage a boost
{'neg': 0.0, 'neu': 0.787, 'pos': 0.213, 'compound': 0.4019}


In [5]:
text = nltk.word_tokenize(instances[0].postText[0])
tokenizedList = nltk.pos_tag(text)
print(tokenizedList)
sentence = instances[0].postText[0]
sid = SentimentIntensityAnalyzer()
ss = sid.polarity_scores(sentence)
print(sentence)
print(ss)

[('Apple', 'NNP'), ("'s", 'POS'), ('iOS', 'NN'), ('9', 'CD'), ("'App", 'CD'), ('thinning', 'VBG'), ("'", "''"), ('feature', 'NN'), ('will', 'MD'), ('give', 'VB'), ('your', 'PRP$'), ('phone', 'NN'), ("'s", 'POS'), ('storage', 'NN'), ('a', 'DT'), ('boost', 'NN')]
Apple's iOS 9 'App thinning' feature will give your phone's storage a boost
{'neg': 0.0, 'neu': 0.787, 'pos': 0.213, 'compound': 0.4019}


In [6]:
### dump to arff
features = [("word count", 'NUMERIC'),
            ("contains informal pronouns", ['True', 'False']),
            ("Begins w/ question word", ['True', 'False']),
            ("Begins w/ number", ['True', 'False']),
            ("Contains 'this'", ['True', 'False']),
            ("percent stop words", 'NUMERIC'),
            ("Percent proper nouns", 'NUMERIC'),
            ("Neg sent", 'NUMERIC'),
            ("label", ['0', '1'])]
data = {}
data.setdefault('attributes', features)
data.setdefault('description', '')
data.setdefault('relation', 'clickbait_sample')
data.setdefault('data', [])
for item in instances:
    if(item.truthClass == '1'):
        print(item.postText[0])
        print(item.attributes)
    data['data'].append(item.attributes)

with open('sample_train.arff', 'w') as f:
    f.write(arff.dumps(data))

U.S. Soccer should start answering tough questions about Hope Solo, @eric_adelson writes.
(12, False, False, 0, 0, 0.104, 0.17, 0.42, '1')
(16, True, False, 0, 0, 0.293, 0.31, 0.0, '1')
13 classic ’00s songs that were actually meant for other artists
(11, False, False, True, 0, 0.0, 0.36, 0.0, '1')
Tourists detained in Malaysia for getting naked on sacred mountain:
(10, False, False, 0, 0, 0.231, 0.3, 0.1, '1')
The brutal dictatorship the world keeps ignoring
(7, False, False, 0, 0, 0.576, 0.14, 0.0, '1')
RT @BBCWalesNews: Caerphilly farmer may get payout worth hundreds of thousands after 24 year wait
(15, False, False, 0, 0, 0.0, 0.13, 0.2, '1')
Man dies when car plunges from parking garage
(8, False, False, 0, 0, 0.0, 0.25, 0.0, '1')
5 inconsistencies in 'Jurassic World' that will drive scientists crazy @BI_Video
(11, False, False, True, 0, 0.211, 0.27, 0.27, '1')
How do dogs donate blood?
(5, False, True, 0, 0, 0.0, 0.2, 0.0, '1')
These global warming skeptics have their own confere

Joko #Widodo expected to talk trade, trust, tourism and foreign tensions on Australia trip
(14, False, False, 0, 0, 0.15, 0.21, 0.21, '1')
"My Life as a Zucchini" is Switzerland’s magical animated entry in this year’s Oscars. @JoeMorgenstern's review:
(16, False, False, 0, 0, 0.0, 0.44, 0.44, '1')
President Trump tweets criticism of FBI for its inability to stop leaks to the media:
(15, False, False, 0, 0, 0.394, 0.4, 0.2, '1')
Washington Post's Editorial Board: The White House just told transgender students they’re on their own
(15, False, False, 0, 0, 0.0, 0.4, 0.4, '1')
It appears Magic and Bird have been talking.
(8, False, False, 0, 0, 0.0, 0.38, 0.25, '1')
In addition to all the rapes, they provide a convenient excuse to expand the surveillance state.
(16, False, False, 0, 0, 0.224, 0.44, 0.0, '1')
Hunt master investigated by police after allegedly whipping saboteur with riding crop
(12, False, False, 0, 0, 0.0, 0.25, 0.08, '1')
Model risked her life and broke the law... all for 

Young immigrants on edge waiting to see if Trump will deport them
(12, False, False, 0, 0, 0.0, 0.42, 0.08, '1')
Trump Hotels will triple locations in U.S. expansion, CEO says
(10, False, False, 0, 0, 0.0, 0.2, 0.4, '1')
21 savage Kourtney Kardashian burns we still haven’t recovered from
(10, True, False, True, 0, 0.25, 0.4, 0.2, '1')
SENATOR SCHUMER: Trump should label China a 'currency manipulator'
(9, False, False, 0, 0, 0.0, 0.22, 0.22, '1')
Missing woman's relatives find missing man's body in Missouri |
(10, False, False, 0, 0, 0.386, 0.1, 0.1, '1')
Six killed as helicopter crashes in central Italy close to avalanche site
(12, False, False, 0, 0, 0.29, 0.25, 0.17, '1')
If you think you're losing your memory, you might be right
(11, True, False, 0, 0, 0.206, 0.45, 0.0, '1')
Analysis: Trump kills TPP, giving China its first big win
(10, False, False, 0, 0, 0.21, 0.1, 0.3, '1')
Watchdog group’s Trump suit hinges on risky legal argument
(9, False, False, 0, 0, 0.364, 0.22, 0.33, '1')


(7, False, False, 0, 0, 0.0, 0.14, 0.57, '1')
Coming this fall
(3, False, False, 0, 0, 0.0, 0.33, 0.0, '1')
A refugee from Congo swapped a nursing position for a dishwashing job to build a life in the U.S.
(19, False, False, 0, 0, 0.0, 0.42, 0.11, '1')
Ivanka Trump and Jared Kushner pay $15,000 a month to rent their new home in D.C.'s Kalorama neighborhood
(18, False, False, 0, 0, 0.08, 0.28, 0.33, '1')
Everything we learned from our 'Spider-man: Homecoming' set visit
(9, True, False, 0, 0, 0.0, 0.33, 0.0, '1')
Dems have enough votes to filibuster Gorsuch, increasing odds of 'nuclear option'
(12, False, False, 0, 0, 0.0, 0.25, 0.08, '1')
It's official: Millennials are reckless drivers
(6, False, False, 0, 0, 0.351, 0.17, 0.0, '1')
South Africa's main opposition party starts a disciplinary process against its former leader
(13, False, False, 0, 0, 0.0, 0.23, 0.15, '1')
These 9 states are where taxpayers get the most for their tax dollars
(13, False, False, 0, 0, 0.0, 0.46, 0.0, '1')
Thi

(12, False, True, 0, 0, 0.0, 0.08, 0.08, '1')
After promotions, managers must learn to shift gears
(8, False, False, 0, 0, 0.0, 0.12, 0.0, '1')
.@Infosys employee strangled to death inside her office!
(8, False, False, 0, 0, 0.562, 0.25, 0.12, '1')
Miracle milestone: premature triplets start first day of #school #goodnews  🎒
(12, False, False, 0, 0, 0.0, 0.08, 0.08, '1')
How big a problem is crime committed by immigrants?
(9, False, True, 0, 0, 0.466, 0.33, 0.0, '1')
America must lead the free world – the alternative is chaos | Natalie Nougayrède
(14, False, False, 0, 0, 0.218, 0.21, 0.29, '1')
$30 BILLION HEDGE FUND: We may be at 'the dawn of a new era'
(14, True, False, 0, 0, 0.0, 0.29, 0.21, '1')
Master networker @JonLevyTLB explains how to meet the most interesting people at any event
(14, False, False, 0, 0, 0.0, 0.43, 0.14, '1')
European space bosses reveal plan for 50m high lunar 'dome of contemplation'
(12, False, False, 0, 0, 0.0, 0.17, 0.0, '1')
Trump's tweet about Chelsea Ma

Searching for a job? Pay attention to these 5 mistakes that keep you from getting interviews
(16, True, False, 0, 0, 0.245, 0.44, 0.06, '1')
Two-thirds of all cancerous mutations occur entirely by chance
(9, False, False, 0, 0, 0.0, 0.33, 0.0, '1')
Beautiful vacations on the dime: here’s 30 cheap but beautiful places to go in 2017
(15, False, False, 0, 0, 0.0, 0.47, 0.0, '1')
John Mayer on why he left pop music’s A-list and how ready he is, emotionally and musically, to return
(19, False, False, 0, 0, 0.0, 0.53, 0.16, '1')
Is there more to this #LondonAttacks photo than meets the eye?
(11, False, False, 0, 0, 0.0, 0.55, 0.09, '1')
Muslim teen denied entry into US despite being a citizen
(10, False, False, 0, 0, 0.266, 0.3, 0.2, '1')
Americans leaving large debts when they die
(7, False, False, 0, 0, 0.394, 0.29, 0.0, '1')
Pakistan reinstates secret military courts despite criticism
(7, False, False, 0, 0, 0.0, 0.0, 0.14, '1')
Everything you need to know about the #Sweet16
(8, True, Fal