In [1]:
import nltk 
from nltk.corpus import stopwords
import json
import arff # https://pypi.python.org/pypi/liac-arff
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter

contractionsFile = open("english-contractions-list.txt", "r")
contractions = []
for line in contractionsFile:
    contractions = line.split(',')

stopWords = set(stopwords.words('english'))




In [2]:
#object that holds a lot of our data

class NewsObject:
    id = '0'
    targetTitle = []
    targetDescription = ""
    targetKeywords = []
    targetParagraphs = []
    targetCaptions = []
    postText = []
    postMedia = []
    postTimestamp = ''
    #truthMedian = ''
    #truthMean = ''
    #truthMode = ''
    truthClass = ""
    #truthJudgments = []
    attributes = ()
    
    def __init__(self, line):
        
        self.id = line['id']
        self.targetTitle= line['targetTitle']
        self.targetKeywords = line['targetKeywords']
        self.targetParagraphs = line['targetParagraphs']
        self.targetCaptions = line['targetCaptions']
        self.postText = line['postText']
        self.postMedia = line['postMedia']
        self.postTimestamp = line['postTimestamp']
        
    def addTruth(self, line):
        #self.truthMedian = line['truthMedian']
        #self.truthMean = line['truthMean']
        #self.truthMode = line['truthMode']
        if line['truthClass'] == 'clickbait':
            self.truthClass = '1'
        else:
            self.truthClass = '0'
        #self.truthJudgments = line['truthJudgments']
        


In [3]:
#import files
instances = []

with open('dataset/instances_train.jsonl') as file:
    for line in file:
        temp = NewsObject(json.loads(line))
        instances.append(temp)
        
    
with open('dataset/truth_train.jsonl') as file2:
    i = 0
    for line in file2:
        if instances[i].id == str(i):
            instances[i].addTruth(json.loads(line))
        i += 1
#print(instances[0].targetKeywords)

In [4]:
def extractFeatures(newsObject):
    feat = {
        "wordCount": 0, "informal": False, "beginsQuestion": False, "beginsNum": False,
        "beginsThis": False, "titleStopPerc": 0, "titleProperPerc": 0, "posSent": 0,
        "neuSent": 0, "negSent": 0, "compoundSent": 0, "articleWords": 0, "titlePercVerbs": 0,
        "unigrams": 0, "bigrams": 0, "trigrams": 0, "fourgrams": 0, "fivegrams": 0, "percNouns": 0,"unigramsArticle": 0,
        "bigramsArticle": 0, "trigramsArticle": 0, "fourgramsArticle": 0, "fivegramsArticle": 0, "percAdj": 0,
        "percAdv": 0,  "percentKeywordsInTitle": 0, "sentDiffTitleBody": 0, 
        "hasContractions": False, "has!" : False, "has?": False, "hasQuote": False, 
        "bodyPercProper": 0, "bodyPercAdv": 0, "bodyQuoteNum": 0, "lengthOfCaption": 0, "PosSentDifference": 0,
        "NeuSentDifference": 0, "NegSentDifference": 0, "percPersonal" : 0, "percPersonalBody": 0, 
        "hasThat": False, "startsWithThat": False, "hasThis": False,
        "label": newsObject.truthClass
    }
    
    words = newsObject.postText[0].split(' ')
    keywords =  [x.lower() for x in newsObject.targetKeywords]
    text = nltk.word_tokenize(newsObject.postText[0])
    tokenizedList = nltk.pos_tag(text)
    
    
    feat['wordCount'] = len(words)
    
    if words[0].isdigit():
        feat['beginsNum'] = True
    elif words[0].lower() == 'this':
        feat['beginsThis'] = True
        feat['hasThis'] = True
    elif words[0].lower() == 'that':
        feat['startsWithThat'] = True
        feat['hasThat'] = True
    
    numSame = 0
    
    for word in words:
        if word.lower() in keywords:
            numSame += 1
        if word.lower() in contractions:
            feat['hasContractions'] = True
        elif word.lower() == 'this':
            feat['hasThis'] = True
        elif word.lower() == 'that':
            feat['hasThat'] = True
  
    numProper = 0
    numStop = 0
    numVerb = 0
    numNoun = 0
    numAdj = 0
    numAdv = 0
    numPersonal = 0
    
    if len(tokenizedList) > 0:
        firstWord = tokenizedList[0]
        if firstWord[1] == 'MD' or firstWord[1] == 'WRB':
            feat['beginsQuestion'] = True
    
    for partOfSpeech in tokenizedList:
        if partOfSpeech[1] == 'NNP':
            numProper += 1
        elif partOfSpeech[1] == 'VB' or partOfSpeech[1] == 'VBP' or partOfSpeech[1] == 'VBD' or partOfSpeech[1] == 'VBN':
            numVerb += 1
        elif partOfSpeech[1] == 'NN':
            numNoun += 1
        elif partOfSpeech[1] == 'PRP':
            feat['informal'] = True
            numPersonal += 1
        elif partOfSpeech[1] == 'JJ':
            numAdj += 1
        elif partOfSpeech[1] == 'RB':
            numAdv += 1
        elif partOfSpeech[1] == '.':
            if partOfSpeech[0] == '?':
                feat['has?'] = True
            elif partOfSpeech[0] == '!':
                feat['has!'] = True
        elif partOfSpeech[1] == "''" or partOfSpeech[1] == '""':
            feat['hasQuote'] = True
        if(partOfSpeech[0].lower() in stopWords):
            numStop += 1

    feat['titleStopPerc'] = round(numStop/feat['wordCount'], 2)
    feat['titleProperPerc'] = round(numProper/feat['wordCount'], 2)
    feat['titlePercVerbs'] = round(numVerb/feat['wordCount'], 2)
    feat['percPersonal'] = round(numPersonal/feat['wordCount'], 2)
    feat['percNouns'] = round(numNoun/feat['wordCount'], 2)
    feat['percAdj'] = round(numAdj/feat['wordCount'], 2)
    feat['percAdv'] = round(numAdv/feat['wordCount'], 2)
    feat['percentKeywordsInTitle'] = round(numSame/feat['wordCount'], 2)
    
    feat['unigrams'] = sum(Counter(ngrams(text,1)).values())
    feat['bigrams'] = sum(Counter(ngrams(text,2)).values())
    feat['trigrams'] = sum(Counter(ngrams(text,3)).values())
    feat['fourgrams'] = sum(Counter(ngrams(text,4)).values())
    feat['fivegrams'] = sum(Counter(ngrams(text,5)).values())
    
    sentence = newsObject.postText[0]
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(sentence)
    feat['posSent'] = ss['pos']
    feat['neuSent'] = ss['neu']
    feat['negSent'] = ss['neg']
    feat['compoundSent'] = ss['compound']
    
    paraSent = 0
    paraNegSent = 0
    paraNeuSent = 0
    paraPosSent = 0
    countPara = 0
    articleNumProp = 0
    articleNumAdv = 0
    numPersonalBody = 0 
    ss2 = {}
    for item in newsObject.targetParagraphs:
        feat['articleWords'] += len(item.split())
        text = nltk.word_tokenize(item)
        tokenizedList = nltk.pos_tag(text)
        feat['unigramsArticle'] += sum(Counter(ngrams(text,1)).values())
        feat['bigramsArticle'] += sum(Counter(ngrams(text,2)).values())
        feat['trigramsArticle'] += sum(Counter(ngrams(text,3)).values())
        feat['fourgramsArticle'] += sum(Counter(ngrams(text,4)).values())
        feat['fivegramsArticle'] += sum(Counter(ngrams(text,5)).values())
        ss2 = sid.polarity_scores(item)
        countPara += 1
        paraSent += ss2['compound']
        paraNegSent += ss2['neg']
        paraNeuSent += ss2['neu']
        paraPosSent += ss2['pos']
        for word in tokenizedList:
            if word[1] == 'NNP':
                articleNumProp += 1
            elif word[1] == 'RB':
                articleNumAdv += 1
            elif word[1] == 'PRP':
                numPersonalBody += 1
            elif word[1] == "''" or word[1] == '""':
                feat['bodyQuoteNum'] += 1
    
    if feat['articleWords'] > 0:
        feat['bodyPercProper'] = round(articleNumProp/feat['articleWords'], 2)
        feat['bodyPercAdv'] = round(articleNumAdv/feat['articleWords'], 2)
        feat['percPersonalBody'] = round(numPersonalBody/feat['articleWords'], 2)
    
    if countPara > 0:
        paraSent = paraSent/countPara
        paraNegSent = paraNegSent/countPara
        paraNeuSent = paraNeuSent/countPara
        paraPosSent = paraPosSent/countPara
    
    feat['sentDiffTitleBody'] = abs(feat['compoundSent'] - paraSent)
    feat['NegSentDifference'] = abs(feat['negSent'] - paraNegSent)
    feat['NeuSentDifference'] = abs(feat['neuSent'] - paraNeuSent)
    feat['PosSentDifference'] = abs(feat['posSent'] - paraPosSent)



    
    for item in newsObject.targetCaptions:
        feat['lengthOfCaption'] += len(item)
        
        
    featTuple = (feat['wordCount'], feat['informal'], feat['beginsQuestion'], feat['beginsNum'], 
            feat['beginsThis'], feat['titleStopPerc'], feat['titleProperPerc'], 
            feat['posSent'], feat['neuSent'], feat['negSent'], feat['compoundSent'], feat['articleWords'], 
            feat['titlePercVerbs'], feat['unigrams'], feat['bigrams'], feat['trigrams'], feat['fourgrams'], 
            feat['fivegrams'], feat['percNouns'], feat['unigramsArticle'],
            feat['bigramsArticle'], feat['trigramsArticle'], feat['fourgramsArticle'], feat['fivegramsArticle'], 
            feat['percAdj'], feat['percAdv'], feat['percentKeywordsInTitle'], 
            feat['sentDiffTitleBody'], feat['hasContractions'], feat['has!'], feat['has?'], feat['hasQuote'],
            feat['bodyPercProper'], feat['bodyPercAdv'], feat['bodyQuoteNum'], 
            feat['lengthOfCaption'], feat['PosSentDifference'], feat['NeuSentDifference'],
            feat['NegSentDifference'],feat['percPersonal'], feat['percPersonalBody'],
            feat['hasThat'], feat['startsWithThat'], feat['hasThis'],
            feat['label'])
    newsObject.attributes = featTuple
    return newsObject
            

In [5]:
for item in instances:
    item = extractFeatures(item)



In [15]:
### dump to arff
features = [("word count", 'NUMERIC'),
            ("contains informal pronouns", ['True', 'False']),
            ("Begins w/ question word", ['True', 'False']),
            ("Begins w/ number", ['True', 'False']),
            ("Begins with 'this'", ['True', 'False']),
            ("percent stop words", 'NUMERIC'),
            ("Percent proper nouns", 'NUMERIC'),
            ("Pos sent", 'NUMERIC'),
            ("Neu sent", 'NUMERIC'),
            ("Neg sent", 'NUMERIC'),
            ("Compound sent", 'NUMERIC'),
            ("Article Length", 'NUMERIC'),
            ("Percent verbs", 'NUMERIC'),
            ("Unigrams", 'NUMERIC'),
            ("Bigrams", 'NUMERIC'),
            ("Trigrams", 'NUMERIC'),
            ("Fourgrams", 'NUMERIC'),
            ("Fivegrams", 'NUMERIC'),
            ("Percent nouns", 'NUMERIC'),
            ("Unigrams article body", 'NUMERIC'),
            ("Bigrams article body", 'NUMERIC'),
            ("Trigrams article body", 'NUMERIC'),
            ("Fourgrams article body", 'NUMERIC'),
            ("Fivegrams article body", 'NUMERIC'),
            ("Percent adj", 'NUMERIC'),
            ("Percent adv", 'NUMERIC'),
            ("Percent keywords in title", 'NUMERIC'),
            ("Difference in sent body v. title", 'NUMERIC'),
            ('Has contractions', ['True', 'False']),
            ('Has exclamation', ['True', 'False']),
            ("Has question", ['True', 'False']),
            ('Title has quote', ['True','False']),
            ("Body percent proper", 'NUMERIC'),
            ("Body percent adv", 'NUMERIC'),
            ("Body num quotes", 'NUMERIC'),
            ("Length Of Caption", 'NUMERIC'),
            ("Pos Sent difference", 'NUMERIC'),
            ("Neu sent difference", 'NUMERIC'),
            ("Neg sent difference", 'NUMERIC'),
            ("Percent personal pronouns", 'NUMERIC'),
            ("Percent perosnal pronouns article", 'NUMERIC'),
            ("Has that", ['True', 'False']),
            ('Starts with that', ['True', 'False']),
            ('Has this', ['True', 'False']),
            ("label", ['0', '1'])]
data = {}
data.setdefault('attributes', features)
data.setdefault('description', '')
data.setdefault('relation', 'clickbait_sample')
data.setdefault('data', [])
for item in instances:
    if item.attributes[44] == '0':
        print(item.postText[0])
        print(item.attributes)
    data['data'].append(item.attributes)

with open('sample_train.arff', 'w') as f:
    f.write(arff.dumps(data))

Apple's iOS 9 'App thinning' feature will give your phone's storage a boost
(13, False, False, False, False, 0.23, 0.08, 0.213, 0.787, 0.0, 0.4019, 715, 0.08, 16, 15, 14, 13, 12, 0.38, 789, 756, 723, 690, 657, 0.0, 0.0, 0.15, 0.3029205882352941, False, False, False, True, 0.06, 0.04, 7, 2794, 0.15370588235294116, 0.0951764705882353, 0.02911764705882353, 0.0, 0.03, False, False, False, '0')
RT @kenbrown12: Emerging market investors are doing their best Monty Pythons--"Run away, run away"
(14, False, False, False, False, 0.21, 0.21, 0.244, 0.756, 0.0, 0.6369, 69, 0.14, 21, 20, 19, 18, 17, 0.21, 80, 77, 74, 71, 68, 0.0, 0.07, 0.0, 0.7504333333333334, False, False, False, True, 0.07, 0.04, 0, 2074, 0.127, 0.09166666666666667, 0.035333333333333335, 0.0, 0.0, False, False, False, '0')
How theme parks like Disney World left the middle class behind
(11, False, True, False, False, 0.18, 0.18, 0.2, 0.8, 0.0, 0.3612, 1745, 0.09, 11, 10, 9, 8, 7, 0.09, 2129, 2089, 2049, 2009, 1970, 0.18, 0.0, 0.0,

(12, False, False, False, False, 0.5, 0.0, 0.276, 0.724, 0.0, 0.6369, 599, 0.0, 12, 11, 10, 9, 8, 0.25, 709, 690, 671, 652, 633, 0.0, 0.0, 0.0, 0.437242105263158, False, False, False, False, 0.02, 0.03, 0, 0, 0.2115263157894737, 0.18299999999999983, 0.028526315789473688, 0.0, 0.02, False, False, False, '0')
RT @YahooSports: Officials recommend releasing Oscar Pistorius from prison on Aug. 21 to go under house arrest. http
(18, False, False, False, False, 0.22, 0.28, 0.108, 0.647, 0.246, -0.4939, 633, 0.11, 21, 20, 19, 18, 17, 0.22, 720, 700, 681, 663, 645, 0.0, 0.0, 0.0, 0.329485, False, False, False, False, 0.14, 0.06, 8, 414, 0.0463, 0.21904999999999986, 0.17379999999999998, 0.0, 0.02, False, False, False, '0')
Rebecca Minnock: grandmother of missing child is jailed -
(9, False, False, False, False, 0.22, 0.22, 0.0, 0.526, 0.474, -0.6597, 1236, 0.11, 10, 9, 8, 7, 6, 0.22, 1413, 1364, 1315, 1266, 1217, 0.0, 0.0, 0.0, 0.7040938775510204, False, False, False, False, 0.13, 0.03, 22, 781,

.@SarahPalinUSA applauds President-elect @realDonaldTrump for being a "fearless leader."
(9, False, False, False, False, 0.33, 0.33, 0.469, 0.531, 0.0, 0.6486, 159, 0.0, 15, 14, 13, 12, 11, 0.22, 187, 186, 185, 184, 183, 0.11, 0.0, 0.0, 0.19530000000000003, False, False, False, True, 0.4, 0.01, 3, 37, 0.33999999999999997, 0.257, 0.083, 0.0, 0.01, False, False, False, '0')
Hilarious tattoo fails show how things go badly wrong when the inker can't spell
(14, False, False, False, False, 0.21, 0.07, 0.124, 0.461, 0.415, -0.743, 317, 0.21, 15, 14, 13, 12, 11, 0.14, 362, 348, 334, 320, 306, 0.0, 0.21, 0.0, 0.73573125, True, False, False, False, 0.01, 0.05, 9, 3973, 0.023125000000000007, 0.20087500000000008, 0.3026875, 0.0, 0.04, False, False, False, '0')
Morocco 'bans sale and production' of the burka
(8, False, False, False, False, 0.38, 0.12, 0.0, 1.0, 0.0, 0.0, 611, 0.0, 9, 8, 7, 6, 5, 0.38, 711, 687, 663, 639, 615, 0.0, 0.0, 0.0, 0.31801250000000003, False, False, False, True, 0.09, 0.03

Pistol-packing granny scares off armed crook at home
(8, False, False, False, False, 0.25, 0.0, 0.0, 0.745, 0.255, -0.34, 169, 0.12, 8, 7, 6, 5, 4, 0.38, 198, 189, 180, 171, 162, 0.12, 0.0, 0.0, 0.20417777777777782, False, False, False, False, 0.07, 0.05, 3, 0, 0.06344444444444444, 0.12100000000000011, 0.18444444444444447, 0.0, 0.11, False, False, False, '0')
The top movies of the last four years star women
(10, False, False, False, False, 0.3, 0.0, 0.167, 0.833, 0.0, 0.2023, 268, 0.0, 10, 9, 8, 7, 6, 0.0, 327, 318, 309, 300, 291, 0.2, 0.0, 0.0, 0.27544444444444444, False, False, False, False, 0.22, 0.05, 0, 0, 0.132, 0.09666666666666679, 0.03533333333333333, 0.0, 0.01, False, False, False, '0')
A former contestant on The Jump is telling new ones to quit before they get hurt
(16, True, False, False, False, 0.44, 0.06, 0.0, 0.805, 0.195, -0.5267, 364, 0.19, 16, 15, 14, 13, 12, 0.06, 438, 426, 414, 402, 390, 0.12, 0.0, 0.06, 0.5066833333333333, False, False, False, False, 0.12, 0.04, 1, 

(16, False, False, False, False, 0.19, 0.25, 0.075, 0.699, 0.226, -0.4215, 231, 0.06, 19, 18, 17, 16, 15, 0.25, 256, 246, 237, 228, 219, 0.06, 0.0, 0.0, 0.4189, False, False, False, False, 0.2, 0.03, 3, 57, 0.004299999999999984, 0.1522000000000001, 0.14800000000000002, 0.0, 0.02, False, False, False, '0')
In honor of the 30th anniversary of 'The Joshua Tree,' @U2 is planning a 2017 tour.
(16, False, False, False, False, 0.38, 0.25, 0.186, 0.814, 0.0, 0.4939, 128, 0.0, 20, 19, 18, 17, 16, 0.19, 155, 152, 149, 146, 143, 0.12, 0.0, 0.0, 0.26783333333333337, False, False, False, True, 0.18, 0.04, 4, 19, 0.005333333333333357, 0.019000000000000017, 0.013666666666666667, 0.0, 0.03, False, False, False, '0')
Spree of Obama actions revives GOP concerns over 'midnight' regulations, agenda  via @foxnewspolitics
(14, False, False, False, False, 0.14, 0.29, 0.178, 0.822, 0.0, 0.3818, 932, 0.0, 16, 15, 14, 13, 12, 0.14, 1095, 1066, 1037, 1009, 981, 0.0, 0.0, 0.0, 0.18934482758620685, False, False, F

(15, False, False, False, False, 0.13, 0.13, 0.0, 0.757, 0.243, -0.6705, 1239, 0.0, 18, 17, 16, 15, 14, 0.2, 1425, 1381, 1337, 1296, 1256, 0.0, 0.0, 0.0, 0.37389545454545453, False, False, False, False, 0.13, 0.04, 16, 3424, 0.05081818181818181, 0.016886363636363644, 0.06774999999999998, 0.0, 0.03, False, False, False, '0')
One in three victims of domestic abuse are now men
(10, False, False, False, False, 0.4, 0.0, 0.0, 0.552, 0.448, -0.7579, 550, 0.1, 10, 9, 8, 7, 6, 0.1, 622, 602, 582, 562, 542, 0.1, 0.1, 0.0, 0.19078499999999998, False, False, False, False, 0.05, 0.04, 2, 18, 0.04229999999999999, 0.21044999999999991, 0.25264999999999993, 0.0, 0.02, False, False, False, '0')
Nasa and Stephen Hawking working on nano-starship that can travel 1/5th speed of light
(14, False, False, False, False, 0.36, 0.21, 0.0, 1.0, 0.0, 0.0, 335, 0.07, 14, 13, 12, 11, 10, 0.14, 385, 372, 359, 346, 333, 0.07, 0.0, 0.0, 0.19750769230769233, False, False, False, False, 0.12, 0.04, 1, 0, 0.08976923076923

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


Wayne Rooney has equalled the record – but what does the future hold for striker?  via @guardian_sport
(18, False, False, False, False, 0.39, 0.11, 0.0, 1.0, 0.0, 0.0, 717, 0.06, 19, 18, 17, 16, 15, 0.39, 834, 820, 806, 792, 778, 0.0, 0.0, 0.0, 0.5573142857142858, False, False, True, False, 0.1, 0.05, 0, 133, 0.1545, 0.17978571428571433, 0.02514285714285714, 0.0, 0.06, False, False, False, '0')
Not as big as the one your Iran deal's gonna cause.
(11, False, False, False, False, 0.45, 0.09, 0.0, 1.0, 0.0, 0.0, 230, 0.0, 14, 13, 12, 11, 10, 0.27, 265, 250, 235, 222, 212, 0.09, 0.18, 0.0, 0.13650666666666667, False, False, False, False, 0.31, 0.03, 0, 1637, 0.07246666666666667, 0.07366666666666666, 0.0012, 0.0, 0.02, False, False, False, '0')
Morecambe supporters pay manager Jim Bentley’s £1,000 FA charge  via @guardian_sport
(12, False, False, False, False, 0.08, 0.42, 0.218, 0.677, 0.105, 0.3612, 207, 0.17, 14, 13, 12, 11, 10, 0.25, 233, 229, 225, 221, 217, 0.08, 0.0, 0.0, 0.450075, Fal

In [13]:
text1 = nltk.word_tokenize("that sux?")
tokenizedList = nltk.pos_tag(text1)
grammar = "NP: {<DT>?<JJ>*<NN>}"

#cp = nltk.RegexpParser(grammar) 
#result = cp.parse(tokenizedList)
print(instances[0].targetKeywords)
print(tokenizedList) 
print(stopWords)

Apple,gives,gigabytes,iOS,9,app,thinning,feature,finally,phone,s,storage,boost
[('that', 'DT'), ('sux', 'NN'), ('?', '.')]
{'wouldn', 'ourselves', 'only', 'wasn', 'off', 'then', 'mustn', 'll', 'she', 'the', 'too', 'why', 'down', 'don', 'having', 'into', 'by', 'himself', 'those', 'how', 'am', 'o', 'no', 'nor', 'a', 'under', 'what', 'both', 'other', 'at', 've', 'just', 'its', 'against', 'all', 'because', 'or', 'ain', 'yours', 'should', 'who', 'ma', 'yourself', 'yourselves', 'is', 'an', 'further', 'hers', 'were', 'about', 'on', 'between', 'isn', 'from', 'it', 'did', 'than', 'has', 'our', 'up', 'not', 'are', 'him', 'was', 'y', 'won', 'when', 'hasn', 'needn', 'd', 'of', 'his', 'that', 'shan', 'and', 'shouldn', 'does', 'there', 'ours', 'you', 'each', 'most', 'they', 'had', 'do', 'her', 'theirs', 'own', 'herself', 'this', 'have', 'i', 'doesn', 'in', 'some', 'we', 'before', 'with', 'myself', 'to', 'out', 'me', 'doing', 'my', 'again', 'here', 'weren', 's', 'as', 'such', 'themselves', 'while', '