In [9]:
import nltk 
from nltk.corpus import stopwords
from nltk.twitter import Query, Streamer, Twitter, TweetViewer, TweetWriter, credsfromfile
import json
import arff # https://pypi.python.org/pypi/liac-arff
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import word_tokenize, TweetTokenizer
from nltk.util import ngrams
from collections import Counter

contractionsFile = open("english-contractions-list.txt", "r")
contractions = []
for line in contractionsFile:
    contractions = line.split(',')

stopWords = set(stopwords.words('english'))
sid = SentimentIntensityAnalyzer()
tknz = TweetTokenizer()
oauth = {"app_key":"", 
           "app_secret":"", 
           "oauth_token":"", 
           "oauth_token_secret":""}

In [21]:
#object that holds a lot of our data

class NewsObject:
    id = '0'
    targetTitle = []
    targetDescription = ""
    targetKeywords = []
    targetParagraphs = []
    targetCaptions = []
    postText = []
    postMedia = []
    postTimestamp = ''
    #truthMedian = ''
    #truthMean = ''
    #truthMode = ''
    truthClass = "0"
    #truthJudgments = []
    attributes = ()
    
    def __init__(self, line):
        
        self.id = line['id']
        self.targetTitle= line['targetTitle']
        self.targetKeywords = line['targetKeywords']
        self.targetParagraphs = line['targetParagraphs']
        self.targetCaptions = line['targetCaptions']
        self.postText = line['postText']
        self.postMedia = line['postMedia']
        self.postTimestamp = line['postTimestamp']
        
    def addTruth(self, line):
        #self.truthMedian = line['truthMedian']
        #self.truthMean = line['truthMean']
        #self.truthMode = line['truthMode']
        if line['truthClass'] == 'clickbait':
            self.truthClass = '1'
        else:
            self.truthClass = '0'
        #self.truthJudgments = line['truthJudgments']
        


In [22]:
#import files
instances = []

with open('dataset/instances_test.jsonl') as file:
    for line in file:
        temp = NewsObject(json.loads(line))
        instances.append(temp)
        
    


In [17]:
with open('dataset/truth_train.jsonl') as file2:
    i = 0
    for line in file2:
        truth = json.loads(line)
        instances[int(truth['id'])].addTruth(truth)
#print(instances[0].targetKeywords)

In [23]:
def extractFeatures(newsObject):
    feat = {
        "wordCount": 0, "informal": False, "beginsQuestion": False, "beginsNum": False,
        "beginsThis": False, "titleStopPerc": 0, "titleProperPerc": 0, "posSent": 0,
        "neuSent": 0, "negSent": 0, "compoundSent": 0, "articleWords": 0, "titlePercVerbs": 0,
        "unigrams": 0, "bigrams": 0, "trigrams": 0, "fourgrams": 0, "fivegrams": 0, "percNouns": 0,"unigramsArticle": 0,
        "bigramsArticle": 0, "trigramsArticle": 0, "fourgramsArticle": 0, "fivegramsArticle": 0, "percAdj": 0,
        "percAdv": 0,  "percentKeywordsInTitle": 0, "sentDiffTitleBody": 0, 
        "hasContractions": False, "has!" : False, "has?": False, "hasQuote": False, 
        "bodyPercProper": 0, "bodyPercAdv": 0, "bodyQuoteNum": 0, "lengthOfCaption": 0, "PosSentDifference": 0,
        "NeuSentDifference": 0, "NegSentDifference": 0, "percPersonal" : 0, "percPersonalBody": 0, 
        "hasThat": False, "startsWithThat": False, "hasThis": False, "hasMedia": False, "isRetweet": False, 
        "hasMention": False, "hasHashtag": False,
        "label": newsObject.truthClass
    }
    
    words = newsObject.postText[0].split(' ')
    keywords =  [x.lower() for x in newsObject.targetKeywords]
    text = nltk.word_tokenize(newsObject.postText[0])
    tokenizedList = nltk.pos_tag(text)
    twitterTokenized = tknz.tokenize(newsObject.postText[0])

    
    if len(newsObject.postMedia) > 0:
        feat['hasMedia'] = True
    
    if len(twitterTokenized) > 0:
        if twitterTokenized[0] == 'RT':
            feat['isRetweet'] = True
    
    for tWord in twitterTokenized:
        if tWord[0] == '@':
            feat['hasMention'] = True
        elif tWord[0] == '#':
            feat['hasHashtag'] = True
    
    feat['wordCount'] = len(words)
    
    if words[0].isdigit():
        feat['beginsNum'] = True
    elif words[0].lower() == 'this':
        feat['beginsThis'] = True
        feat['hasThis'] = True
    elif words[0].lower() == 'that':
        feat['startsWithThat'] = True
        feat['hasThat'] = True
    
    numSame = 0
    
    for word in words:
        if word.lower() in keywords:
            numSame += 1
        if word.lower() in contractions:
            feat['hasContractions'] = True
        elif word.lower() == 'this':
            feat['hasThis'] = True
        elif word.lower() == 'that':
            feat['hasThat'] = True
  
    numProper = 0
    numStop = 0
    numVerb = 0
    numNoun = 0
    numAdj = 0
    numAdv = 0
    numPersonal = 0
    
    if len(tokenizedList) > 0:
        firstWord = tokenizedList[0]
        if firstWord[1] == 'MD' or firstWord[1] == 'WRB':
            feat['beginsQuestion'] = True
    
    for partOfSpeech in tokenizedList:
        if partOfSpeech[1] == 'NNP':
            numProper += 1
        elif partOfSpeech[1] == 'VB' or partOfSpeech[1] == 'VBP' or partOfSpeech[1] == 'VBD' or partOfSpeech[1] == 'VBN':
            numVerb += 1
        elif partOfSpeech[1] == 'NN':
            numNoun += 1
        elif partOfSpeech[1] == 'PRP':
            feat['informal'] = True
            numPersonal += 1
        elif partOfSpeech[1] == 'JJ':
            numAdj += 1
        elif partOfSpeech[1] == 'RB':
            numAdv += 1
        elif partOfSpeech[1] == '.':
            if partOfSpeech[0] == '?':
                feat['has?'] = True
            elif partOfSpeech[0] == '!':
                feat['has!'] = True
        elif partOfSpeech[1] == "''" or partOfSpeech[1] == '""':
            feat['hasQuote'] = True
        if(partOfSpeech[0].lower() in stopWords):
            numStop += 1

    feat['titleStopPerc'] = round(numStop/feat['wordCount'], 2)
    feat['titleProperPerc'] = round(numProper/feat['wordCount'], 2)
    feat['titlePercVerbs'] = round(numVerb/feat['wordCount'], 2)
    feat['percPersonal'] = round(numPersonal/feat['wordCount'], 2)
    feat['percNouns'] = round(numNoun/feat['wordCount'], 2)
    feat['percAdj'] = round(numAdj/feat['wordCount'], 2)
    feat['percAdv'] = round(numAdv/feat['wordCount'], 2)
    feat['percentKeywordsInTitle'] = round(numSame/feat['wordCount'], 2)
    
    feat['unigrams'] = sum(Counter(ngrams(text,1)).values())
    feat['bigrams'] = sum(Counter(ngrams(text,2)).values())
    feat['trigrams'] = sum(Counter(ngrams(text,3)).values())
    feat['fourgrams'] = sum(Counter(ngrams(text,4)).values())
    feat['fivegrams'] = sum(Counter(ngrams(text,5)).values())
    
    sentence = newsObject.postText[0]
    ss = sid.polarity_scores(sentence)
    feat['posSent'] = ss['pos']
    feat['neuSent'] = ss['neu']
    feat['negSent'] = ss['neg']
    feat['compoundSent'] = ss['compound']
    
    paraSent = 0
    paraNegSent = 0
    paraNeuSent = 0
    paraPosSent = 0
    countPara = 0
    articleNumProp = 0
    articleNumAdv = 0
    numPersonalBody = 0 
    ss2 = {}
    for item in newsObject.targetParagraphs:
        feat['articleWords'] += len(item.split())
        text = nltk.word_tokenize(item)
        tokenizedList = nltk.pos_tag(text)
        feat['unigramsArticle'] += sum(Counter(ngrams(text,1)).values())
        feat['bigramsArticle'] += sum(Counter(ngrams(text,2)).values())
        feat['trigramsArticle'] += sum(Counter(ngrams(text,3)).values())
        feat['fourgramsArticle'] += sum(Counter(ngrams(text,4)).values())
        feat['fivegramsArticle'] += sum(Counter(ngrams(text,5)).values())
        ss2 = sid.polarity_scores(item)
        countPara += 1
        paraSent += ss2['compound']
        paraNegSent += ss2['neg']
        paraNeuSent += ss2['neu']
        paraPosSent += ss2['pos']
        for word in tokenizedList:
            if word[1] == 'NNP':
                articleNumProp += 1
            elif word[1] == 'RB':
                articleNumAdv += 1
            elif word[1] == 'PRP':
                numPersonalBody += 1
            elif word[1] == "''" or word[1] == '""':
                feat['bodyQuoteNum'] += 1
    
    if feat['articleWords'] > 0:
        feat['bodyPercProper'] = round(articleNumProp/feat['articleWords'], 2)
        feat['bodyPercAdv'] = round(articleNumAdv/feat['articleWords'], 2)
        feat['percPersonalBody'] = round(numPersonalBody/feat['articleWords'], 2)
    
    if countPara > 0:
        paraSent = paraSent/countPara
        paraNegSent = paraNegSent/countPara
        paraNeuSent = paraNeuSent/countPara
        paraPosSent = paraPosSent/countPara
    
    feat['sentDiffTitleBody'] = abs(feat['compoundSent'] - paraSent)
    feat['NegSentDifference'] = abs(feat['negSent'] - paraNegSent)
    feat['NeuSentDifference'] = abs(feat['neuSent'] - paraNeuSent)
    feat['PosSentDifference'] = abs(feat['posSent'] - paraPosSent)



    
    for item in newsObject.targetCaptions:
        feat['lengthOfCaption'] += len(item)
        
        
    featTuple = (feat['wordCount'], feat['informal'], feat['beginsQuestion'], feat['beginsNum'], 
            feat['beginsThis'], feat['titleStopPerc'], feat['titleProperPerc'], 
            feat['posSent'], feat['neuSent'], feat['negSent'], feat['compoundSent'], feat['articleWords'], 
            feat['titlePercVerbs'], feat['unigrams'], feat['bigrams'], feat['trigrams'], feat['fourgrams'], 
            feat['fivegrams'], feat['percNouns'], feat['unigramsArticle'],
            feat['bigramsArticle'], feat['trigramsArticle'], feat['fourgramsArticle'], feat['fivegramsArticle'], 
            feat['percAdj'], feat['percAdv'], feat['percentKeywordsInTitle'], 
            feat['sentDiffTitleBody'], feat['hasContractions'], feat['has!'], feat['has?'], feat['hasQuote'],
            feat['bodyPercProper'], feat['bodyPercAdv'], feat['bodyQuoteNum'], 
            feat['lengthOfCaption'], feat['PosSentDifference'], feat['NeuSentDifference'],
            feat['NegSentDifference'],feat['percPersonal'], feat['percPersonalBody'],
            feat['hasThat'], feat['startsWithThat'], feat['hasThis'], feat['hasMedia'], feat['isRetweet'],
            feat['hasMention'], feat['hasHashtag'],
            feat['label'])
    newsObject.attributes = featTuple
    return newsObject
            

In [None]:
for item in instances:
    item = extractFeatures(item)



In [None]:
### dump to arff
features = [("word count", 'NUMERIC'),
            ("contains informal pronouns", ['True', 'False']),
            ("Begins w/ question word", ['True', 'False']),
            ("Begins w/ number", ['True', 'False']),
            ("Begins with 'this'", ['True', 'False']),
            ("percent stop words", 'NUMERIC'),
            ("Percent proper nouns", 'NUMERIC'),
            ("Pos sent", 'NUMERIC'),
            ("Neu sent", 'NUMERIC'),
            ("Neg sent", 'NUMERIC'),
            ("Compound sent", 'NUMERIC'),
            ("Article Length", 'NUMERIC'),
            ("Percent verbs", 'NUMERIC'),
            ("Unigrams", 'NUMERIC'),
            ("Bigrams", 'NUMERIC'),
            ("Trigrams", 'NUMERIC'),
            ("Fourgrams", 'NUMERIC'),
            ("Fivegrams", 'NUMERIC'),
            ("Percent nouns", 'NUMERIC'),
            ("Unigrams article body", 'NUMERIC'),
            ("Bigrams article body", 'NUMERIC'),
            ("Trigrams article body", 'NUMERIC'),
            ("Fourgrams article body", 'NUMERIC'),
            ("Fivegrams article body", 'NUMERIC'),
            ("Percent adj", 'NUMERIC'),
            ("Percent adv", 'NUMERIC'),
            ("Percent keywords in title", 'NUMERIC'),
            ("Difference in sent body v. title", 'NUMERIC'),
            ('Has contractions', ['True', 'False']),
            ('Has exclamation', ['True', 'False']),
            ("Has question", ['True', 'False']),
            ('Title has quote', ['True','False']),
            ("Body percent proper", 'NUMERIC'),
            ("Body percent adv", 'NUMERIC'),
            ("Body num quotes", 'NUMERIC'),
            ("Length Of Caption", 'NUMERIC'),
            ("Pos Sent difference", 'NUMERIC'),
            ("Neu sent difference", 'NUMERIC'),
            ("Neg sent difference", 'NUMERIC'),
            ("Percent personal pronouns", 'NUMERIC'),
            ("Percent perosnal pronouns article", 'NUMERIC'),
            ("Has that", ['True', 'False']),
            ('Starts with that', ['True', 'False']),
            ('Has this', ['True', 'False']),
            ("Has media", ['True', 'False']),
            ("Is Retweet", ['True', 'False']),
            ("Has mention", ['True', 'False']),
            ("Has Hashtag", ['True', 'False']),
            ("label", ['0', '1'])]
data = {}
data.setdefault('attributes', features)
data.setdefault('description', '')
data.setdefault('relation', 'clickbait_sample')
data.setdefault('data', [])
for item in instances:
    if item.attributes[48] == '1':
        print(item.postText[0])
        print(item.attributes)
    data['data'].append(item.attributes)

with open('sample_train.arff', 'w') as f:
    f.write(arff.dumps(data))

In [8]:
client = Streamer(**oauth)
results = twitter.cursor(twitter.search, q='#BlackLivesMatter')
print(result[0])

NameError: name 'twitter' is not defined

In [None]:
str1 = "apple"
str1[0]