In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("../../data/processed/twitter16_dataset_with_tvt.csv", lineterminator="\n")
data.head()

Unnamed: 0,tweet_id,tweet_text,label,tvt,cv_fold,tt,tvt2
0,656955120626880512,correct predictions in back to the future ii U...,false,training,1,training,validation
1,615689290706595840,.@whitehouse in rainbow colors for #scotusmarr...,true,training,3,training,training
2,613404935003217920,cops bought the alleged church shooter burger ...,false,training,2,test,training
3,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,training,3,test,training
4,714598641827246081,an open letter to trump voters from his top st...,unverified,training,1,test,training


In [2]:
labels_str = data['label'].unique().tolist()
labels_str

['false', 'true', 'unverified', 'non-rumor']

In [3]:
labels = data['label'].tolist()
labels[:10]

['false',
 'true',
 'false',
 'unverified',
 'unverified',
 'true',
 'unverified',
 'non-rumor',
 'non-rumor',
 'false']

In [4]:
raw_texts = data['tweet_text'].tolist()
raw_texts[:10]

['correct predictions in back to the future ii URL\r',
 ".@whitehouse in rainbow colors for #scotusmarriage? here's what i think about that decision: URL\r",
 'cops bought the alleged church shooter burger king hours after killings URL\r',
 '🔥ca kkk grand wizard 🔥 endorses @hillaryclinton #neverhillary #trump2016 URL\r',
 'an open letter to trump voters from his top strategist-turned-defector URL via @xojanedotcom\r',
 'god put a rainbow over the white house 🌈 URL\r',
 'craigslist ad: ‘get paid $15 an hour to protest at the trump rally’ - URL URL\r',
 '#quasimodo: dog with short spine has big heart URL URL\r',
 'need to #getcovered? most people can find a health insurance plan for as little as $75/mo: URL URL\r',
 '#wakeupamerica🇺🇸 who needs a #gun registry when #obama has all your personal information URL URL\r']

In [5]:
import string
import nltk
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer(reduce_len=True)
texts = [tokenizer.tokenize(text.encode('ascii', 'ignore').decode('utf8')) for text in raw_texts]
texts = [[t for t in text if t not in string.punctuation] for text in texts]
texts = [[t for t in text if t not in ['URL', '‘', '’']] for text in texts]

texts[:10]

[['correct', 'predictions', 'in', 'back', 'to', 'the', 'future', 'ii'],
 ['@whitehouse',
  'in',
  'rainbow',
  'colors',
  'for',
  '#scotusmarriage',
  "here's",
  'what',
  'i',
  'think',
  'about',
  'that',
  'decision'],
 ['cops',
  'bought',
  'the',
  'alleged',
  'church',
  'shooter',
  'burger',
  'king',
  'hours',
  'after',
  'killings'],
 ['ca',
  'kkk',
  'grand',
  'wizard',
  'endorses',
  '@hillaryclinton',
  '#neverhillary',
  '#trump2016'],
 ['an',
  'open',
  'letter',
  'to',
  'trump',
  'voters',
  'from',
  'his',
  'top',
  'strategist-turned-defector',
  'via',
  '@xojanedotcom'],
 ['god', 'put', 'a', 'rainbow', 'over', 'the', 'white', 'house'],
 ['craigslist',
  'ad',
  'get',
  'paid',
  '15',
  'an',
  'hour',
  'to',
  'protest',
  'at',
  'the',
  'trump',
  'rally'],
 ['#quasimodo', 'dog', 'with', 'short', 'spine', 'has', 'big', 'heart'],
 ['need',
  'to',
  '#getcovered',
  'most',
  'people',
  'can',
  'find',
  'a',
  'health',
  'insurance',
  'p

In [6]:
bigram_list = {}

for i, text in enumerate(texts):
    bigrms = nltk.bigrams(text)
    bigrms = map(' '.join, bigrms)
    
    for bgr in bigrms:
        if bgr not in bigram_list:
            bigram_list[bgr] = {label:0 for label in labels_str}
        
        bigram_list[bgr][labels[i]] += 1
        
print(f"Bigram Counts : {len(bigram_list)}")

Bigram Counts : 7710


In [7]:
bigram_df = pd.DataFrame.from_dict(bigram_list).transpose()
# bigram_df.columns = ['grams', 'false', 'true', 'unverified', 'non-rumor']
bigram_df.head()

Unnamed: 0,false,true,unverified,non-rumor
correct predictions,1,0,0,0
predictions in,1,0,0,0
in back,1,0,0,1
back to,6,0,0,0
to the,7,2,1,2


In [8]:
bigram_df.nlargest(10, 'true')

Unnamed: 0,false,true,unverified,non-rumor
white house,1,39,0,0
rainbow colors,0,20,0,0
the white,0,19,0,0
in rainbow,0,16,0,0
sydney cafe,0,12,0,0
in paris,1,11,0,1
a rainbow,0,10,0,0
parliament hill,0,9,0,0
in ottawa,0,9,0,0
of the,2,8,8,6


In [9]:
bigram_df.nlargest(10, 'false')

Unnamed: 0,false,true,unverified,non-rumor
mass shootings,30,0,0,0
steve jobs,17,0,0,0
a syrian,15,1,0,0
biological father,14,0,0,0
father was,14,0,0,0
jobs was,12,0,0,0
was adopted,12,0,0,0
adopted his,12,0,0,0
his biological,12,0,0,0
was abdulfattah,12,0,0,0


In [10]:
trigram_list = {}

for i, text in enumerate(texts):
    trigrms = nltk.trigrams(text)
    trigrms = map(' '.join, trigrms)
    
    for tgr in trigrms:
        if tgr not in trigram_list:
            trigram_list[tgr] = {label:0 for label in labels_str}
        
        trigram_list[tgr][labels[i]] += 1
        
print(f"Trigram Counts : {len(trigram_list)}")

Trigram Counts : 8220


In [11]:
trigram_df = pd.DataFrame.from_dict(trigram_list).transpose()
# trigram_df.columns = ['grams', 'false', 'true', 'unverified', 'non-rumor']
trigram_df.head()

Unnamed: 0,false,true,unverified,non-rumor
correct predictions in,1,0,0,0
predictions in back,1,0,0,0
in back to,1,0,0,0
back to the,6,0,0,0
to the future,6,0,0,0


In [12]:
trigram_df.nlargest(10, 'true')

Unnamed: 0,false,true,unverified,non-rumor
the white house,0,19,0,0
in rainbow colors,0,15,0,0
white house lit,0,8,0,0
house lit in,0,8,0,0
lit in rainbow,0,8,0,0
rainbow colors to,0,6,0,0
supermarket in paris,0,5,0,0
cpl nathan cirillo,0,5,0,0
white house is,0,5,0,0
supreme court ruling,0,5,0,0


In [13]:
trigram_df.nlargest(10, 'false')

Unnamed: 0,false,true,unverified,non-rumor
biological father was,14,0,0,0
steve jobs was,12,0,0,0
jobs was adopted,12,0,0,0
was adopted his,12,0,0,0
adopted his biological,12,0,0,0
his biological father,12,0,0,0
father was abdulfattah,12,0,0,0
was abdulfattah jandali,12,0,0,0
abdulfattah jandali a,12,0,0,0
jandali a syrian,12,0,0,0


In [14]:
bigram_df.columns

Index(['false', 'true', 'unverified', 'non-rumor'], dtype='object')

In [17]:
writer = pd.ExcelWriter('../../data/processed/twitter16_ngram_distribution.xlsx', engine='xlsxwriter')

bigram_df.to_excel(writer, sheet_name='bigram')
trigram_df.to_excel(writer, sheet_name='trigram')

writer.save()