In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("../../data/processed/twitter15_dataset_with_tvt.csv", lineterminator="\n")
data.head()

Unnamed: 0,tweet_id,tweet_text,label,tvt,cv_fold,tt,tvt2
0,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,training,1,training,training
1,714598641827246081,an open letter to trump voters from his top st...,unverified,training,1,test,training
2,691809004356501505,america is a nation of second chances —@potus ...,non-rumor,training,2,training,training
3,693204708933160960,"brandon marshall visits and offers advice, sup...",non-rumor,training,1,training,training
4,551099691702956032,rip elly may clampett: so sad to learn #beverl...,true,training,3,training,training


In [2]:
labels_str = data['label'].unique().tolist()
labels_str

['unverified', 'non-rumor', 'true', 'false']

In [3]:
labels = data['label'].tolist()
labels[:10]

['unverified',
 'unverified',
 'non-rumor',
 'non-rumor',
 'true',
 'non-rumor',
 'unverified',
 'true',
 'unverified',
 'false']

In [4]:
raw_texts = data['tweet_text'].tolist()
raw_texts[:10]

['🔥ca kkk grand wizard 🔥 endorses @hillaryclinton #neverhillary #trump2016 URL\r',
 'an open letter to trump voters from his top strategist-turned-defector URL via @xojanedotcom\r',
 'america is a nation of second chances —@potus on new reforms to solitary confinement: URL URL\r',
 'brandon marshall visits and offers advice, support to brother of fallen hero zaevion dobson: URL URL\r',
 'rip elly may clampett: so sad to learn #beverlyhillbillies star donna douglas has passed away. URL\r',
 'former 3 doors down guitarist matt roberts has died at age 38, according to his father. URL URL\r',
 'craigslist ad: ‘get paid $15 an hour to protest at the trump rally’ - URL URL\r',
 'just in: missing afghan soldiers found trying to enter canada near niagara falls URL URL\r',
 'the day #ferguson cops told a dirty, bloody lie (via @thedailybeast): URL URL\r',
 "#riphulkhogan my heart is ripping like your shirt. wwe'll miss you.\r"]

In [5]:
import string
import nltk
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer(reduce_len=True)
texts = [tokenizer.tokenize(text.encode('ascii', 'ignore').decode('utf8')) for text in raw_texts]
texts = [[t for t in text if t not in string.punctuation] for text in texts]
texts = [[t for t in text if t not in ['URL', '‘', '’']] for text in texts]

texts[:10]

[['ca',
  'kkk',
  'grand',
  'wizard',
  'endorses',
  '@hillaryclinton',
  '#neverhillary',
  '#trump2016'],
 ['an',
  'open',
  'letter',
  'to',
  'trump',
  'voters',
  'from',
  'his',
  'top',
  'strategist-turned-defector',
  'via',
  '@xojanedotcom'],
 ['america',
  'is',
  'a',
  'nation',
  'of',
  'second',
  'chances',
  '@potus',
  'on',
  'new',
  'reforms',
  'to',
  'solitary',
  'confinement'],
 ['brandon',
  'marshall',
  'visits',
  'and',
  'offers',
  'advice',
  'support',
  'to',
  'brother',
  'of',
  'fallen',
  'hero',
  'zaevion',
  'dobson'],
 ['rip',
  'elly',
  'may',
  'clampett',
  'so',
  'sad',
  'to',
  'learn',
  '#beverlyhillbillies',
  'star',
  'donna',
  'douglas',
  'has',
  'passed',
  'away'],
 ['former',
  '3',
  'doors',
  'down',
  'guitarist',
  'matt',
  'roberts',
  'has',
  'died',
  'at',
  'age',
  '38',
  'according',
  'to',
  'his',
  'father'],
 ['craigslist',
  'ad',
  'get',
  'paid',
  '15',
  'an',
  'hour',
  'to',
  'protes

In [6]:
bigram_list = {}

for i, text in enumerate(texts):
    bigrms = nltk.bigrams(text)
    bigrms = map(' '.join, bigrms)
    
    for bgr in bigrms:
        if bgr not in bigram_list:
            bigram_list[bgr] = {label:0 for label in labels_str}
        
        bigram_list[bgr][labels[i]] += 1
        
print(f"Bigram Counts : {len(bigram_list)}")

Bigram Counts : 13842


In [7]:
bigram_df = pd.DataFrame.from_dict(bigram_list).transpose()
# bigram_df.columns = ['grams', 'false', 'true', 'unverified', 'non-rumor']
bigram_df.head()

Unnamed: 0,unverified,non-rumor,true,false
ca kkk,1,0,0,0
kkk grand,1,0,0,0
grand wizard,1,0,0,0
wizard endorses,1,0,0,0
endorses @hillaryclinton,1,0,0,0


In [8]:
bigram_df.nlargest(10, 'true')

Unnamed: 0,unverified,non-rumor,true,false
paul walker,0,0,54,0
war memorial,2,0,38,0
paul walker's,0,0,24,0
r i,0,0,21,0
i p,0,0,21,0
with paul,0,0,21,0
shot at,1,0,20,0
at the,3,8,18,1
soldier shot,2,0,18,0
died with,0,0,18,0


In [9]:
bigram_df.nlargest(10, 'false')

Unnamed: 0,unverified,non-rumor,true,false
shot down,1,0,0,26
malaysia airlines,0,0,0,16
darren wilson,3,0,1,13
iphone 6,0,0,0,13
out of,4,2,1,11
amber alert,0,0,0,11
list of,0,0,0,9
talking angela,0,0,0,9
6 plus,0,0,0,9
chick-fil-a manager,0,0,0,8


In [10]:
trigram_list = {}

for i, text in enumerate(texts):
    trigrms = nltk.trigrams(text)
    trigrms = map(' '.join, trigrms)
    
    for tgr in trigrms:
        if tgr not in trigram_list:
            trigram_list[tgr] = {label:0 for label in labels_str}
        
        trigram_list[tgr][labels[i]] += 1
        
print(f"Trigram Counts : {len(trigram_list)}")

Trigram Counts : 15399


In [11]:
trigram_df = pd.DataFrame.from_dict(trigram_list).transpose()
# trigram_df.columns = ['grams', 'false', 'true', 'unverified', 'non-rumor']
trigram_df.head()

Unnamed: 0,unverified,non-rumor,true,false
ca kkk grand,1,0,0,0
kkk grand wizard,1,0,0,0
grand wizard endorses,1,0,0,0
wizard endorses @hillaryclinton,1,0,0,0
endorses @hillaryclinton #neverhillary,1,0,0,0


In [12]:
trigram_df.nlargest(10, 'true')

Unnamed: 0,unverified,non-rumor,true,false
r i p,0,0,21,0
with paul walker,0,0,21,0
died with paul,0,0,17,0
soldier shot at,1,0,16,0
national war memorial,0,0,14,0
paul walker's death,0,0,13,0
paul walker that,0,0,13,0
war memorial in,0,0,12,0
to the driver,0,0,12,0
at war memorial,0,0,11,0


In [13]:
trigram_df.nlargest(10, 'false')

Unnamed: 0,unverified,non-rumor,true,false
iphone 6 plus,0,0,0,9
amber alert website,0,0,0,7
malaysia airlines plane,0,0,0,7
was shot down,0,0,0,7
running out of,0,0,0,7
out of chocolate,0,0,0,7
largest chocolate manufacturer,0,0,0,7
penis on a,0,0,0,6
islamic tribunal in,0,0,0,5
someone spray painted,0,0,0,5


In [14]:
bigram_df.columns

Index(['unverified', 'non-rumor', 'true', 'false'], dtype='object')

In [15]:
writer = pd.ExcelWriter('../../data/processed/twitter15_ngram_distribution.xlsx', engine='xlsxwriter')

bigram_df.to_excel(writer, sheet_name='bigram')
trigram_df.to_excel(writer, sheet_name='trigram')

writer.save()