## Common imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')
from nltk.corpus import stopwords
import re
import glob

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\scott\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load train and evaluation into one dataframe

In [2]:
train_data = pd.read_pickle('pickle_files/eval_data_formatted.pickle')
eval_data = pd.read_pickle('pickle_files/train_data_formatted.pickle')
all_data = train_data.append(eval_data)
all_data['Tweet ID'] = all_data['Tweet ID'].astype(str) #change the ID to str to avoid potential issues during aggregation
all_data = all_data.reset_index(drop=True)


## Load stopwords

In [3]:
stop = stopwords.words('english') #get stopwords from NLTK
keep = ['not'] #Waseem/Hovy did not use "not" as a stopword
stop = [word for word in stop if word not in keep] #Waseem/Hovy did not use "not" as a stopword

## Lowercase the data

In [4]:
all_data['Tweet'] = all_data['Tweet'].str.lower() #lowercase the text 

## Remove punctuation, usernames, hashtags, URLs

In [5]:
all_data['Tweet'] = all_data['Tweet'].fillna('')
p = re.compile(r'[^\w\s]+')
all_data['Tweet'] = [p.sub('', x) for x in all_data['Tweet'].tolist()] #remove the punctuation
for i in all_data.index:
    #print(i)
    #all_data.loc[i,'Bio'] =re.sub('[^A-Za-z0-9]+',"",all_data.loc[i,'Bio'])
    all_data.loc[i,'Tweet'] =re.sub("@[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Tweet'])
    all_data.loc[i,'Tweet'] =re.sub("http[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Tweet'])
    all_data.loc[i,'Tweet'] =re.sub("#[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Tweet'])
print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


In [6]:
all_data.Tweet

0       matt is this the same nicola sturgeon thats be...
1            you know nothing of nicola sturgeon atkinson
2       sturgeon letter to new pm demands essential al...
3       scotland had a ref and lost they need permissi...
4               nicola sturgeon depressing numpty go away
5                      i actually despise nicola sturgeon
6       what brexit means for scotland a qampa with fi...
7                           leo sayer and nicola sturgeon
8                                      so vile eh\n\n\n\n
9       yes but when calling someone far right for act...
10      virtue signalling reaches a new high try solvi...
11      and for some reason a heavily pregnant nicola ...
12      that boot sturgeon really needs fucked in the ...
13      the just get in from nicola sturgeon at the en...
14      sturgeon we may need to accelerate indyref2 plans
15      looking forward to next weeks instalment\n\nal...
16      if only youd turn your gaze onto what is happe...
17      heres 

## Remove stopwords

In [7]:
all_data['Tweet'] = all_data['Tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) #remove stopwords
print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


## Tokenize

In [8]:
all_data['Tweet'] = all_data['Tweet'].apply(word_tokenize) #tokenize the text
print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


## Stem the tokens

In [9]:
ps = PorterStemmer()

for i in all_data.index:
    for w in range(0,len(all_data.Tweet[i])):
        all_data.Tweet[i][w] = ps.stem(all_data.Tweet[i][w])

print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


## Function for finding word ngrams

In [10]:
def create_ngrams(prog, folder, n_gram_size,file):

    tweet_ids = [] #initialise list of tweet ids
    n_grams = [] #initialise list of n_grams



    progress_dumps = prog #memory becomes a problem with this code.  Set a value n whereby the processed lists of tweets will be saved
                         #to disk every n tweets and memory will be cleared***
    location = folder + '*.pickle'
    print(folder)
    x = glob.glob(folder)
    print(len(x))
    if len(x)<2:
        for i in all_data.index:

            tweet_id = all_data['Tweet ID'][i]                                  #get the tweet id
            tokens = all_data.Tweet[i]                                          #get the associated tokens
            n=n_gram_size                                                   #set the n_gram length***
            if len(all_data.Tweet[i])==0:
                n_grams = np.append(n_grams,'zzzz')                                 #if there are no tokens add "zzzz" for this tweet
                tweet_ids = np.append(tweet_ids,tweet_id)
            else:
                for c in range(0,len(all_data.Tweet[i])-n+1):
                    n_gram = ' '.join(all_data.Tweet[i][c:c+n])
                    n_grams = np.append(n_grams,n_gram)               #add n_gram to list
                    tweet_ids = np.append(tweet_ids,tweet_id) 

            if (i//progress_dumps==i/progress_dumps):                       #save progress and clear memory
                filename = folder + 'progress_' + str(i)+'.pickle'          #set folder and filename***
                df = pd.DataFrame({'tweet_id':tweet_ids,
                                   'n_gram':n_grams})                       #create dataframe
                df.to_pickle(filename)                                      #save
                del df                                                      #clear memory
                tweet_ids = []                                              #reinitialise list of tweet ids
                n_grams = []                                                #reinitialise list of n_grams
                print(i)                                                    #print progress

            if (i==max(all_data.index)):                                          #as above but for the last chunk of tweets
                filename = folder + 'progress_' + str(i)+'.pickle'
                df = pd.DataFrame({'tweet_id':tweet_ids,
                                   'n_gram':n_grams})
                df.to_pickle(filename)
                del df
                tweet_ids = []
                n_grams = [] 
                print(i)
    else:
        print("Files already exist in this folder.")

## Find and save word ngrams

In [11]:
progs = [500,500,500,500]
folders = ['features/tweet_word_4grams/','features/tweet_word_3grams/','features/tweet_word_2grams/','features/tweet_word_1grams/']
n_gram_sizes = [4,3,2,1]

for prog, folder, n_gram_size in zip(progs, folders, n_gram_sizes):
    print('n_gram_size = ' + str(n_gram_size))
    create_ngrams(prog, folder, n_gram_size, all_data)

n_gram_size = 4
features/tweet_word_4grams/
1
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5103
n_gram_size = 3
features/tweet_word_3grams/
1
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5103
n_gram_size = 2
features/tweet_word_2grams/
1
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5103
n_gram_size = 1
features/tweet_word_1grams/
1
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5103


In [12]:
x = pd.read_pickle('features/tweet_word_3grams/progress_3500.pickle')
print(x.head(20))

                      n_gram             tweet_id
0         your argu independ  1148665559325794305
1       argu independ nicola  1148665559325794305
2   independ nicola sturgeon  1148665559325794305
3       nicola sturgeon sinc  1148665559325794305
4         sturgeon sinc that  1148665559325794305
5           sinc that speech  1148665559325794305
6          not meet ordinari  1155940342098644993
7         meet ordinari scot  1155940342098644993
8        ordinari scot visit  1155940342098644993
9             scot visit big  1155940342098644993
10          visit big fearti  1155940342098644993
11       big fearti unfortun  1155940342098644993
12       fearti unfortun ive  1155940342098644993
13          unfortun ive got  1155940342098644993
14             ive got heavi  1155940342098644993
15           got heavi timet  1155940342098644993
16           heavi timet ive  1155940342098644993
17             timet ive got  1155940342098644993
18                ive got go  1155940342098644993
