## Common imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')
from nltk.corpus import stopwords
import re
import glob

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\scott\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load train and evaluation into one dataframe

In [2]:
train_data = pd.read_pickle('pickle_files/eval_data_formatted.pickle')
eval_data = pd.read_pickle('pickle_files/train_data_formatted.pickle')
all_data = train_data.append(eval_data)
all_data['Tweet ID'] = all_data['Tweet ID'].astype(str) #change the ID to str to avoid potential issues during aggregation
all_data = all_data.reset_index(drop=True)


## Load stopwords

In [3]:
stop = stopwords.words('english') #get stopwords from NLTK
keep = ['not'] #Waseem/Hovy did not use "not" as a stopword
stop = [word for word in stop if word not in keep] #Waseem/Hovy did not use "not" as a stopword

## Lowercase the data

In [4]:
all_data['Tweet'] = all_data['Tweet'].str.lower() #lowercase the text 

## Remove punctuation

In [5]:
p = re.compile(r'[^\w\s]+')
all_data['Tweet'] = [p.sub('', x) for x in all_data['Tweet'].tolist()] #remove the punctuation
print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


## Remove stopwords

In [8]:
all_data['Tweet'] = all_data['Tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) #remove stopwords
print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


## Tokenize

In [7]:
all_data['Tweet'] = all_data['Tweet'].apply(word_tokenize) #tokenize the text
print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


## Stem the tokens

In [8]:
ps = PorterStemmer()

for i in all_data.index:
    for w in range(0,len(all_data.Tweet[i])):
        all_data.Tweet[i][w] = ps.stem(all_data.Tweet[i][w])

print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


## reconstruct from tokens

In [9]:
for i in all_data.index:
    all_data.Tweet[i] = ' '.join(all_data.Tweet[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Function for finding char ngrams

In [10]:
def create_ngrams(prog, folder, n_gram_size,file):

    tweet_ids = [] #initialise list of tweet ids
    n_grams = [] #initialise list of n_grams

    n = n_gram_size

    progress_dumps = prog #memory becomes a problem with this code.  Set a value n whereby the processed lists of tweets will be saved
                         #to disk every n tweets and memory will be cleared***
    location = folder + '*.pickle'
    print(folder)
    x = glob.glob(folder)
    print(len(x))
    if len(x)<2:
        for i in all_data.index:

            tweet_id = all_data['Tweet ID'][i]
            all_data.Tweet[i] = all_data.Tweet[i].replace(" ", "_")            #replace whitespace with underscore
            if len(all_data.Tweet[i])==0:
                n_grams = np.append(n_grams,'zzzz')                                 #if there are no tokens add "zzzz" for this tweet
                tweet_ids = np.append(tweet_ids,tweet_id)
            else:
                for c in range(0,len(all_data.Tweet[i])-n+1):                           #iterate along the length of each token
                    n_grams = np.append(n_grams,all_data.Tweet[i][c:c+n])               #add n_gram to list
                    tweet_ids = np.append(tweet_ids,tweet_id)
            if (i//progress_dumps==i/progress_dumps):                       #save progress and clear memory
                filename = folder + 'progress_' + str(i)+'.pickle'          #set folder and filename***
                df = pd.DataFrame({'tweet_id':tweet_ids,
                                   'n_gram':n_grams})                       #create dataframe
                df.to_pickle(filename)                                      #save
                del df                                                      #clear memory
                tweet_ids = []                                              #reinitialise list of tweet ids
                n_grams = []                                                #reinitialise list of n_grams
                print(i)                                                    #print progress

            if (i==max(all_data.index)):                                          #as above but for the last chunk of tweets
                filename = folder + 'progress_' + str(i)+'.pickle'
                df = pd.DataFrame({'tweet_id':tweet_ids,
                                   'n_gram':n_grams})
                df.to_pickle(filename)
                del df
                tweet_ids = []
                n_grams = [] 
                print(i)
    else:
        print("Files already exist in this folder.")

## Find and save char ngrams

In [11]:
progs = [500,500,500,500]
folders = ['features/tweet_char_4grams/','features/tweet_char_3grams/','features/tweet_char_2grams/','features/tweet_char_1grams/']
n_gram_sizes = [4,3,2,1]

for prog, folder, n_gram_size in zip(progs, folders, n_gram_sizes):
    print('n_gram_size = ' + str(n_gram_size))
    create_ngrams(prog, folder, n_gram_size, all_data)

n_gram_size = 4
features/tweet_char_4grams/
1
0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5103
n_gram_size = 3
features/tweet_char_3grams/
1
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5103
n_gram_size = 2
features/tweet_char_2grams/
1
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5103
n_gram_size = 1
features/tweet_char_1grams/
1
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5103
