## Common imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
import re
import glob

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\scott\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load train and evaluation into one dataframe

In [2]:
train_data = pd.read_pickle(r'C:\Users\scott\Dissertation\data_sort\pickle_files\eval_data_formatted.pickle')
eval_data = pd.read_pickle(r'C:\Users\scott\Dissertation\data_sort\pickle_files\train_data_formatted.pickle')
all_data = train_data.append(eval_data)
all_data['Tweet ID'] = all_data['Tweet ID'].astype(str) #change the ID to str to avoid potential issues during aggregation
all_data = all_data.reset_index(drop=True)


## Load stopwords

In [3]:
stop = stopwords.words('english') #get stopwords from NLTK
keep = ['not'] #Waseem/Hovy did not use "not" as a stopword
stop = [word for word in stop if word not in keep] #Waseem/Hovy did not use "not" as a stopword

## Lowercase the data

In [4]:
all_data['Bio'] = all_data['Bio'].str.lower() #lowercase the text 

## Remove punctuation, usernames, hashtags, URLs

In [5]:
all_data['Bio'] = all_data['Bio'].fillna('')
p = re.compile(r'[^\w\s]+')
all_data['Bio'] = [p.sub('', x) for x in all_data['Bio'].tolist()] #remove the punctuation
for i in all_data.index:
    #print(i)
    #all_data.loc[i,'Bio'] =re.sub('[^A-Za-z0-9]+',"",all_data.loc[i,'Bio'])
    all_data.loc[i,'Bio'] =re.sub("@[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Bio'])
    all_data.loc[i,'Bio'] =re.sub("http[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Bio'])
    all_data.loc[i,'Bio'] =re.sub("#[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Bio'])
print(len(np.unique(all_data['Tweet ID']))) #for convenience

6297


In [6]:
all_data.Bio

0                                                        
1       official twitter page of the press and journal...
2                         ens volem viveslliures i rebels
3       up above the streets and houses rainbow climbi...
4       conscript all remainers into the eu army engli...
5       edinburgh ward 8 councillor working hard to ma...
6       its unrealistic to expect me to act like an ad...
7       the best part of me ended up a dirty stain on ...
8                                                la  wwfc
9        cofounder of macrebur  the plastic roads company
10      scotlands national newspaper see also scotsman...
11                                                       
12      proud to be english and a yorkshireman a belie...
13                                                    meh
14      a veteran of societey a believer in ethics and...
15                      freethinking 70 year old hedonist
16      protomaker 3d printer   political tweets 3d pr...
17            

## Remove stopwords

In [7]:
all_data['Bio'] = all_data['Bio'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) #remove stopwords
print(len(np.unique(all_data['Tweet ID']))) #for convenience

6297


## Tokenize

In [8]:
all_data['Bio'] = all_data['Bio'].apply(word_tokenize) #tokenize the text
print(len(np.unique(all_data['Tweet ID']))) #for convenience

6297


## Function for finding word ngrams

In [30]:
def create_ngrams(prog, folder, n_gram_size,file):

    tweet_ids = [] #initialise list of tweet ids
    n_grams = [] #initialise list of n_grams



    progress_dumps = prog #memory becomes a problem with this code.  Set a value n whereby the processed lists of tweets will be saved
                         #to disk every n tweets and memory will be cleared***
    location = folder + '*.pickle'
    print(folder)
    x = glob.glob(folder)
    print(len(x))
    if len(x)<2:
        for i in all_data.index:

            tweet_id = all_data['Tweet ID'][i]                                  #get the tweet id
            tokens = all_data.Bio[i]                                          #get the associated tokens
            n=n_gram_size                                                   #set the n_gram length***
            if len(all_data.Bio[i])==0:
                n_grams = np.append(n_grams,'zzzz')                                 #if there are no tokens add "zzzz" for this tweet
                tweet_ids = np.append(tweet_ids,tweet_id)
            else:
                for c in range(0,len(all_data.Bio[i])-n+1):
                    n_gram = ' '.join(all_data.Bio[i][c:c+n])
                    n_grams = np.append(n_grams,n_gram)               #add n_gram to list
                    tweet_ids = np.append(tweet_ids,tweet_id) 

            if (i//progress_dumps==i/progress_dumps):                       #save progress and clear memory
                filename = folder + 'progress_' + str(i)+'.pickle'          #set folder and filename***
                df = pd.DataFrame({'tweet_id':tweet_ids,
                                   'n_gram':n_grams})                       #create dataframe
                df.to_pickle(filename)                                      #save
                del df                                                      #clear memory
                tweet_ids = []                                              #reinitialise list of tweet ids
                n_grams = []                                                #reinitialise list of n_grams
                print(i)                                                    #print progress

            if (i==max(all_data.index)):                                          #as above but for the last chunk of tweets
                filename = folder + 'progress_' + str(i)+'.pickle'
                df = pd.DataFrame({'tweet_id':tweet_ids,
                                   'n_gram':n_grams})
                df.to_pickle(filename)
                del df
                tweet_ids = []
                n_grams = [] 
                print(i)
    else:
        print("Files already exist in this folder.")

## Find and save char ngrams

In [31]:
progs = [500,500,500,500]
folders = ['bio_word_4grams/','bio_word_3grams/','bio_word_2grams/','bio_word_1grams/']
n_gram_sizes = [4,3,2,1]

for prog, folder, n_gram_size in zip(progs, folders, n_gram_sizes):
    print('n_gram_size = ' + str(n_gram_size))
    create_ngrams(prog, folder, n_gram_size, all_data)

n_gram_size = 4
bio_word_4grams/
1
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6296
n_gram_size = 3
bio_word_3grams/
1
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6296
n_gram_size = 2
bio_word_2grams/
1
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6296
n_gram_size = 1
bio_word_1grams/
1
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6296


In [34]:
x = pd.read_pickle('bio_word_2grams/progress_2500.pickle')
print(x.head(20))

             n_gram             tweet_id
0         im trevor  1150454531282690048
1     trevor kingof  1150454531282690048
2     kingof winner  1150454531282690048
3      winner maker  1150454531282690048
4      maker things  1150454531282690048
5       things make  1150454531282690048
6       make people  1150454531282690048
7      people happy  1150454531282690048
8         happy dms  1150454531282690048
9      dms messages  1150454531282690048
10  messages always  1150454531282690048
11   always welcome  1150454531282690048
12     welcome line  1150454531282690048
13          line id  1150454531282690048
14  id bloozsingahh  1150454531282690048
15        radio tay  1153658341295632384
16         tay news  1153658341295632384
17    news provides  1153658341295632384
18    provides news  1153658341295632384
19     news matters  1153658341295632384
