## Common imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')
from nltk.corpus import stopwords
import re
import glob

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\scott\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load train and evaluation into one dataframe

In [3]:
train_data = pd.read_pickle('pickle_files/eval_data_formatted.pickle')
eval_data = pd.read_pickle('pickle_files/train_data_formatted.pickle')
all_data = train_data.append(eval_data)
all_data['Tweet ID'] = all_data['Tweet ID'].astype(str) #change the ID to str to avoid potential issues during aggregation
all_data = all_data.reset_index(drop=True)


## Load stopwords

In [4]:
stop = stopwords.words('english') #get stopwords from NLTK
keep = ['not'] #Waseem/Hovy did not use "not" as a stopword
stop = [word for word in stop if word not in keep] #Waseem/Hovy did not use "not" as a stopword

## Lowercase the data

In [5]:
all_data['Bio'] = all_data['Bio'].str.lower() #lowercase the text 

## Remove punctuation, usernames, hashtags, URLs

In [6]:
all_data['Bio'] = all_data['Bio'].fillna('')
p = re.compile(r'[^\w\s]+')
all_data['Bio'] = [p.sub('', x) for x in all_data['Bio'].tolist()] #remove the punctuation
for i in all_data.index:
    #print(i)
    #all_data.loc[i,'Bio'] =re.sub('[^A-Za-z0-9]+',"",all_data.loc[i,'Bio'])
    all_data.loc[i,'Bio'] =re.sub("@[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Bio'])
    all_data.loc[i,'Bio'] =re.sub("http[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Bio'])
    all_data.loc[i,'Bio'] =re.sub("#[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Bio'])
print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


In [7]:
all_data.Bio

0       here there and everywhere\n\nrtw   doesnt nece...
1       people may forget what you said they may forge...
2                          official account uk today news
3       an economist interested in politics technology...
4                                                        
5                                                     tbc
6       cms contentmanagement smallbiz entrepreneur bu...
7                                                        
8       eu citizen  lets rid the world of the butchers...
9       learn to listen then listen to learn\nstudying...
10      old fashioned soul who simply wants the best f...
11                                      straight to video
12      rangersfc advocate of animal welfare loathe th...
13                                  singlecell paramecium
14      bringing the news unbiased from scotland for a...
15      born under a union jack  if you like random ou...
16                                        scottishbritish
17      ex rn 

## Remove stopwords

In [8]:
all_data['Bio'] = all_data['Bio'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) #remove stopwords
print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


## Tokenize

In [9]:
all_data['Bio'] = all_data['Bio'].apply(word_tokenize) #tokenize the text
print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


## Stem the tokens

In [10]:
ps = PorterStemmer()

for i in all_data.index:
    for w in range(0,len(all_data.Bio[i])):
        all_data.Bio[i][w] = ps.stem(all_data.Bio[i][w])

print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


## reconstruct from tokens

In [11]:
for i in all_data.index:
    all_data.Bio[i] = ' '.join(all_data.Bio[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Function for finding char ngrams

In [12]:
def create_ngrams(prog, folder, n_gram_size,file):
    
    n = n_gram_size
    tweet_ids = [] #initialise list of tweet ids
    n_grams = [] #initialise list of n_grams



    progress_dumps = prog #memory becomes a problem with this code.  Set a value n whereby the processed lists of tweets will be saved
                         #to disk every n tweets and memory will be cleared***
    location = folder + '*.pickle'
    print(folder)
    x = glob.glob(folder)
    print(len(x))
    if len(x)<2:
        for i in all_data.index:

            tweet_id = all_data['Tweet ID'][i]
            all_data.Bio[i] = all_data.Bio[i].replace(" ", "_")            #replace whitespace with underscore
            if len(all_data.Bio[i])==0:
                n_grams = np.append(n_grams,'zzzz')                                 #if there are no tokens add "zzzz" for this tweet
                tweet_ids = np.append(tweet_ids,tweet_id)
            else:
                for c in range(0,len(all_data.Bio[i])-n+1):                           #iterate along the length of each token
                    n_grams = np.append(n_grams,all_data.Bio[i][c:c+n])               #add n_gram to list
                    tweet_ids = np.append(tweet_ids,tweet_id)
            if (i//progress_dumps==i/progress_dumps):                       #save progress and clear memory
                filename = folder + 'progress_' + str(i)+'.pickle'          #set folder and filename***
                df = pd.DataFrame({'tweet_id':tweet_ids,
                                   'n_gram':n_grams})                       #create dataframe
                df.to_pickle(filename)                                      #save
                del df                                                      #clear memory
                tweet_ids = []                                              #reinitialise list of tweet ids
                n_grams = []                                                #reinitialise list of n_grams
                print(i)                                                    #print progress

            if (i==max(all_data.index)):                                          #as above but for the last chunk of tweets
                filename = folder + 'progress_' + str(i)+'.pickle'
                df = pd.DataFrame({'tweet_id':tweet_ids,
                                   'n_gram':n_grams})
                df.to_pickle(filename)
                del df
                tweet_ids = []
                n_grams = [] 
                print(i)
    else:
        print("Files already exist in this folder.")

## Find and save char ngrams

In [13]:
progs = [500,500,500,500]
folders = ['features/bio_char_4grams/','features/bio_char_3grams/','features/bio_char_2grams/','features/bio_char_1grams/']
n_gram_sizes = [4,3,2,1]

for prog, folder, n_gram_size in zip(progs, folders, n_gram_sizes):
    print('n_gram_size = ' + str(n_gram_size))
    create_ngrams(prog, folder, n_gram_size, all_data)

n_gram_size = 4
features/bio_char_4grams/
1
0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5103
n_gram_size = 3
features/bio_char_3grams/
1
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5103
n_gram_size = 2
features/bio_char_2grams/
1
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5103
n_gram_size = 1
features/bio_char_1grams/
1
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5103


In [14]:
x = pd.read_pickle('features/bio_char_4grams/progress_2500.pickle')
print(x.head(20))

   n_gram             tweet_id
0    spor  1148184591532011526
1    port  1148184591532011526
2    orts  1148184591532011526
3    rtsw  1148184591532011526
4    tswr  1148184591532011526
5    swri  1148184591532011526
6    writ  1148184591532011526
7    rit_  1148184591532011526
8    it_b  1148184591532011526
9    t_br  1148184591532011526
10   _bro  1148184591532011526
11   broa  1148184591532011526
12   road  1148184591532011526
13   oadc  1148184591532011526
14   adca  1148184591532011526
15   dcas  1148184591532011526
16   cast  1148184591532011526
17   ast_  1148184591532011526
18   st_s  1148184591532011526
19   t_st  1148184591532011526


In [15]:
x.shape[0]

26491