## Common Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
import re
import glob

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\scott\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load the data

In [2]:
train_data = pd.read_pickle(r'C:\Users\scott\Dissertation\data_sort\pickle_files\eval_data_formatted.pickle')
eval_data = pd.read_pickle(r'C:\Users\scott\Dissertation\data_sort\pickle_files\train_data_formatted.pickle')
all_data = train_data.append(eval_data)
all_data['Tweet ID'] = all_data['Tweet ID'].astype(str) #change the ID to str to avoid potential issues during aggregation
all_data = all_data.reset_index(drop=True)

## Load Stopwords

In [3]:
stop = stopwords.words('english') #get stopwords from NLTK
keep = ['not'] #Waseem/Hovy did not use "not" as a stopword
stop = [word for word in stop if word not in keep] #Waseem/Hovy did not use "not" as a stopword

## Lowercase the data

In [4]:
all_data['Tweet_original'] = all_data.Tweet.copy() #keep a copy of the original tweet text
all_data['Tweet'] = all_data['Tweet'].str.lower() #lowercase the text 

## Remove punctuation, usernames, hashtags, URLs

In [5]:
all_data['Tweet'] = all_data['Tweet'].fillna('')
p = re.compile(r'[^\w\s]+')
all_data['Tweet'] = [p.sub('', x) for x in all_data['Tweet'].tolist()] #remove the punctuation
for i in all_data.index:
    #print(i)
    #all_data.loc[i,'Tweet'] =re.sub('[^A-Za-z0-9]+',"",all_data.loc[i,'Tweet'])
    all_data.loc[i,'Tweet'] =re.sub("@[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Tweet'])
    all_data.loc[i,'Tweet'] =re.sub("http[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Tweet'])
    all_data.loc[i,'Tweet'] =re.sub("#[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Tweet'])
print(len(np.unique(all_data['Tweet ID']))) #for convenience

6297


## Remove stopwords

In [6]:
all_data['Tweet'] = all_data['Tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) #remove stopwords
print(len(np.unique(all_data['Tweet ID']))) #for convenience

6297


## Tokenize

In [7]:
all_data['Tweet'] = all_data['Tweet'].apply(word_tokenize) #tokenize the text
print(len(np.unique(all_data['Tweet ID']))) #for convenience

6297


## Count of tokens

In [29]:
tweet_ids = []
feature = []
value = []
for i in all_data.index:
    tweet_ids = np.append(tweet_ids,all_data['Tweet ID'][i])
    feature = np.append(feature,'token_count')
    value = np.append(value,len(all_data.Tweet[i]))
    

## Count of URLs

In [32]:
all_data.Web = all_data.Web.fillna('')
tweet_ids = np.append(tweet_ids,all_data['Tweet ID'])
feature = np.append(feature,np.repeat('url_count',len(all_data.index)))
value = np.append(value,all_data.Web.str.count('http'))

## Count of Hashtags

In [34]:
all_data.Hashtags = all_data.Hashtags.fillna('')
for i in all_data.index:
    tweet_ids = np.append(tweet_ids,all_data['Tweet ID'][i])
    feature = np.append(feature,'hashtag_count')
    if (len(all_data.Hashtags[i])==0):
        value = np.append(value,all_data.Hashtags[i].count(' '))
    else:
        value = np.append(value,all_data.Hashtags[i].count(' ')+1)
    

## Count of Mentions

In [36]:
all_data.Mentions = all_data.Mentions.fillna('')
for i in all_data.index:
    tweet_ids = np.append(tweet_ids,all_data['Tweet ID'][i])
    feature = np.append(feature,'mention_count')
    if (len(all_data.Mentions[i])==0):
        value = np.append(value,all_data.Mentions[i].count(' '))
    else:
        value = np.append(value,all_data.Mentions[i].count(' ')+1)

## Count of Caps

In [38]:
for i in all_data.index:
    tweet_ids = np.append(tweet_ids,all_data['Tweet ID'][i])
    feature = np.append(feature,'caps_count')
    value = np.append(value,sum(1 for c in all_data.Tweet_original[i] if c.isupper()))

## Count of Characters

In [40]:
for i in all_data.index:
    tweet_ids = np.append(tweet_ids,all_data['Tweet ID'][i])
    feature = np.append(feature,'character_count')
    value = np.append(value,len(all_data.Tweet_original[i]))

## Count of followers

In [42]:
tweet_ids = np.append(tweet_ids,all_data['Tweet ID'])
feature = np.append(feature,np.repeat('followers_count',len(all_data.index)))
value = np.append(value,all_data['Number of Followers'])

## Count of Following

In [44]:
tweet_ids = np.append(tweet_ids,all_data['Tweet ID'])
feature = np.append(feature,np.repeat('following_count',len(all_data.index)))
value = np.append(value,all_data['Number Following'])

## Count of Posted Tweets

In [46]:
tweet_ids = np.append(tweet_ids,all_data['Tweet ID'])
feature = np.append(feature,np.repeat('posted_tweets_count',len(all_data.index)))
value = np.append(value,all_data['Number of Tweets'])

## Count of Modals

In [48]:
modals = ['can', 'could', 'may', 'might', 'must', 'will', 'would', 'should']
for i in all_data.index:
    tweet_ids = np.append(tweet_ids,all_data['Tweet ID'][i])
    feature = np.append(feature,'modals_count')
    
    count = 0
    for m in modals:
        for t in all_data.Tweet[i]:
            if m==t:
                count = count+1    
    
    value = np.append(value,count)    


## Count of tokens with non-alpha characters in the middle (masked)

In [50]:
rx = re.compile("^[a-zA-ZáéíóúüÁÉÍÓÚÜ]+[0-9]+[a-zA-ZáéíóúüÁÉÍÓÚÜ]+$") #regex for alpha - non-alpha - alpha
for i in all_data.index:
    tweet_ids = np.append(tweet_ids,all_data['Tweet ID'][i])
    feature = np.append(feature,'masked_count')
    
    count = 0
    for w in all_data.Tweet[i]:
        count = count + len(rx.findall(w))
    
    value = np.append(value,count) 

## Count of 1 character tokens

In [52]:
for i in all_data.index:
    tweet_ids = np.append(tweet_ids,all_data['Tweet ID'][i])
    feature = np.append(feature,'1char_token_count')
    
    count=0
    for t in all_data.Tweet[i]:
        if len(t)==1:
            if str.isalpha(t):
                count = count+1
    value = np.append(value,count)

## Count of quotations

In [54]:
rx = re.compile('\"(.+?)\"') #regex for double quoted text
for i in all_data.index:
    tweet_ids = np.append(tweet_ids,all_data['Tweet ID'][i])
    feature = np.append(feature,'quotes_count')
    value = np.append(value,len(rx.findall(all_data.Tweet_original[i]))) 

## Count of punctuation

In [56]:
import string
count = lambda l1, l2: len(list(filter(lambda c: c in l2, l1)))
for i in all_data.index:
    tweet_ids = np.append(tweet_ids,all_data['Tweet ID'][i])
    feature = np.append(feature,'punctuation_count')
    value = np.append(value,count(all_data.Tweet_original[i], string.punctuation)) 

In [57]:
print(len(value))
print(len(feature))
print(len(tweet_ids))

88158
88158
88158


In [62]:
df_f = pd.DataFrame({'tweet_ids':tweet_ids,
                        'feature':feature,
                        'value':value})
df_f.to_pickle('count_features/count_features.pickle')
df_f_pivot = df_f.pivot_table(index=['tweet_ids'], columns='feature', values='value').reset_index() #pivot
df_f_pivot.shape

(6297, 15)

In [63]:
df_f_pivot.iloc[20:30,:]

feature,tweet_ids,1char_token_count,caps_count,character_count,followers_count,following_count,hashtag_count,masked_count,mention_count,modals_count,posted_tweets_count,punctuation_count,quotes_count,token_count,url_count
20,1147048279135404033,0.0,3.0,52.0,1962.0,4997.0,0.0,0.0,1.0,0.0,26969.0,6.0,0.0,5.0,0.0
21,1147048385624649728,0.0,2.0,42.0,55.0,93.0,0.0,0.0,3.0,0.0,1005.0,0.0,0.0,5.0,0.0
22,1147049922304380928,0.0,22.0,262.0,11724.0,481.0,0.0,0.0,2.0,0.0,11358.0,7.0,0.0,25.0,0.0
23,1147050738964672513,0.0,16.0,256.0,1091.0,377.0,0.0,0.0,0.0,1.0,57184.0,9.0,1.0,27.0,1.0
24,1147058494010527745,0.0,8.0,141.0,483.0,487.0,0.0,0.0,0.0,2.0,5749.0,2.0,0.0,16.0,1.0
25,1147061621791100928,0.0,5.0,72.0,1866.0,1743.0,0.0,0.0,0.0,1.0,10952.0,6.0,0.0,10.0,1.0
26,1147063087163420673,0.0,3.0,114.0,1807.0,455.0,0.0,0.0,1.0,0.0,59924.0,4.0,0.0,13.0,0.0
27,1147066323761057792,0.0,7.0,91.0,2112.0,292.0,0.0,0.0,0.0,0.0,56795.0,7.0,0.0,10.0,0.0
28,1147066487552774144,0.0,6.0,195.0,630.0,1368.0,2.0,0.0,1.0,0.0,2977.0,3.0,0.0,15.0,1.0
29,1147068052124721153,0.0,9.0,215.0,94.0,439.0,0.0,0.0,2.0,0.0,757.0,8.0,0.0,20.0,0.0
