## Common imports

In [1]:
import pandas as pd
import numpy as np
import glob
from datetime import datetime

## Function for creating feature matrix

In [2]:
def ngram_matrix(folders):
    i=1 #counter
    for folder in folders:
        print(folder)
        location = 'features/' + folder + '*.pickle'
        for file in glob.glob(location):                         #loop through each saved file
            #print(file)
            df = pd.read_pickle(file)                                        #read each file into a dataframe
            df['tweet_id'] = df.tweet_id.astype(str)
            if (i==1):                                                       #if we're on the first iteration
                ngrams = df
            else:
                ngrams = ngrams.append(df)                             #append if we're not on the first iiteration
            i = i+1

    ngrams['count'] = 1                                               #add a filed to count occurences
    ngrams.columns = ['feature','tweet_ids','value']                  #change the column names

    print('aggregating')
    features_agg = ngrams.groupby(['tweet_ids', 'feature'])['value'].count().reset_index() #aggregate the full dataframe
    x = features_agg
    x = x.groupby(['feature'])['value'].sum().reset_index()
    print(x.shape)
    print(x.columns)
    x = x[x['value']>839//20]                                    #term must exist in at least 5% of the smaller class
    frequent_features = np.unique(x.feature)
    features_agg = features_agg[features_agg.feature.isin(frequent_features)]

    print('pivoting')
    features_agg_pivot = features_agg.pivot_table(index=['tweet_ids'], columns='feature', values='value').reset_index() #pivot
    print('filling nans')
    features_agg_pivot.fillna(0, inplace=True)                                #replace nans with zeros
    print(features_agg_pivot.shape) #for convenience
    print(len(np.unique(features_agg_pivot.tweet_ids)))
    return(features_agg_pivot,features_agg)

## Create ngrams matrices

In [3]:
tweet_word_folders = ['tweet_word_4grams/',
           'tweet_word_3grams/',
           'tweet_word_2grams/',
           'tweet_word_1grams/']

tweet_char_folders = ['tweet_char_4grams/',
           'tweet_char_3grams/',
           'tweet_char_2grams/',
           'tweet_char_1grams/']

bio_word_folders = ['bio_word_4grams/',
           'bio_word_3grams/',
           'bio_word_2grams/',
           'bio_word_1grams/']

bio_char_folders = ['bio_char_4grams/',
           'bio_char_3grams/',
           'bio_char_2grams/',
           'bio_char_1grams/']

tweetbio_word_folders = np.append(tweet_word_folders,bio_word_folders)
tweetbio_char_folders = np.append(tweet_char_folders,bio_char_folders)

tweet_w_ngrams_matrix, tweet_w_ngrams_list = ngram_matrix(tweet_word_folders) #word ngrams in tweets
tweet_c_ngrams_matrix, tweet_c_ngrams_list = ngram_matrix(tweet_char_folders) #char ngrams in tweets
tweetbio_w_ngrams_matrix, tweetbio_w_ngrams_list = ngram_matrix(tweetbio_word_folders) #word ngrams in bios
tweetbio_c_ngrams_matrix, tweetbio_c_ngrams_list = ngram_matrix(tweetbio_char_folders) #char ngrams in bios

tweet_word_4grams/
tweet_word_3grams/
tweet_word_2grams/
tweet_word_1grams/
aggregating
(95212, 2)
Index(['feature', 'value'], dtype='object')
pivoting
filling nans
(4991, 221)
4991
tweet_char_4grams/
tweet_char_3grams/
tweet_char_2grams/
tweet_char_1grams/
aggregating
(30736, 2)
Index(['feature', 'value'], dtype='object')
pivoting
filling nans
(5097, 2886)
5097
tweet_word_4grams/
tweet_word_3grams/
tweet_word_2grams/
tweet_word_1grams/
bio_word_4grams/
bio_word_3grams/
bio_word_2grams/
bio_word_1grams/
aggregating
(176015, 2)
Index(['feature', 'value'], dtype='object')
pivoting
filling nans
(5077, 414)
5077
tweet_char_4grams/
tweet_char_3grams/
tweet_char_2grams/
tweet_char_1grams/
bio_char_4grams/
bio_char_3grams/
bio_char_2grams/
bio_char_1grams/
aggregating
(56674, 2)
Index(['feature', 'value'], dtype='object')
pivoting
filling nans
(5104, 5205)
5104


## Load the training data to obtain classifications

In [4]:
train_data = pd.read_pickle('pickle_files/train_data_formatted.pickle')
train_data['Tweet ID'] = train_data['Tweet ID'].astype(str) #change the ID to str to avoid potential issues during aggregation
eval_data = pd.read_pickle('pickle_files\eval_data_formatted.pickle')
eval_data['Tweet ID'] = eval_data['Tweet ID'].astype(str) #change the ID to str to avoid potential issues during aggregation
all_data = train_data.append(eval_data)
all_data['Tweet ID'] = all_data['Tweet ID'].astype(str) #change the ID to str to avoid potential issues during aggregation
all_data = all_data.reset_index(drop=True)
tweet_class = all_data.loc[:,['Tweet ID','class']] #get the id and the class column
tweet_class.columns = ['Tweet ID','class_column'] #change name of 'class' to 'class_column' ('class' might be a unigram)

## Function for performing Chi^2 test on all features

In [5]:
def chi2test(df):    
    df_class = pd.merge(df,tweet_class, left_on='tweet_ids',right_on='Tweet ID', how='inner') #get the class for instances in df
    Y = df_class.loc[:,'class_column'].copy()                                                 #create list of classes
    df_class = df_class.drop(['class_column','Tweet ID','tweet_ids'], axis=1)                 #drop class and id features from df
    X = df_class.values                                                                       #create matrix of features

    from sklearn.feature_selection import chi2
    current = datetime.now()
    x = chi2(X,Y)                                             #perfrom chi2 test
    time_taken = datetime.now() - current
    print('time to process chi 2 = ',time_taken)
    
    important_cols = df_class.columns[x[1]<0.05]              #create list of features where p-value < 0.05
    
    return(important_cols)

## Function for reducing features based on correlation

In [6]:
def correlation(df,coeff):
    current = datetime.now()
    print('Creating correlation matrix')
    corr = df.iloc[:,1:].corr()             #create the correlayion matrix
    x = datetime.now() - current
    print('time to process correlation matrix = ',x)

    x = datetime.now() - current
    ids = df.iloc[:,:1]                     #create a list for the ids to be added back later
    print('number of features = ',len(corr.index))#print the number of features to be analysed
    i = 1
    for m in corr.index:                          #loop through each row
        for n in corr.index:                #loop through each column
            if m!=n:                        #ignore if row equals column
                try:
                    r = corr.loc[m,n]           #get correlation value at intersection
                    if r>coeff:                   #if correlation value is greater than 0.9
                        corr = corr.drop(n, axis=1)
                except:
                    pass
        if (i//100==i/100):
            print('progress = ',str(i))
        i = i+1

    df = df[corr.columns]
    df['tweet_id'] = ids
    x = datetime.now() - current
    print('time to check all features = ',x)

    return(df)   

## Function to combine 'chi2test' and 'correlation' functions 

In [7]:
def chi2corr(df_matrix):
    df_features = chi2test(df_matrix)                                               #get list of features where p<0.05
    df_features = np.append('tweet_ids',df_features)                                #append tweet_id to feature list
    df_chi = df_matrix[df_features]                                                 #reduce features using chi2test output
    df_chi_corr = correlation(df_chi,0.9)                                           #reduce features based on correlation
    return(df_chi_corr)



## Perform chi^2 test and corr test and reduce features based on result

In [8]:
tweet_w_features = chi2corr(tweet_w_ngrams_matrix)

time to process chi 2 =  0:00:00.171837
Creating correlation matrix
time to process correlation matrix =  0:00:00.349690
number of features =  168
progress =  100
time to check all features =  0:00:01.626593


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
tweet_c_features = chi2corr(tweet_c_ngrams_matrix)

time to process chi 2 =  0:00:00.062489
Creating correlation matrix
time to process correlation matrix =  0:00:37.867058
number of features =  1817
progress =  100


In [None]:
tweetbio_w_features = chi2corr(tweetbio_w_ngrams_matrix)

In [11]:
tweetbio_c_features = chi2corr(tweetbio_c_ngrams_matrix)

time to process chi 2 =  0:00:00.150595
Creating correlation matrix
time to process correlation matrix =  0:01:10.571408
number of features =  2532
progress =  100
progress =  200
progress =  300
progress =  400
progress =  500
progress =  600
progress =  700
progress =  800
progress =  900
progress =  1000
progress =  1100
progress =  1200
progress =  1300
progress =  1400
progress =  1500
progress =  1600
progress =  1700
progress =  1800
progress =  1900
progress =  2000
progress =  2100
progress =  2200
progress =  2300
progress =  2400
progress =  2500
time to check all features =  0:05:50.718320


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [12]:
#shapes
print(tweet_w_features.shape)
print(tweet_c_features.shape)
print(tweetbio_w_features.shape)
print(tweetbio_c_features.shape)

(4991, 124)
(5097, 1156)
(5077, 193)
(5104, 1804)


## Function for calculating feature term frequencies

In [13]:
def termfrequency(df):
    df_tf = df.copy()
    df_tf['sum_of_counts'] = df_tf.sum(axis=1) #get sum of counts across axis 0
    for c in df_tf.columns: #loop through each column
        if c != 'tweet_id': #skip the tweet id column
            df_tf[c] = df_tf[c]/df_tf['sum_of_counts'] #divide column value by the sum of row value
    df_tf = df_tf.drop('sum_of_counts', axis=1) #drop the sum
    df_tf = df_tf.fillna(0)
    
    return(df_tf)

## Get the feature term frequencies

In [14]:
tweet_w_features_tf = termfrequency(tweet_w_features)
tweet_c_features_tf = termfrequency(tweet_c_features)
tweetbio_w_features_tf = termfrequency(tweetbio_w_features)
tweetbio_c_features_tf = termfrequency(tweetbio_c_features)


In [15]:
#shapes
print(tweet_w_features_tf.shape)
print(tweet_c_features_tf.shape)
print(tweetbio_w_features_tf.shape)
print(tweetbio_c_features_tf.shape)

(4991, 124)
(5097, 1156)
(5077, 193)
(5104, 1804)


## Function for converting tf to tf-idf

In [16]:
def idf(df_tf):    
    df_tfidf = df_tf.copy()
    for c in df_tfidf.columns: #loop through each column
        if c != 'tweet_id': #skip the tweet id column
            df_tfidf[c] = df_tfidf[c]*(np.log(df_tfidf.shape[0]/(df_tfidf[c][df_tfidf[c]>0].count()))+1)
    return(df_tfidf)

In [17]:
tweet_w_features_tfidf = idf(tweet_w_features_tf)
tweet_c_features_tfidf = idf(tweet_c_features_tf)
tweetbio_w_features_tfidf = idf(tweetbio_w_features_tf)
tweetbio_c_features_tfidf = idf(tweetbio_c_features_tf)

In [18]:
#shapes
print(tweet_w_features_tfidf.shape)
print(tweet_c_features_tfidf.shape)
print(tweetbio_w_features_tfidf.shape)
print(tweetbio_c_features_tfidf.shape)

(4991, 124)
(5097, 1156)
(5077, 193)
(5104, 1804)


## Count features

In [19]:
count_features = pd.read_pickle('features/count_features/count_features.pickle') #get the count features
count_features['tweet_ids'] = count_features.tweet_ids.astype(str)

#features_agg = features_agg.append(count_features)                          #append count features on to ngrams

print('pivoting')
count_features_pivot = count_features.pivot_table(index=['tweet_ids'], columns='feature', values='value').reset_index() #pivot
print('filling nans')
count_features_pivot.fillna(0, inplace=True)                                #replace nans with zeros
print(count_features_pivot.shape) #for convenience
print(len(np.unique(count_features_pivot.tweet_ids)))

print(count_features_pivot.columns)
count_features = chi2corr(count_features_pivot) #select important features
print(count_features.columns)


pivoting
filling nans
(5104, 16)
5104
Index(['tweet_ids', '1char_token_count', 'ave_chars_token', 'caps_count',
       'character_count', 'followers_count', 'following_count',
       'hashtag_count', 'masked_count', 'mention_count', 'modals_count',
       'posted_tweets_count', 'punctuation_count', 'quotes_count',
       'token_count', 'url_count'],
      dtype='object', name='feature')
time to process chi 2 =  0:00:00.003987
Creating correlation matrix
time to process correlation matrix =  0:00:00.001995
number of features =  11
time to check all features =  0:00:00.049867
Index(['ave_chars_token', 'caps_count', 'followers_count', 'following_count',
       'mention_count', 'posted_tweets_count', 'punctuation_count',
       'quotes_count', 'url_count', 'tweet_id'],
      dtype='object', name='feature')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [20]:
df_tweet = pd.merge(count_features, tweet_w_features, on='tweet_id', how='left').fillna(0)
df_tweet = pd.merge(df_tweet, tweet_c_features, on='tweet_id', how='left').fillna(0)

df_tweet_tf = pd.merge(count_features, tweet_w_features_tf, on='tweet_id', how='left').fillna(0)
df_tweet_tf = pd.merge(df_tweet_tf, tweet_c_features_tf, on='tweet_id', how='left').fillna(0)

df_tweet_tfidf = pd.merge(count_features, tweet_w_features_tfidf, on='tweet_id', how='left').fillna(0)
df_tweet_tfidf = pd.merge(df_tweet_tfidf, tweet_c_features_tfidf, on='tweet_id', how='left').fillna(0)

df_tweetbio = pd.merge(count_features, tweetbio_w_features, on='tweet_id', how='left').fillna(0)
df_tweetbio = pd.merge(df_tweetbio, tweetbio_c_features, on='tweet_id', how='left').fillna(0)

df_tweetbio_tf = pd.merge(count_features, tweetbio_w_features_tf, on='tweet_id', how='left').fillna(0)
df_tweetbio_tf = pd.merge(df_tweetbio_tf, tweetbio_c_features_tf, on='tweet_id', how='left').fillna(0)

df_tweetbio_tfidf = pd.merge(count_features, tweetbio_w_features_tfidf, on='tweet_id', how='left').fillna(0)
df_tweetbio_tfidf = pd.merge(df_tweetbio_tfidf, tweetbio_c_features_tfidf, on='tweet_id', how='left').fillna(0)

print(df_tweet.shape)
print(df_tweet_tf.shape)
print(df_tweet_tfidf.shape)
print(df_tweetbio.shape)
print(df_tweetbio_tf.shape)
print(df_tweetbio_tfidf.shape)

(5104, 1288)
(5104, 1288)
(5104, 1288)
(5104, 2005)
(5104, 2005)
(5104, 2005)


## function to split features matrix into train and eval for tweet-only features

In [21]:
def split(df):
    c_f = np.delete(count_features.columns, np.where(count_features.columns=='tweet_id')) #get the count features column names
    features = df.copy()
    tweet_features_class = pd.merge(features,tweet_class,left_on='tweet_id',right_on='Tweet ID')
    tweet_features_class = tweet_features_class.drop('Tweet ID', axis=1)
    tweet_features_train = tweet_features_class[tweet_features_class.tweet_id.isin(train_data['Tweet ID'])]
    tweet_features_eval = tweet_features_class[tweet_features_class.tweet_id.isin(eval_data['Tweet ID'])]
    tweet_features_train_nc = tweet_features_train.drop(c_f,axis=1)
    tweet_features_eval_nc = tweet_features_eval.drop(c_f,axis=1)
    return(tweet_features_train,tweet_features_eval,tweet_features_train_nc,tweet_features_eval_nc)


In [22]:
df_tweet_train, df_tweet_eval, df_tweet_train_nc, df_tweet_eval_nc= split(count_features)
df_tweet_train.to_pickle('features/df_tweet_count_features_train.pickle')
df_tweet_eval.to_pickle('features/df_tweet_count_features_eval.pickle')

df_tweet_train, df_tweet_eval, df_tweet_train_nc, df_tweet_eval_nc = split(df_tweet)
df_tweet_train.to_pickle('features/df_tweet_train.pickle')
df_tweet_eval.to_pickle('features/df_tweet_eval.pickle')
df_tweet_train_nc.to_pickle('features/df_tweet_train_nc.pickle')
df_tweet_eval_nc.to_pickle('features/df_tweet_eval_nc.pickle')

df_tweet_tf_train, df_tweet_tf_eval, df_tweet_tf_train_nc, df_tweet_tf_eval_nc = split(df_tweet_tf)
df_tweet_tf_train.to_pickle('features/df_tweet_tf_train.pickle')
df_tweet_tf_eval.to_pickle('features/df_tweet_tf_eval.pickle')
df_tweet_tf_train_nc.to_pickle('features/df_tweet_tf_train_nc.pickle')
df_tweet_tf_eval_nc.to_pickle('features/df_tweet_tf_eval_nc.pickle')

df_tweet_tfidf_train, df_tweet_tfidf_eval, df_tweet_tfidf_train_nc, df_tweet_tfidf_eval_nc = split(df_tweet_tfidf)
df_tweet_tfidf_train.to_pickle('features/df_tweet_tfidf_train.pickle')
df_tweet_tfidf_eval.to_pickle('features/df_tweet_tfidf_eval.pickle')
df_tweet_tfidf_train_nc.to_pickle('features/df_tweet_tfidf_train_nc.pickle')
df_tweet_tfidf_eval_nc.to_pickle('features/df_tweet_tfidf_eval_nc.pickle')

df_tweetbio_train, df_tweetbio_eval, df_tweetbio_train_nc, df_tweetbio_eval_nc = split(df_tweetbio)
df_tweetbio_train.to_pickle('features/df_tweetbio_train.pickle')
df_tweetbio_eval.to_pickle('features/df_tweetbio_eval.pickle')
df_tweetbio_train_nc.to_pickle('features/df_tweetbio_train_nc.pickle')
df_tweetbio_eval_nc.to_pickle('features/df_tweetbio_eval_nc.pickle')

df_tweetbio_tf_train, df_tweetbio_tf_eval, df_tweetbio_tf_train_nc, df_tweetbio_tf_eval_nc = split(df_tweetbio_tf)
df_tweetbio_tf_train.to_pickle('features/df_tweetbio_tf_train.pickle')
df_tweetbio_tf_eval.to_pickle('features/df_tweetbio_tf_eval.pickle')
df_tweetbio_tf_train_nc.to_pickle('features/df_tweetbio_tf_train_nc.pickle')
df_tweetbio_tf_eval_nc.to_pickle('features/df_tweetbio_tf_eval_nc.pickle')

df_tweetbio_tfidf_train, df_tweetbio_tfidf_eval, df_tweetbio_tfidf_train_nc, df_tweetbio_tfidf_eval_nc = split(df_tweetbio_tfidf)
df_tweetbio_tfidf_train.to_pickle('features/df_tweetbio_tfidf_train.pickle')
df_tweetbio_tfidf_eval.to_pickle('features/df_tweetbio_tfidf_eval.pickle')
df_tweetbio_tfidf_train_nc.to_pickle('features/df_tweetbio_tfidf_train_nc.pickle')
df_tweetbio_tfidf_eval_nc.to_pickle('features/df_tweetbio_tfidf_eval_nc.pickle')