## Common imports

In [1]:
import pandas as pd
import numpy as np
import glob
from datetime import datetime

## Function for creating feature matrix

In [2]:
def ngram_matrix(folders, min_count=839 // 20,
                 count_features_path='count_features/count_features.pickle'):
    """Build a tweet-by-feature matrix from pickled n-gram tables plus count features.

    Every ``*.pickle`` file in each folder is read into a DataFrame. Each file is
    expected to have a ``tweet_id`` column and exactly two columns in total: the
    three-name rename below is positional, so the first column is taken as the
    n-gram and the second as the tweet id (assumption - confirm against the code
    that writes the pickles).

    Parameters
    ----------
    folders : iterable of str
        Folder prefixes (including the trailing separator) that are globbed
        with ``'*.pickle'``.
    min_count : int, optional
        An n-gram is kept only if its summed ``value`` over all tweets is
        strictly greater than this. The default ``839 // 20`` is the original
        hard-coded threshold; per the original comment it stands for 5% of the
        smaller class (presumably 839 instances - confirm).
    count_features_path : str, optional
        Pickle holding the count features. It must contain a ``tweet_ids``
        column and is presumably laid out as ``tweet_ids`` / ``feature`` /
        ``value`` so it can be pivoted with the n-grams - confirm.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``(features_agg_pivot, features_agg)``: the wide matrix (one row per
        tweet id, one column per feature, missing cells filled with 0) and the
        long table it was pivoted from.

    Raises
    ------
    FileNotFoundError
        If none of the folders contains a ``*.pickle`` file.
    """
    # NOTE(review): read_pickle executes arbitrary code from the file; only
    # point this at pickles produced by trusted code.
    frames = []
    for folder in folders:
        print(folder)
        # sorted() only makes the read order reproducible; the aggregation
        # below does not depend on it.
        for file in sorted(glob.glob(folder + '*.pickle')):
            df = pd.read_pickle(file)
            df['tweet_id'] = df.tweet_id.astype(str)
            frames.append(df)

    if not frames:
        raise FileNotFoundError(
            'no *.pickle files found in: ' + ', '.join(str(f) for f in folders))

    # Concatenate once instead of appending inside the loop (DataFrame.append
    # was quadratic here and no longer exists in pandas >= 2.0).
    ngrams = pd.concat(frames, ignore_index=True)
    ngrams['count'] = 1                                    # one row == one occurrence
    ngrams.columns = ['feature', 'tweet_ids', 'value']     # positional rename

    print('aggregating')
    # Occurrences of each feature within each tweet.
    features_agg = (ngrams.groupby(['tweet_ids', 'feature'])['value']
                    .count().reset_index())
    # Total occurrences of each feature across all tweets.
    totals = features_agg.groupby(['feature'])['value'].sum().reset_index()
    print(totals.shape)
    print(totals.columns)
    frequent_features = totals.loc[totals['value'] > min_count, 'feature']
    features_agg = features_agg[features_agg['feature'].isin(frequent_features)]

    count_features = pd.read_pickle(count_features_path)
    count_features['tweet_ids'] = count_features.tweet_ids.astype(str)
    # Count features bypass the frequency filter: they are added after it.
    features_agg = pd.concat([features_agg, count_features])

    print('pivoting')
    features_agg_pivot = features_agg.pivot_table(
        index=['tweet_ids'], columns='feature', values='value').reset_index()
    print('filling nans')
    features_agg_pivot = features_agg_pivot.fillna(0)      # absent feature -> 0
    print(features_agg_pivot.shape)                        # for convenience
    print(features_agg_pivot['tweet_ids'].nunique())
    return (features_agg_pivot, features_agg)

## Create ngrams matrices

In [3]:
# Folders of pickled n-gram tables, one list per text source and token type,
# each ordered from 4-grams down to 1-grams.
tweet_word_folders = [
    'tweet_word_4grams/', 'tweet_word_3grams/',
    'tweet_word_2grams/', 'tweet_word_1grams/',
]
tweet_char_folders = [
    'tweet_char_4grams/', 'tweet_char_3grams/',
    'tweet_char_2grams/', 'tweet_char_1grams/',
]
bio_word_folders = [
    'bio_word_4grams/', 'bio_word_3grams/',
    'bio_word_2grams/', 'bio_word_1grams/',
]
bio_char_folders = [
    'bio_char_4grams/', 'bio_char_3grams/',
    'bio_char_2grams/', 'bio_char_1grams/',
]

# Tweet-only folders, then tweet folders followed by the bio folders.
tweet_folders = np.concatenate([tweet_word_folders, tweet_char_folders])
tweetbio_folders = np.concatenate([tweet_folders, bio_word_folders, bio_char_folders])

# Word/char n-grams in tweets only.
tweet_ngrams_matrix, tweet_ngrams_list = ngram_matrix(tweet_folders)
# Word/char n-grams in tweets and bios.
tweetbio_ngrams_matrix, tweetbio_ngrams_list = ngram_matrix(tweetbio_folders)

tweet_word_4grams/
tweet_word_3grams/
tweet_word_2grams/
tweet_word_1grams/
tweet_char_4grams/
tweet_char_3grams/
tweet_char_2grams/
tweet_char_1grams/
aggregating
(124160, 2)
Index(['feature', 'value'], dtype='object')
pivoting
filling nans
(5104, 3102)
5104
tweet_word_4grams/
tweet_word_3grams/
tweet_word_2grams/
tweet_word_1grams/
tweet_char_4grams/
tweet_char_3grams/
tweet_char_2grams/
tweet_char_1grams/
bio_word_4grams/
bio_word_3grams/
bio_word_2grams/
bio_word_1grams/
bio_char_4grams/
bio_char_3grams/
bio_char_2grams/
bio_char_1grams/
aggregating
(229541, 2)
Index(['feature', 'value'], dtype='object')
pivoting
filling nans
(5104, 5563)
5104


## Load the training data to obtain classifications

In [4]:
# Load the formatted train and eval sets and build the tweet-id -> class lookup.
# NOTE(review): these are absolute, machine-specific paths; the notebook will not
# run elsewhere until they are made configurable.
train_data = pd.read_pickle(r'C:\Users\scott\Dissertation\data_sort\pickle_files\train_data_formatted.pickle')
train_data['Tweet ID'] = train_data['Tweet ID'].astype(str) #change the ID to str to avoid potential issues during aggregation
eval_data = pd.read_pickle(r'C:\Users\scott\Dissertation\data_sort\pickle_files\eval_data_formatted.pickle')
eval_data['Tweet ID'] = eval_data['Tweet ID'].astype(str) #change the ID to str to avoid potential issues during aggregation

# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
# Both ID columns are already str, so no further cast is needed after stacking.
all_data = pd.concat([train_data, eval_data]).reset_index(drop=True)

# Keep only the id and the label; 'class' is renamed to 'class_column' because
# 'class' might also be a unigram feature name.
tweet_class = all_data.loc[:, ['Tweet ID', 'class']].rename(columns={'class': 'class_column'})

## Function for performing Chi^2 test on all features

In [5]:
def chi2test(df):
    """Return the feature columns of ``df`` whose chi-squared p-value is below 0.05.

    ``df`` must have a ``tweet_ids`` column; it is inner-joined to the
    module-level ``tweet_class`` frame (``Tweet ID`` / ``class_column``) to get a
    label for every row. All remaining columns are treated as features and
    passed to ``sklearn.feature_selection.chi2``.

    NOTE(review): ``tweet_class`` is built from the train and eval data
    combined, so this selection sees eval labels - confirm that is intended.

    Returns the pandas Index of column names with p-value < 0.05.
    """
    from sklearn.feature_selection import chi2

    # Attach the class label to every row that has one.
    labelled = pd.merge(df, tweet_class, how='inner',
                        left_on='tweet_ids', right_on='Tweet ID')
    labels = labelled['class_column'].copy()
    # Everything except the label and the two id columns is a feature.
    feature_frame = labelled.drop(['class_column', 'Tweet ID', 'tweet_ids'], axis=1)

    started = datetime.now()
    _, p_values = chi2(feature_frame.values, labels)
    print('time to process chi 2 = ', datetime.now() - started)

    return feature_frame.columns[p_values < 0.05]

## Function for reducing features based on correlation

In [6]:
def correlation(df,coeff):
    current = datetime.now()
    print('Creating correlation matrix')
    corr = df.iloc[:,1:].corr()             #create the correlayion matrix
    x = datetime.now() - current
    print('time to process correlation matrix = ',x)

    x = datetime.now() - current
    ids = df.iloc[:,:1]                     #create a list for the ids to be added back later
    print('number of features = ',len(corr.index))#print the number of features to be analysed
    i = 1
    for m in corr.index:                          #loop through each row
        for n in corr.index:                #loop through each column
            if m!=n:                        #ignore if row equals column
                try:
                    r = corr.loc[m,n]           #get correlation value at intersection
                    if r>coeff:                   #if correlation value is greater than 0.9
                        corr = corr.drop(n, axis=1)
                except:
                    pass
        if (i//100==i/100):
            print('progress = ',str(i))
        i = i+1

    df = df[corr.columns]
    df['tweet_id'] = ids
    x = datetime.now() - current
    print('time to check all features = ',x)

    return(df)   

## Function to combine 'chi2test' and 'correlation' functions 

In [7]:
def chi2corr(df_matrix, coeff=0.9):
    """Reduce a feature matrix with ``chi2test`` followed by ``correlation``.

    Parameters
    ----------
    df_matrix : DataFrame
        Feature matrix with a ``tweet_ids`` column.
    coeff : float, optional
        Correlation threshold passed to ``correlation``; defaults to 0.9, the
        value that used to be hard-coded here.

    Returns
    -------
    DataFrame
        Whatever ``correlation`` returns for the chi-squared-selected columns.
    """
    significant = chi2test(df_matrix)                 # features with p < 0.05
    # The id column goes first because ``correlation`` treats column 0 as the id.
    selected_columns = ['tweet_ids', *significant]
    # Copy so downstream code works on an independent frame, not a slice.
    df_chi = df_matrix[selected_columns].copy()
    return correlation(df_chi, coeff)



## Perform chi^2 test and corr test and reduce features based on result

In [8]:
# Reduce the tweet-only n-gram matrix: chi^2 filter, then correlation filter.
tweet_features = chi2corr(tweet_ngrams_matrix)

time to process chi 2 =  0:00:00.167690
Creating correlation matrix
time to process correlation matrix =  0:01:27.820689
number of features =  1977
progress =  100
progress =  200
progress =  300
progress =  400
progress =  500
progress =  600
progress =  700
progress =  800
progress =  900
progress =  1000
progress =  1100
progress =  1200
progress =  1300
progress =  1400
progress =  1500
progress =  1600
progress =  1700
progress =  1800
progress =  1900
time to check all features =  0:05:02.656118


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
# Reduce the tweet+bio n-gram matrix: chi^2 filter, then correlation filter.
tweetbio_features = chi2corr(tweetbio_ngrams_matrix)

time to process chi 2 =  0:00:00.171833
Creating correlation matrix
time to process correlation matrix =  0:01:23.466773
number of features =  2760
progress =  100
progress =  200
progress =  300
progress =  400
progress =  500
progress =  600
progress =  700
progress =  800
progress =  900
progress =  1000
progress =  1100
progress =  1200
progress =  1300
progress =  1400
progress =  1500
progress =  1600
progress =  1700
progress =  1800
progress =  1900
progress =  2000
progress =  2100
progress =  2200
progress =  2300
progress =  2400
progress =  2500
progress =  2600
progress =  2700
time to check all features =  0:07:10.995862


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## TODO: add code here to add count features to the matrices

## Split features matrix into train and eval for tweet-only features

In [10]:
# Attach the class label to every tweet-only feature row, dropping the duplicate id column.
tweet_features_class = (
    tweet_features
    .merge(tweet_class, left_on='tweet_id', right_on='Tweet ID')
    .drop('Tweet ID', axis=1)
)

# Rows whose id appears in the training set.
tweet_features_train = tweet_features_class.loc[
    tweet_features_class['tweet_id'].isin(train_data['Tweet ID'])
]
tweet_features_train.to_pickle('tweet_features_train.pickle')

# Rows whose id appears in the evaluation set.
tweet_features_eval = tweet_features_class.loc[
    tweet_features_class['tweet_id'].isin(eval_data['Tweet ID'])
]
tweet_features_eval.to_pickle('tweet_features_eval.pickle')

## Split features matrix into train and eval for tweet plus bio features

In [11]:
# Attach the class label to every tweet+bio feature row, dropping the duplicate id column.
tweetbio_features_class = (
    tweetbio_features
    .merge(tweet_class, left_on='tweet_id', right_on='Tweet ID')
    .drop('Tweet ID', axis=1)
)

# Rows whose id appears in the training set.
tweetbio_features_train = tweetbio_features_class.loc[
    tweetbio_features_class['tweet_id'].isin(train_data['Tweet ID'])
]
tweetbio_features_train.to_pickle('tweetbio_features_train.pickle')

# Rows whose id appears in the evaluation set.
tweetbio_features_eval = tweetbio_features_class.loc[
    tweetbio_features_class['tweet_id'].isin(eval_data['Tweet ID'])
]
tweetbio_features_eval.to_pickle('tweetbio_features_eval.pickle')

In [12]:
# Show (rows, columns) of the eval split of the tweet+bio feature matrix.
print(tweetbio_features_eval.shape)

(1021, 1888)
