In [26]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.stats import chisquare, pearsonr, ttest_ind

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, losses, optimizers, metrics

# Preprocessing

In [4]:
df = pd.read_csv('datasets/TwitCID/firstweek_features_binary.csv').drop('Unnamed: 0', axis=1)

In [5]:
df.head()

Unnamed: 0,Has_location,Has_username,Followers,Followees,Age_account,Total_tweets,Favourite,Groups,Has_Image,Aver_favourite,...,Has_tvshow,Posted_noon,Posted_weeke,Posted_eve,Has_excl,Num_hashtag,Opt_len,Has_suggest,Has_video,@@class@@
0,0,0,39113.0,10.0,421.0,1221.0,145.0,1489.0,1,0.34,...,0,0,1,0,0,0.0,0,0,0,1
1,0,0,663748.0,28.0,2025.0,4658.0,3.0,631.0,0,0.0,...,0,0,1,0,0,0.0,0,0,0,1
2,1,0,1836.0,597.0,242.0,272.0,0.0,3.0,0,0.0,...,0,0,1,1,0,0.0,0,0,0,1
3,0,0,3494.0,4617.0,226.0,1548.0,711.0,29.0,0,3.15,...,0,0,0,0,0,0.0,0,0,0,1
4,0,0,523.0,714.0,375.0,5715.0,3201.0,1.0,0,8.54,...,0,0,1,0,1,0.0,0,0,0,1


In [13]:
# Functions
def calc_ttest(df, pred, resp, values):
    assert len(values) == 2
    
    subset1 = df[df[resp] == values[0]]
    subset2 = df[df[resp] == values[1]]
    
    return ttest_ind(subset1[pred], subset2[pred], equal_var=False)

def filter_significant_t(df, cls, alpha):
    col_list = []
    columns = list(df.columns)
    columns.remove(cls)
    
    for col in columns:
        result = calc_ttest(df, col, cls, [0, 1])
        
        if result.pvalue <= alpha:
            col_list.append(col)
    
    return df[col_list]

def calc_chisquare(df, pred, resp, value, silent=False):
    subset = df[df[resp] == value]
    true_unique, true_counts = np.unique(df[pred], return_counts=True)
    
    unique, counts = np.unique(subset[pred], return_counts=True)
    sample_counts = np.zeros(len(true_counts))
    
    for attr, count in zip(unique, counts):
        pos = np.where(true_unique == attr)[0][0]
        sample_counts[pos] = count
        
    scaled_counts = len(df)*sample_counts/len(subset)
    if not silent:
        print(scaled_counts)
        print(true_counts)
    return chisquare(scaled_counts, true_counts)

def filter_significant_x2(df, cls, alpha):
    col_list = []
    columns = list(df.columns)
    columns.remove(cls)
#     print(columns)
    
    for col in columns:
        result0 = calc_chisquare(df, col, cls, 0, silent=True)
        result1 = calc_chisquare(df, col, cls, 1, silent=True)
        
        if result0.pvalue <= alpha or result1.pvalue <= alpha:
            col_list.append(col)
    
    return df[col_list]

def numerify(x):
    if x == 'positive':
        return 1
    if x == 'negative':
        return -1
    else:
        return 0

In [14]:
continuous_cols = ['Followers',
                 'Followees',
                 'Age_account',
                 'Total_tweets',
                 'Favourite', 
                 'Groups', 
                 'Aver_favourite', 
                 'Length_tweet',
                 'Aver_tweets',
                 'Name_length',
                 '@@class@@']

continuous = df[continuous_cols]
continuous_cols.remove('@@class@@')
discrete = df.drop(continuous_cols, axis=1)

In [15]:
significant_continuous = filter_significant_t(continuous, '@@class@@', 0.05)

In [16]:
significant_discrete = filter_significant_x2(discrete, '@@class@@', 0.05)

In [17]:
significant_discrete

Unnamed: 0,Has_location,Has_username,Has_Image,Contain_URL,Sent_level,Has_Uword,Posted_holiday,Has_number,Has_rt,Has_org,Has_tvshow,Posted_noon,Posted_weeke,Posted_eve,Has_excl,Num_hashtag,Opt_len,Has_suggest,Has_video
0,0,0,1,1,negative,0,1,0,0,0,0,0,1,0,0,0.0,0,0,0
1,0,0,0,0,positive,0,1,1,0,0,0,0,1,0,0,0.0,0,0,0
2,1,0,0,1,negative,0,1,0,0,0,0,0,1,1,0,0.0,0,0,0
3,0,0,0,1,negative,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0
4,0,0,0,0,negative,0,1,0,0,0,0,0,1,0,1,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6043131,0,0,1,1,positive,0,0,0,0,0,0,0,1,0,1,0.0,0,0,0
6043132,0,0,0,0,positive,0,0,1,0,0,0,0,1,0,1,0.0,1,0,0
6043133,0,0,0,0,negative,0,0,0,0,0,0,0,1,0,0,0.0,0,0,0
6043134,0,0,0,1,negative,0,0,0,0,0,0,0,1,0,0,0.0,0,0,0


In [18]:
significant_continuous

Unnamed: 0,Followers,Followees,Age_account,Total_tweets,Favourite,Groups,Aver_favourite,Length_tweet,Aver_tweets,Name_length
0,39113.0,10.0,421.0,1221.0,145.0,1489.0,0.34,111.0,2.90,11.0
1,663748.0,28.0,2025.0,4658.0,3.0,631.0,0.00,45.0,2.30,16.0
2,1836.0,597.0,242.0,272.0,0.0,3.0,0.00,156.0,1.12,6.0
3,3494.0,4617.0,226.0,1548.0,711.0,29.0,3.15,74.0,6.85,15.0
4,523.0,714.0,375.0,5715.0,3201.0,1.0,8.54,42.0,15.24,8.0
...,...,...,...,...,...,...,...,...,...,...
6043131,2.0,796.0,8.0,23.0,9.0,0.0,1.13,77.0,2.88,5.0
6043132,2087.0,648.0,2451.0,163551.0,249632.0,7.0,101.85,85.0,66.73,13.0
6043133,21.0,18.0,1271.0,1677.0,48.0,1.0,0.04,44.0,1.32,6.0
6043134,4.0,54.0,1206.0,265.0,1.0,0.0,0.00,49.0,0.22,18.0


In [19]:
Sent_level = significant_discrete['Sent_level'].apply(numerify)
significant_discrete.drop('Sent_level', axis=1)
significant_discrete = significant_discrete.assign(Sent_level = Sent_level)

In [20]:
np.unique(significant_discrete['Sent_level'])

array([-1,  0,  1], dtype=int64)

In [21]:
significant = significant_discrete.join(significant_continuous)
significant.head()

Unnamed: 0,Has_location,Has_username,Has_Image,Contain_URL,Sent_level,Has_Uword,Posted_holiday,Has_number,Has_rt,Has_org,...,Followers,Followees,Age_account,Total_tweets,Favourite,Groups,Aver_favourite,Length_tweet,Aver_tweets,Name_length
0,0,0,1,1,-1,0,1,0,0,0,...,39113.0,10.0,421.0,1221.0,145.0,1489.0,0.34,111.0,2.9,11.0
1,0,0,0,0,1,0,1,1,0,0,...,663748.0,28.0,2025.0,4658.0,3.0,631.0,0.0,45.0,2.3,16.0
2,1,0,0,1,-1,0,1,0,0,0,...,1836.0,597.0,242.0,272.0,0.0,3.0,0.0,156.0,1.12,6.0
3,0,0,0,1,-1,0,0,0,0,0,...,3494.0,4617.0,226.0,1548.0,711.0,29.0,3.15,74.0,6.85,15.0
4,0,0,0,0,-1,0,1,0,0,0,...,523.0,714.0,375.0,5715.0,3201.0,1.0,8.54,42.0,15.24,8.0


In [25]:
X, y = significant, df['@@class@@']

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42)

In [31]:
X_train.shape

(4834508, 29)

In [32]:
X_test.shape

(1208628, 29)

In [33]:
y_train.shape

(4834508,)

In [34]:
y_test.shape

(1208628,)

# Model

In [39]:
input_shape = (X.shape[1],)
batch_size = 128
learning_rate = 1e-3
epochs = 10

In [36]:
model = keras.Sequential([
    layers.Dense(128, input_shape=input_shape, activation=tf.nn.relu),
    layers.Dense(128, activation=tf.nn.relu),
    layers.Dense(1)
])

In [37]:
model.compile(
    loss = losses.BinaryCrossentropy(from_logits=True),
    optimizer = optimizers.Adam(learning_rate),
    metrics = [metrics.BinaryAccuracy()]
)

In [40]:
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x23cac39a230>

In [41]:
model.evaluate(X_test, y_test)



[0.4396185278892517, 0.7904946804046631]