In [1]:
# Import package
import pandas as pd
import numpy as np
# 
path = '../Data/'

In [2]:
df = pd.read_csv(path + 'df_cleaned.csv')

In [3]:
# checking for nulls
df.isnull().sum()

target           0
ids              0
tweet_date       0
flag             0
user             0
text             0
text_clean    8258
dtype: int64

In [4]:
# Remove Null Tweets
df.dropna(inplace=True)

In [5]:
# Label ONLY has Positive(1) or Negative(0) on the target field so, this exercise is a binary classification problem.
# Balanced Dataset assumption made
df.target.value_counts()

0    796361
1    795381
Name: target, dtype: int64

In [6]:
pd.set_option("display.max_colwidth", 0)
df[['target','text', 'text_clean']].sample(5)

Unnamed: 0,target,text,text_clean
565278,0,"@Maggadoo You liar, your last words were &quot;Can I call u later?&quot; and I said YES. Its all good dont worry",liar last word quot cal lat quot said ye good dont worry
221152,0,"@OfficialBabyV whatever anyone else is saying, i'm going to miss your tweets! come back to us soon girl!",whatev anyon els say going miss tweet com back us soon girl
1249139,1,"@heykim Kim, you are Tweeter of the year! Seriously.",kim tweet year sery
696574,0,Sick teen = crazy sleepness night. Not quite sure how we're going to get back home without an incident at this point. Poor girl. #fb,sick teen crazy sleep night not quit sur going get back hom without incid point poor girl
620545,0,"@calebfox yeah, probably",yeah prob


## User Function

In [7]:
def scoring_model(model, X_train, X_test, y_train, y_test, y_pred):
    # Import
    from sklearn.metrics import accuracy_score, auc, roc_auc_score, roc_curve
    #
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, 
                                                                    model.predict_proba(X_train)[:,1])
    #
    tr_score = model.score(X_train, y_train)
    ts_score = model.score(X_test, y_test)
    acc = accuracy_score(y_test, y_pred)
    auc = auc(false_positive_rate, true_positive_rate)
    roc_tr = roc_auc_score(y_train, model.predict(X_train))
    roc_t = roc_auc_score(y_test, model.predict(X_test))
    return tr_score,ts_score,acc,auc,roc_tr,roc_t

## Vectorizer

In [8]:
# import library
from sklearn.model_selection import train_test_split
# Getting tokenization of tweet text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
# 
t_size = 0.20
seed = 11
# 

# TfidfVectorizer: unigrams and bigrams
parms = {'max_df' : 0.995,
           'min_df': 0.001,
           'ngram_range' : (1,2),
          }

### Tfidf Vectorizer with unigrams and bigrams

In [9]:
# incode Tfidf Vectorizer
tv = TfidfVectorizer(**parms)
# 
X = tv.fit_transform(df.text_clean).toarray()
y = df.target
# 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=t_size, random_state=seed)
# 
print('Tfidf Vectorizer shape: ', X.shape, )
# 

Tfidf Vectorizer shape:  (1591742, 1163)


### Neural Network with Tfidf Vectorizer

In [10]:
from sklearn.neural_network import MLPClassifier
# 
import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt
# 
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
# 

#### Tuning hidden layer & learning rate for keras

In [11]:
features = X_train.shape[1]
# 
def model_builder(hp):
    model = keras.Sequential()
    model.add(keras.layers.Flatten(input_shape=(features,1)))

    # Tune the number of units in the first Dense layer
    # Choose an optimal value between 32-512
    hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
    model.add(keras.layers.Dense(units=hp_units, activation='relu'))
    model.add(keras.layers.Dense(10))

    # Tune the learning rate for the optimizer
    # Choose an optimal value from 0.5, 0.1, 0.01, 0.001, or 0.00011
    hp_learning_rate = hp.Choice('learning_rate', values=[5e-1, 1e-1, 1e-2, 1e-3, 1e-4])

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
                metrics=['accuracy'])

    return model
# 
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='tmp',
                     project_name='keras_tuning')
# 
# Create a callback to stop training early after reaching a certain value for the validation loss.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
# 
tuner.search(X_train, y_train, epochs=50, validation_split=0.2, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
# 
layer = (best_hps.get('units'))
learning_rate = (best_hps.get('learning_rate'))
#

Trial 30 Complete [00h 08m 07s]
val_accuracy: 0.7758943438529968

Best val_accuracy So Far: 0.7760632038116455
Total elapsed time: 01h 45m 12s
INFO:tensorflow:Oracle triggered exit


#### Keras Classifier with Tfidf Vectorizer

In [12]:
# Function to create model, required for KerasClassifier
def create_model_(optimizer='adam', init='glorot_uniform'):
    # create model
    model = Sequential()
    model.add(Dense(layer, input_dim=features, activation='relu', kernel_initializer=init))
    model.add(Dense(1, activation='sigmoid'))
    opt = keras.optimizers.Adam(learning_rate=learning_rate)
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    return model
# 
# create model
model_k = KerasClassifier(build_fn=create_model_, verbose=1)

%time model_k.fit(X_train, y_train, validation_split=0.20, epochs=15, batch_size=10)
# 
y_pred = model_k.predict(X_test)
# 
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('\n')
# 
training_score,test_score,accuracy_score,auc_score,roc_auc_train,roc_auc_test = \
                    scoring_model(model_k, X_train, X_test, y_train, y_test, y_pred)
# print the scores on training and test set
print('Training set score:         {:.10f}'.format(training_score))
print('Test set score:             {:.10f}'.format(test_score))
print('Accuracy Test set Score:    {:.10f}'.format(accuracy_score))
print('AUC Score:                  {:.10f}'.format(auc_score))
print('ROC AUC Training set Score: {:.10f}'.format(roc_auc_train))
print('ROC AUC Test set Score:     {:.10f}'.format(roc_auc_test))
# 

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
CPU times: user 1h 41min 3s, sys: 25min 39s, total: 2h 6min 42s
Wall time: 35min 58s
[[123178  36223]
 [ 34190 124758]]


Training set score:         0.7971286178
Test set score:             0.7788181901
Accuracy Test set Score:    0.7788182152
AUC Score:                  0.8779741031
ROC AUC Training set Score: 0.7971309979
ROC AUC Test set Score:     0.7788268546


#### MLP Classifier

In [13]:
# import sklearn.neural_network
# 
# parms = {'hidden_layer_sizes': (100,),
#          'activation': 'logistic',    # {‘identity’, ‘logistic’, ‘tanh’, ‘relu’}, default=’relu’
#          'solver': 'sgd',             # {‘lbfgs’, ‘sgd’, ‘adam’}
#          'alpha': 0.0001,             # L2 penalty (regularization term) parameter
#          'batch_size': 'auto',
#          'learning_rate': 'constant', # ‘constant’ is a constant learning rate given by ‘learning_rate_init’.
#                                       # {‘constant’, ‘invscaling’, ‘adaptive’}
#          'learning_rate_init': 0.001, # The initial learning rate used. It controls the step-size in updating the 
#                                       # weights. 
#                                       # Only used when solver=’sgd’ or ‘adam’.
#          'power_t': 0.5,              # The exponent for inverse scaling learning rate. It is used in updating  
#                                       # effective learning rate when the learning_rate is set to ‘invscaling’. 
#                                       # Only used when solver=’sgd’.
#          'max_iter': 1000,
#          'shuffle': True,
#          'random_state': seed,
#          'tol': 0.0001,
#          'verbose': False,
#          'warm_start': False,
#          'momentum': 0.9,             # Momentum for gradient descent update. Should be between 0 and 1. 
#                                       # Only used when solver=’sgd’.
#          'nesterovs_momentum': True,
#          'early_stopping': False,
#          'validation_fraction': 0.1,  # The proportion of training data to set aside as 
#                                       # validation set for early stopping. Must be between 0 and 1. 
#                                       # Only used if early_stopping is True.
#          'beta_1': 0.9,               # Exponential decay rate for estimates of first moment vector in adam, should 
#                                       # be in [0, 1). Only used when solver=’adam’.
#          'beta_2': 0.999,             # Exponential decay rate for estimates of second moment vector in adam, should 
#                                       # be in [0, 1). Only used when solver=’adam’.
#          'epsilon': 1e-08,            # Value for numerical stability in adam. Only used when solver=’adam’.
#          'n_iter_no_change': 10,      # Maximum number of epochs to not meet tol improvement. Only effective 
#                                       # when solver=’sgd’ or ‘adam’.
#         }
# 
# using Hidden layer & Learning rate from keras optimizer
parms = {'hidden_layer_sizes': (layer,),
         'activation': 'logistic',
         'solver': 'adam',
         'learning_rate_init': learning_rate,
         'random_state': seed,
        }
# Create a model Tfidf 
mlp_tfidf = MLPClassifier(**parms)
# Train the model on the train data set
%time mlp_tfidf.fit(X_train, y_train)
# Evaluate on test data
y_pred = mlp_tfidf.predict(X_test)
# 
print('\n')
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('\n')
# 
training_score,test_score,accuracy_score,auc_score,roc_auc_train,roc_auc_test = \
                    scoring_model(mlp_tfidf, X_train, X_test, y_train, y_test, y_pred)
# print the scores on training and test set
print('Training set score:         {:.10f}'.format(training_score))
print('Test set score:             {:.10f}'.format(test_score))
print('Accuracy Test set Score:    {:.10f}'.format(accuracy_score))
print('AUC Score:                  {:.10f}'.format(auc_score))
print('ROC AUC Training set Score: {:.10f}'.format(roc_auc_train))
print('ROC AUC Test set Score:     {:.10f}'.format(roc_auc_test))
# 

CPU times: user 2h 8min 58s, sys: 24min 4s, total: 2h 33min 2s
Wall time: 19min 15s


[[117730  41671]
 [ 33019 125929]]


Training set score:         0.7646563158
Test set score:             0.7653832743
Accuracy Test set Score:    0.7653832743
AUC Score:                  0.8447723594
ROC AUC Training set Score: 0.7646672616
ROC AUC Test set Score:     0.7654214723
