In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('data.csv', encoding='utf-8', header=None, names=['target', 'ids', 'date', 'flag', 'user', 'text'])
data.drop(columns=['ids', 'date', 'flag', 'user'], inplace=True)
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

In [3]:
from sklearn.utils import shuffle

In [5]:
data = shuffle(data, random_state=42).reset_index(drop=True)
data

Unnamed: 0,target,text
0,0,internet is slow on me today likes
1,4,"@tooclevername to those with hammers, everythi..."
2,0,These kids are crazy save me
3,0,my puppy libby scratched my heel and now it's...
4,4,"okay, my ipod has been synced. FINALLY. so hap..."
...,...,...
1599995,4,At a rooftop party ln brooklyn! Hey at least l...
1599996,0,"@mobilephone2003 Only for US, UK and Canada"
1599997,4,"@AureliusTjin Haha, yeah. Your name is very u..."
1599998,0,I lot DT by less then 15 points again at lea...


In [None]:
avg = data['text'].mean()
data = data[data['text'] <= avg]

In [6]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
#from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

In [7]:
df = data

In [8]:
from sklearn.model_selection import train_test_split


In [9]:
# Convert 'target' to bnary sentiment labels (0 or 1)
df['target'] = df['target'].apply(lambda x: 0 if x == 0 else 1)

# Split data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert data to TensorFlow datasets
train_data = tf.data.Dataset.from_tensor_slices((train_df['text'].values, train_df['target'].values))
test_data = tf.data.Dataset.from_tensor_slices((test_df['text'].values, test_df['target'].values))


In [None]:
data.to_csv('data_outliersremoved.csv')

In [27]:
bert_model_url = "https://tfhub.dev/google/experts/bert/wiki_books/sst2/2"  # SST-2 model trained for sentiment analysis
preprocessor_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

bert_preprocessor = hub.KerasLayer(preprocessor_url)
bert_encoder = hub.KerasLayer(bert_model_url)


In [38]:
def build_model():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessed_text = bert_preprocessor(text_input)
    outputs = bert_encoder(preprocessed_text)

    # Extract the pooled output from the BERT encoder
    net = outputs['pooled_output']
    
    # Add dropout for regularization
    net = tf.keras.layers.Dropout(0.1)(net)
    
    # Add additional dense layers with ReLU activation
    net = tf.keras.layers.Dense(128, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)  # Add dropout to the new dense layer
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(16, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    
    # Final output layer with sigmoid activation for binary classification
    net = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(net)
    
    return tf.keras.Model(inputs=[text_input], outputs=[net])

# Initialize and build the model
model = build_model()

In [39]:
model.compile(optimizer= tf.keras.optimizers.Adam(learning_rate=0.01),
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [36]:
# Original shape
original_shape = df.shape
print("Original DataFrame shape:", original_shape)

# Calculate the new size (1/100 of the original)
new_size = original_shape[0] // 100  # integer division to get the whole number
print("New size for training data:", new_size)

# Randomly sample the training data
smaller_train_df = train_df.sample(n=new_size, random_state=42)

# Check the shape of the new training dataset
print("Smaller Training DataFrame shape:", smaller_train_df.shape)

smaller_train_data = tf.data.Dataset.from_tensor_slices((smaller_train_df['text'].values, smaller_train_df['target'].values))

smaller_test_df = test_df.sample(n=new_size, random_state=42)

# Check the shape of the new training dataset
print("Smaller Training DataFrame shape:", smaller_test_df.shape)

smaller_test_data = tf.data.Dataset.from_tensor_slices((smaller_test_df['text'].values, smaller_test_df['target'].values))

Original DataFrame shape: (1600000, 2)
New size for training data: 16000
Smaller Training DataFrame shape: (16000, 2)
Smaller Training DataFrame shape: (16000, 2)


In [44]:
train_data = tf.data.Dataset.from_tensor_slices((train_df['text'].values, train_df['target'].values))

In [47]:
BATCH_SIZE = 32
train_data =smaller_train_data.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_data = smaller_test_data.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

with tf.device('/GPU:0'):
    model.fit(train_data, epochs=5)


Num GPUs Available:  1
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [41]:
loss, accuracy = model.evaluate(test_data)
print(f"Test Accuracy: {accuracy:.2f}")
    

Test Accuracy: 0.79


In [37]:
model.save('Test_79', include_optimizer=False)



In [None]:
# Load the model
loaded_model = tf.keras.models.load_model('Test_79', custom_objects={'KerasLayer': hub.KerasLayer})

-------
HYPERTUNNING
------

In [51]:
import tensorflow as tf
import tensorflow_hub as hub
import keras_tuner as kt

# Hypermodel definition
def build_hypermodel(hp):
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessed_text = bert_preprocessor(text_input)
    outputs = bert_encoder(preprocessed_text)
    
    # BERT pooled output
    net = outputs['pooled_output']
    
    # Add dense layers with hyperparameter tuning
    for i in range(hp.Int("num_layers", 1, 3)):  # Tune 1 to 3 additional layers
        net = tf.keras.layers.Dense(
            units=hp.Choice(f"units_{i}", [32, 64, 128]),  # Tune size per layer
            activation='relu'
        )(net)
        net = tf.keras.layers.Dropout(hp.Float(f"dropout_{i}", 0.1, 0.5, step=0.1))(net)
    
    # Output layer
    net = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(net)
    
    # Compile the model
    model = tf.keras.Model(inputs=[text_input], outputs=[net])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=hp.Choice("learning_rate", [1e-5, 3e-5, 1e-4])
        ),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

# Hyperparameter tuning setup
tuner = kt.Hyperband(
    build_hypermodel,
    objective='accuracy',
    max_epochs=10,
    factor=3,
    directory='hyperband_dir',
    project_name='text_sentiment_analysis'
)

# Run the search
tuner.search(train_data, epochs=5)

# Retrieve best model
best_model = tuner.get_best_models(num_models=1)[0]

# Summary of the best model
best_model.summary()


Trial 26 Complete [00h 11m 01s]
accuracy: 0.7975624799728394

Best accuracy So Far: 0.8276875019073486
Total elapsed time: 04h 50m 05s
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_8 (KerasLayer)     {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':        