In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re
import os
import pickle
from tensorflow import keras

## Load data tweets data set

In [2]:
train_path = 'training_data/train.csv'
test_path = 'training_data/test.csv'
save_model_path = 'model_save'
tokenizer_path = 'output'

In [3]:
print('[!!] Loading data ...')
cols = ["sentiment", "id", "date", "query", "user", "text"]
train_data = pd.read_csv(
    train_path,
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)
test_data = pd.read_csv(
    test_path,
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)

[!!] Loading data ...


In [4]:
print('Train data head')
display(train_data.head())

Train data head


Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
print('Test data head')
display(test_data.head())

Test data head


Unnamed: 0,sentiment,id,date,query,user,text
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...


In [6]:
data = train_data  # using only the train data set

## Preprocessing

### Cleaning

In [7]:
data.drop(["id", "date", "query", "user"],
          axis=1,
         inplace=True)

In [8]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Removing the @
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Removing the URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Keeping only letters
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

### Loading data_clean variable if exists else create one

In [9]:
data_clean_save = os.path.join(tokenizer_path, 'data_clean.pickle')
if os.path.exists(data_clean_save):
    print('[!!] Loading data clean')
    with open(data_clean_save, 'rb') as load_file:
        data_clean = pickle.load(load_file)
    print('[!!] Data_clean loaded [!!]')
else:
    print('[!!] Cleaning data using clean_tweet function ...')
    data_clean = [clean_tweet(tweet) for tweet in data.text]
    with open(data_clean_save, 'wb') as save_file:
        pickle.dump(data_clean, save_file)

[!!] Loading data clean
[!!] Data_clean loaded [!!]


In [10]:
print(data_clean[:5])
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1
print('Data labels ..')
print(data_labels)

[" Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D", "is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!", ' I dived many times for the ball. Managed to save The rest go out of bounds', 'my whole body feels itchy and like its on fire ', " no it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. "]
Data labels ..
[0 0 0 ... 1 1 1]


### Loading tokenizer variable if exists

In [11]:
tokenizer_save = os.path.join(tokenizer_path, 'tokenizer.pickle')
if os.path.exists(tokenizer_save):
    print('[!!] Loading tokenizer data')
    with open(tokenizer_save, 'rb') as load_file:
        tokenizer = pickle.load(load_file)
    print('[!!] Tokenizer_data loaded')

[!!] Loading tokenizer data
[!!] Tokenizer_data loaded


### Loading data_inputs variable

In [12]:
data_inputs_save = os.path.join(tokenizer_path, 'data_inputs.pickle')
if os.path.exists(data_inputs_save):
    print('[!!] Loading data input')
    with open(data_inputs_save, 'rb') as load_file:
        data_inputs = pickle.load(load_file)
    print('[!!] data_inputs loaded')
else:
    print('[!!] Encoding data')
    data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]
    with open(data_inputs_save, 'wb') as save_file:
        pickle.dump(data_inputs, save_file)

[!!] Loading data input
[!!] data_inputs loaded


### Padding

In [13]:
print('[!!] Padding ..')
MAX_LEN = max([len(sentence) for sentence in data_inputs])
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,
                                                            value=0,
                                                            padding="post",
                                                            maxlen=MAX_LEN)

[!!] Padding ..


### Splitting the training/tsting set

In [14]:
print('[!!] Splitting the traning/testing set')
test_idx = np.random.randint(0, 800000, 8000)
test_idx = np.concatenate((test_idx, test_idx + 800000))
test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]
train_inputs = np.delete(data_inputs, test_idx, axis=0)
train_labels = np.delete(data_labels, test_idx)

[!!] Splitting the traning/testing set


In [15]:
train_inputs.shape

(1584092, 73)

In [16]:
VOCAB_SIZE = tokenizer.vocab_size
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = len(set(train_labels))

DROPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 1  # 5

In [17]:
train_inputs.shape

(1584092, 73)

## Define model

In [18]:
model = keras.Sequential()

In [19]:
model.add(keras.layers.Embedding(VOCAB_SIZE, EMB_DIM))

In [20]:
model.add(keras.layers.Conv1D(filters=NB_FILTERS, kernel_size=2, padding='valid', activation='relu'))

In [21]:
model.add(keras.layers.GlobalMaxPool1D())

In [22]:
model.add(keras.layers.Dense(units=FFN_UNITS, activation='relu'))

In [23]:
model.add(keras.layers.Dropout(rate=DROPOUT_RATE))

In [24]:
model.add(keras.layers.Dense(units=1, activation='sigmoid'))  # Last layer

In [25]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 200)         13108000  
_________________________________________________________________
conv1d (Conv1D)              (None, None, 100)         40100     
_________________________________________________________________
global_max_pooling1d (Global (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 256)               25856     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 257       
Total params: 13,174,213
Trainable params: 13,174,213
Non-trainable params: 0
____________________________________________

### Compile

In [26]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

### Fit

In [27]:
model.fit(train_inputs, train_labels, batch_size=BATCH_SIZE, epochs=NB_EPOCHS)

Train on 1584092 samples


<tensorflow.python.keras.callbacks.History at 0x13ede3f90>

In [28]:
test_inputs

array([[65316,  2398,    73, ...,     0,     0,     0],
       [ 4097,    41,    16, ...,     0,     0,     0],
       [65316,    78,     7, ...,     0,     0,     0],
       ...,
       [17038, 13189,     6, ...,     0,     0,     0],
       [65316,   389, 17678, ...,     0,     0,     0],
       [65316,   773,    39, ...,     0,     0,     0]], dtype=int32)

In [29]:
np.array([tokenizer.encode("You are so funny")]).shape

(1, 4)

In [46]:
model.predict(np.array([tokenizer.encode("I have excellent apartment")]))

array([[0.97075653]], dtype=float32)

In [31]:
train_inputs.shape

(1584092, 73)

In [32]:
test = np.array([tokenizer.encode("You are so funny")])

In [33]:
model.predict(test)

array([[0.94493103]], dtype=float32)

In [47]:
model.save('cnn_model.h5')

In [48]:
loaded_model = tf.keras.models.load_model('cnn_model.h5')

In [49]:
loaded_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 200)         13108000  
_________________________________________________________________
conv1d (Conv1D)              (None, None, 100)         40100     
_________________________________________________________________
global_max_pooling1d (Global (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 256)               25856     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 257       
Total params: 13,174,213
Trainable params: 13,174,213
Non-trainable params: 0
____________________________________________

In [52]:
loaded_model.predict(np.array([tokenizer.encode("we're close to the peak of coronavirus right? this shit sucks LOL")]))

array([[0.11510569]], dtype=float32)