# Corona Tweets Classification using Keras:

## Package & Libraries:

In [None]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords

nltk.download('stopwords')

## Data reading

In [None]:
# Load the train and test splits of the Corona tweets dataset.
# NOTE(review): the paths are absolute and machine-specific; adjust them before running elsewhere.
# The files are decoded with the 'latin' codec (presumably because they are not valid UTF-8 - confirm).
train = pd.read_csv('/Users/spavot/Documents/Perso/Text classification & Visualization/Data/Corona_NLP_train.csv', encoding = 'latin')
test = pd.read_csv('/Users/spavot/Documents/Perso/Text classification & Visualization/Data/Corona_NLP_test.csv', encoding ='latin')

## Data exploration:

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train.info()

* We see that we have 41157 rows but only about 32k non-null values for Location; we will have to fix this:

* UserName and ScreenName are ID-related columns, so we won't use them.


Let's see which locations are the most common:

In [None]:
# Count the tweets per location and keep the 15 most frequent locations.
location = (
    pd.DataFrame(train.Location)
    .assign(Count=1)
    .groupby('Location')
    .sum()
    .sort_values(by='Count', ascending=False)
    .nlargest(15, ['Count'])
    .reset_index()
)

# Horizontal bar chart: one bar per location, bar length = number of tweets.
plt.figure(figsize=(25, 7))
sns.barplot(data=location, x='Count', y='Location')
plt.show()

We can see that the data is noisy: some locations are countries whereas others are cities.

Now let's look at the distribution of our target variable:

In [None]:
# Bar chart of how many tweets fall into each Sentiment class.
plt.figure(figsize=(25, 7))
# The column is passed through the `x=` keyword: recent seaborn releases only
# accept `data` positionally, so a positional vector is no longer read as x.
sns.countplot(x=train.Sentiment)
plt.show()

The target variable distribution does not look heavily skewed, so we don't risk having a category that is never predicted because it is under-represented.

Let's look at the distribution of tweets over time:

In [None]:
# Number of tweets per day, plotted as a time series.
time = pd.DataFrame(train.TweetAt)
time['Count'] = 1
time = time.groupby('TweetAt').sum()
time = time.reset_index()
# NOTE(review): the first group is discarded, exactly as in the original
# analysis. TweetAt is only parsed to datetime two lines below, so which
# group comes first depends on the grouping order of the raw values -
# confirm that dropping it is intended.
# .copy() detaches the slice so the column assignment below writes to an
# independent frame instead of triggering pandas' SettingWithCopyWarning.
time = time.iloc[1:, :].copy()
time['TweetAt'] = pd.to_datetime(time['TweetAt'], format='%d-%m-%Y')
plt.figure(figsize=(25, 7))
sns.lineplot(x='TweetAt', y='Count', data=time)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Earliest and latest day present in the per-day aggregate built above.
Min, Max = time.TweetAt.min(), time.TweetAt.max()
print(f'The date range of the data is between {Min} and {Max}')

It seems that some days have no data while others have a lot of tweets. The tweets are spread between the 16th of March and the 14th of April.

Finally, let's analyze the lengths of tweets:

In [None]:
# Character length of every raw tweet in the training split.
length_tweets = pd.DataFrame(train.OriginalTweet)
length_measured = [len(tweet) for tweet in length_tweets.OriginalTweet]

# Histogram / density estimate of the measured lengths.
# NOTE(review): seaborn has deprecated distplot in favour of histplot/displot;
# consider migrating when the seaborn version is upgraded.
plt.figure(figsize=(25, 7))
sns.distplot(length_measured)

The lengths seem fairly well distributed: we can see a spike around 260 characters, but overall we have tweets of all sizes. Note that we will have to check the lengths again after processing the tweets.

## Data cleaning:

### Drop variables

Let's first drop UserName and ScreenName as they are only id variables so we won't use them:

In [None]:
# Identifier columns are not used by the model; remove them from both splits.
id_columns = ['UserName', 'ScreenName']
train = train.drop(columns=id_columns)
test = test.drop(columns=id_columns)

### Location variable

First, we will replace "NA" values with "Unknown"

In [None]:
# Replace missing locations with the explicit label 'Unknown'.
# Both splits are filled so that train and test go through the same cleaning
# (the test frame was previously left with NaN locations).
train['Location'] = train['Location'].fillna('Unknown')
test['Location'] = test['Location'].fillna('Unknown')

Now we import a dataframe containing major cities in the world and their countries, in order to join on it:

In [None]:
# Reference table of world cities (the disabled matching loop below reads its `name` column).
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
city_country = pd.read_csv('/Users/spavot/Documents/Perso/Text classification & Visualization/Data/world-cities.csv')

In [None]:
# Placeholder for the city matched to each tweet location; it stays empty
# because the matching loop below is disabled.
city_list = []

# NOTE(review): the disabled loop reassigns city_name on every inner iteration
# (including back to 'Unknown'), so only the comparison with the last city in
# city_country.name would survive. It needs a `break` on the first match (or a
# vectorised lookup) before being re-enabled.
# for i in train.Location:
#     for y in city_country.name:
#         if y.lower() in i.lower():
#             city_name = y
#         else:
#             city_name = 'Unknown'
#     city_list.append(city_name)

In [None]:
# city_list = pd.DataFrame(city_list)
# city_list.value_counts()

We transform the TweetAt column into a datetime variable:

### TweetAt Variable

In [None]:
# Parse the day-month-year strings of TweetAt into datetimes, in both splits.
for frame in (train, test):
    frame['TweetAt'] = pd.to_datetime(frame['TweetAt'], format='%d-%m-%Y')

### Original tweet cleaning

Now we will start cleaning the tweets: we need to remove punctuation, special characters, etc.

In [None]:
# Show the first ten raw tweets before any cleaning is applied.
train.OriginalTweet.head(10)

In [None]:
# Lower-case every tweet in both splits.
for frame in (train, test):
    frame['OriginalTweet'] = frame['OriginalTweet'].str.lower()

In [None]:
# Remove URLs (anything starting with "http" or "www.") from the tweets.
# - Raw string: "\S" in a normal string literal is an invalid escape sequence.
# - "www\." escapes the dot so that only a literal "www." prefix is matched.
# - regex=True is explicit: since pandas 2.0 str.replace treats the pattern as
#   a literal string by default, which would silently leave every URL in place.
url_pattern = r'http\S+|www\.\S+'
train.OriginalTweet = train.OriginalTweet.str.replace(url_pattern, '', case=False, regex=True)
test.OriginalTweet = test.OriginalTweet.str.replace(url_pattern, '', case=False, regex=True)

In [None]:
# Extract hashtags and store them in a new 'Hashtags' column.
# Each row holds the list of tags found in the tweet (without the leading '#'),
# or the string 'None' when the tweet has no hashtag.
hashtag_pattern = re.compile(r"#(\w+)")
for data in train, test:
    # Iterate over the values directly: positional iteration does not depend on
    # the frame having a default 0..n-1 index, and each tweet is scanned once.
    Hashtags = [hashtag_pattern.findall(tweet) or 'None' for tweet in data.OriginalTweet]
    data['Hashtags'] = Hashtags

In [None]:
# Preview the training frame, now including the extracted Hashtags column.
train.head()

In [None]:
# Drop the hashtag tokens from the tweet text now that they live in their own column.
def _without_hashtags(text):
    """Return *text* without the whitespace-separated tokens that start with '#'."""
    kept = [token for token in text.split() if not token.startswith('#')]
    return ' '.join(kept)


train.OriginalTweet = train.OriginalTweet.apply(_without_hashtags)
test.OriginalTweet = test.OriginalTweet.apply(_without_hashtags)

In [None]:
# Remove punctuation, special characters and the '@' of mentions: every
# character that is neither a word character nor whitespace is deleted.
# regex=True is required: since pandas 2.0 str.replace defaults to a literal
# match, which would look for the text "[^\w\s]" and change nothing.
# The former case=False flag was dropped; it has no effect on this character class.
non_word_pattern = r'[^\w\s]'
train.OriginalTweet = train.OriginalTweet.str.replace(non_word_pattern, '', regex=True)
test.OriginalTweet = test.OriginalTweet.str.replace(non_word_pattern, '', regex=True)

In [None]:
# Remove English stopwords (NLTK list) from every tweet.
stop_words = set(stopwords.words('english'))


def _drop_stopwords(text):
    """Return *text* keeping only the tokens that are not in stop_words."""
    return ' '.join(token for token in text.split() if token not in stop_words)


train.OriginalTweet = train.OriginalTweet.apply(_drop_stopwords)
test.OriginalTweet = test.OriginalTweet.apply(_drop_stopwords)

In [None]:
# Keep only purely alphabetic tokens (drops numbers and mixed tokens).
def _alphabetic_only(text):
    """Return *text* restricted to the tokens for which str.isalpha() is true."""
    return ' '.join(filter(str.isalpha, text.split()))


train.OriginalTweet = train.OriginalTweet.apply(_alphabetic_only)
test.OriginalTweet = test.OriginalTweet.apply(_alphabetic_only)

In [None]:
# Remove the rows whose tweet became empty after cleaning.
# The index is rebuilt afterwards: filtering leaves gaps in the labels, and any
# later label-based lookup such as train.OriginalTweet[0] would raise a
# KeyError if that row had been removed.
train = train[train.OriginalTweet != ''].reset_index(drop=True)
test = test[test.OriginalTweet != ''].reset_index(drop=True)

Let's check if the cleaning seems okay:

In [None]:
# Print the first five cleaned tweets of each split.
# .iloc selects by position: the frames were filtered above, so the index
# labels 0-4 are not guaranteed to exist any more and plain [i] (label
# lookup) could raise a KeyError.
for i in range(0, 5):
    print(i, ':', train.OriginalTweet.iloc[i])
    print(i, ':', test.OriginalTweet.iloc[i])

In [None]:
# Preview the cleaned training frame.
train.head()

In [None]:
# Preview the cleaned test frame.
test.head()

It seems 👌, now let's go into the data preparation for our model:

## Data Preparation for the model:

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Encode the text as token counts with scikit-learn's CountVectorizer.
# The vocabulary is learned from the training tweets only.
train_text, test_text = train.OriginalTweet.values, test.OriginalTweet.values

vectorizer = CountVectorizer()
vectorizer.fit(train_text)

In [None]:
# Create the training / test sets.
# Features: token counts produced by the vectorizer fitted on the training text.
X_train = vectorizer.transform(train_text)
X_test = vectorizer.transform(test_text)

# Targets: one-hot encoded Sentiment.
# Both splits are encoded against the same ordered list of classes (taken from
# the training labels). Encoding each split independently only lines up when
# both happen to contain exactly the same classes; with a shared categorical
# dtype the columns always have the same order and count.
sentiment_classes = sorted(train.Sentiment.dropna().unique())
sentiment_dtype = pd.CategoricalDtype(categories=sentiment_classes)
y_train = pd.get_dummies(train.Sentiment.astype(sentiment_dtype)).values
y_test = pd.get_dummies(test.Sentiment.astype(sentiment_dtype)).values


## Modeling Deep Neural Network with Keras

### Simple one layer Model:

In [None]:
from keras import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [None]:
# Baseline network: one hidden layer of 30 ReLU units on top of the
# vectorized input, followed by a 5-way softmax output.
model_simple = Sequential([
    Dense(30, input_dim=X_train.shape[1], activation='relu'),
    Dense(5, activation='softmax'),
])

In [None]:
# Configure training (Adam optimizer, categorical cross-entropy loss,
# accuracy metric) and print the layer summary.
model_simple.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_simple.summary()

In [None]:
# Train for 10 epochs with batches of 25 samples. The test split is passed as
# validation data, so the per-epoch validation metrics are computed on it.
history_simple = model_simple.fit(
    X_train,
    y_train,
    batch_size=25,
    epochs=10,
    validation_data=(X_test, y_test),
    verbose=True,
)

We now define a function to plot the learning curves of our first model:

In [None]:
def plot_learning_curves(history, model):
    """Plot accuracy and loss curves of a training run and print final accuracies.

    Args:
        history: Object returned by ``model.fit``; its ``history`` dict must
            contain the keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
        model: The trained model. It is evaluated on the notebook-level
            ``X_train``/``y_train`` and ``X_test``/``y_test`` as they are bound
            at call time, so call this right after fitting ``model``.
    """
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    # Epoch numbers start at 1; materialised as a list for plotting.
    x = list(range(1, len(acc) + 1))

    plt.figure(figsize=(12, 5))

    # Left panel: accuracy per epoch.
    # x / y are passed by keyword: recent seaborn releases accept only `data`
    # positionally, so lineplot(x, acc) raises a TypeError there.
    plt.subplot(1, 2, 1)
    sns.lineplot(x=x, y=acc, label='Training acc')
    sns.lineplot(x=x, y=val_acc, label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()

    # Right panel: loss per epoch.
    plt.subplot(1, 2, 2)
    sns.lineplot(x=x, y=loss, label='Training loss')
    sns.lineplot(x=x, y=val_loss, label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

    # Final accuracy on both splits (evaluate returns loss first, then the metric).
    loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
    print("Training Accuracy: {:.4f}".format(accuracy))
    loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
    print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
# Learning curves and final accuracies of the single-hidden-layer model.
plot_learning_curves(history_simple, model_simple)

It seems that our model overfits really fast: we end up with a test score of 0.62 whereas we reach 0.98 on the training data. Let's try different types of models and see if we can increase performance.

### Multi layers model:

In [None]:
# Deeper fully-connected network: 64 -> 32 -> 16 ReLU units, then a 5-way softmax.
multi_model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(5, activation='softmax'),
])

In [None]:
# Same training configuration as the baseline, then print the layer summary.
multi_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
multi_model.summary()

In [None]:
# Train for 10 epochs with batches of 25; the test split serves as validation data.
history_Multi = multi_model.fit(
    X_train,
    y_train,
    batch_size=25,
    epochs=10,
    validation_data=(X_test, y_test),
    verbose=True,
)

In [None]:
# Learning curves and final accuracies of the multi-layer model.
plot_learning_curves(history_Multi, multi_model)

We improved the score with more layers: it looks like the model can learn more from the data with more layers, but we still overfit really fast. Let's try to change the way we encode the data.

### Word embeddings to improve the model:

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Embedding, Flatten, GlobalMaxPool1D, Conv1D

In [None]:
# Turn every tweet into a sequence of integer word indices.
# Only the most frequent words (indices below num_words) are kept.
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_text)
# NOTE: X_train / X_test are rebound here from count vectors to index sequences.
X_train = tokenizer.texts_to_sequences(train_text)
X_test = tokenizer.texts_to_sequences(test_text)

# Size of the index space the Embedding layer must cover (+1 for the reserved
# index 0 used for padding). word_index lists every word seen during fitting,
# but texts_to_sequences never emits an index >= num_words, so the size is
# capped there instead of allocating embedding rows that can never be used.
vocab_size = min(num_words, len(tokenizer.word_index) + 1)
print(train_text[1])
print(X_train[1])

In [None]:
# Bring every sequence to exactly `maxlen` entries, padding at the end ('post').
maxlen = 100

X_train, X_test = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (X_train, X_test)
)

In [None]:
# Embedding model: learn a 50-dimensional vector per word index, flatten the
# (maxlen x embedding_dim) output and feed it to the same dense stack as before.
embedding_dim = 50

multi_model_Embed = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              input_length=maxlen),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(5, activation='softmax'),
])

In [None]:
# Same training configuration as the previous models, then print the layer summary.
multi_model_Embed.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
multi_model_Embed.summary()

In [None]:
# Train for 10 epochs with batches of 10; the test split serves as validation data.
history_Multi_Embed = multi_model_Embed.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=10,
    validation_data=(X_test, y_test),
    verbose=True,
)

In [None]:
# Learning curves and final accuracies of the embedding + flatten model.
plot_learning_curves(history_Multi_Embed, multi_model_Embed)

Add GlobalMaxPool:

In [None]:
# Same embedding model, but the sequence axis is reduced with a global max
# pooling layer instead of being flattened.
multi_model_Embed_Max = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              input_length=maxlen),
    GlobalMaxPool1D(),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(5, activation='softmax'),
])

In [None]:
# Same training configuration as the previous models, then print the layer summary.
multi_model_Embed_Max.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
multi_model_Embed_Max.summary()

In [None]:
# Train for 10 epochs with batches of 10; the test split serves as validation data.
Multi_Embed_Max = multi_model_Embed_Max.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=10,
    validation_data=(X_test, y_test),
    verbose=True,
)

In [None]:
# Learning curves and final accuracies of the embedding + global max pooling model.
plot_learning_curves(Multi_Embed_Max, multi_model_Embed_Max)

Convolutional Neural Networks:

In [None]:
# Convolutional model: a 1-D convolution (128 filters, kernel size 5) over the
# embedded sequence, global max pooling, then the usual dense stack.
model_Conv = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              input_length=maxlen),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPool1D(),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(5, activation='softmax'),
])

In [None]:
# Same training configuration as the previous models, then print the layer summary.
model_Conv.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_Conv.summary()

In [None]:
# Train for 10 epochs with batches of 10; the test split serves as validation data.
history_Conv = model_Conv.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=10,
    validation_data=(X_test, y_test),
    verbose=True,
)

In [None]:
# Learning curves and final accuracies of the convolutional model.
plot_learning_curves(history_Conv, model_Conv)

Hyperparameters tuning:

In [None]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    """Build and compile the convolutional text classifier.

    Args:
        num_filters: Number of filters of the Conv1D layer.
        kernel_size: Width of the convolution window.
        vocab_size: ``input_dim`` of the Embedding layer.
        embedding_dim: ``output_dim`` of the Embedding layer.
        maxlen: ``input_length`` of the Embedding layer.

    Returns:
        A compiled Sequential model (Adam, categorical cross-entropy, accuracy)
        ending in a 5-unit softmax layer.
    """
    layers = [
        Embedding(input_dim=vocab_size,
                  output_dim=embedding_dim,
                  input_length=maxlen),
        Conv1D(num_filters, kernel_size, activation='relu'),
        GlobalMaxPool1D(),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        Dense(5, activation='softmax'),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

In [None]:
# Search space handed to RandomizedSearchCV; every key is a parameter of
# create_model. Only the convolution hyperparameters are actually varied.
# vocab_size, embedding_dim and maxlen reuse the notebook variables set during
# tokenization / padding instead of hard-coded copies (45929, 50, 100), which
# would silently go stale if the data or the cleaning steps change.
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7],
                  vocab_size=[vocab_size],
                  embedding_dim=[embedding_dim],
                  maxlen=[maxlen])

In [None]:
# Randomized hyperparameter search over the convolutional model.
epochs = 10

# scikit-learn compatible wrapper that builds a fresh model through create_model.
model_grid = KerasClassifier(
    build_fn=create_model,
    batch_size=10,
    epochs=epochs,
    verbose=True,
)

# 5 random parameter combinations, each scored with 4-fold cross-validation,
# using all available cores.
grid = RandomizedSearchCV(
    estimator=model_grid,
    param_distributions=param_grid,
    n_iter=5,
    cv=4,
    n_jobs=-1,
    verbose=2,
)

grid_result = grid.fit(X_train, y_train)

# Evaluate testing set
# NOTE(review): neither grid_result nor test_accuracy is displayed in this cell.
test_accuracy = grid.score(X_test, y_test)