<h1 style="text-align:center; background-color:#4DBDE3; font-weight:bold; color:white;border-radius: 50px 15px">Coronavirus tweets</h1>

**To review different concepts in NLP, I looked at different notebooks and I really liked this one :**
* https://www.kaggle.com/andreshg/commonlit-a-complete-analysis

**Go take a look if you have time !**

<a id='table_contents'></a>
<h1 style="text-align:center; background-color:#4DBDE3; color:white; border-radius: 50px 15px">
   📋&ensp; Table of Contents &ensp;📋 
</h1>

* **[Import libraries](#libraries)**
* **[Load data](#load_data)**
* **[EDA](#eda)**
    * [Missing values](#missing_val)
    * [Target visualization](#target_vis)
    * [Variables visualization](#var_vis)
* **[Data Preprocessing](#data_preprocessing)**
    * [Pretreatment / Corpus cleaning](#pretreatment)
    * [Target processing](#target_process)
* **[Tokens visualization](#tokens_vis)**
    * [Top Words](#top_words)
    * [WordCloud](#wordcloud)
* **[Clustering](#clustering)**
    * [KMeans](#kmeans)
    * [Latent semantic analysis (LSA)](#lsa)
    * [Latent Dirichlet Allocation (LDA)](#lda)
* **[Vectorization](#vectorization)**
    * [Count Vectorizer](#countvect)
    * [TF-IDF Vectorizer](#tfidfvect)
    * [Continuous Bag of Words (CBOW)](#cbow)
    * [Skip-Gram (SG)](#sg)
    * [Test learnt embeddings](#learn_embeddings)
* **[Modeling](#modeling)**
    * [Class rebalancing (Re-sampling)](#class_rebalancing)
    * [XGBoost](#xgboost)
    * [Linear Support Vector Classification](#linsvc)
    * [LSTM](#lstm)

<a id='libraries'></a>
<h1 style="text-align:center; background-color:#4DBDE3; color:white; border-radius: 50px 15px">
    Import libraries
</h1>

In [None]:
# Visualizations
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt

# Plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

# Text manipulations
import re
import string
import unicodedata

# Natural Language Toolkit
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

# WordCloud
from PIL import Image
from wordcloud import WordCloud

# Scikit-learn
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, learning_curve
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, plot_confusion_matrix, f1_score
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation

#XGBoost model
import xgboost as xgb

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models

# Imbalanced-Learn Library
from imblearn.under_sampling import RandomUnderSampler, AllKNN, NearMiss
from imblearn.over_sampling import RandomOverSampler, BorderlineSMOTE , SMOTE, ADASYN
from imblearn.combine import SMOTEENN, SMOTETomek

# Keras
from keras.preprocessing.text import Tokenizer
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.layers import (LSTM, Embedding, BatchNormalization, Dense, 
                        Dropout, Bidirectional, GlobalMaxPool1D)

# Warning library
import warnings
warnings.filterwarnings("ignore")

# Others
import time
from pprint import pprint
from collections import Counter

# Some colors for prints
class print_color:
    """ANSI escape sequences used to colour or emphasise text passed to print().

    Prefix a string with one or more of these codes and terminate it with
    END to restore the terminal's default rendering.
    """

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset every colour / attribute set above
    END = '\033[0m'

<a id='load_data'></a>
<h1 style="text-align:center; background-color:#4DBDE3; color:white; border-radius: 50px 15px">
    Load data
</h1>

In [None]:
# Read the train and test CSV files from the Kaggle input directory
# (paths are relative to the notebook's working directory).
train = pd.read_csv("../input/covid-19-nlp-text-classification/Corona_NLP_train.csv")
test = pd.read_csv("../input/covid-19-nlp-text-classification/Corona_NLP_test.csv")
# Report (rows, columns) of both frames.
print(f"Train shape : {train.shape}\nTest shape  : {test.shape}")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

[Back to table of contents](#table_contents)

<a id='eda'></a>
<h1 style="text-align:center; background-color:#4DBDE3; color:white; border-radius: 50px 15px">
   EDA
</h1>

<a id='missing_val'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   Missing Values
</h2>

### Train

In [None]:
# Number of missing values per column of the training frame ...
print(train.isna().sum())
# ... and the matching missingno bar chart.
msno.bar(train, color=px.colors.qualitative.D3[0], sort="ascending", figsize=(10,5), fontsize=13)
plt.show()

### Test

In [None]:
# Number of missing values per column of the test frame ...
print(test.isna().sum())
# ... and the matching missingno bar chart (different colour than for train).
msno.bar(test, color=px.colors.qualitative.D3[2], sort="ascending", figsize=(10,5), fontsize=13)
plt.show()

Some values are missing in the **Location** column, but this column will not be important to us.

<a id='target_vis'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   Target visualization
</h2>

In [None]:
# Number of tweets per raw sentiment label.
val_counts = train["Sentiment"].value_counts()

# One row, two panels: a donut chart (left) and a bar chart (right).
fig = make_subplots(
        rows=1, cols=2,
        specs=[[{"type": "pie"}, {"type": "bar"}]],
        subplot_titles=("Pie", "Bar")
    )

# Build the pie with plotly express and move its single trace into the left panel.
# update_traces() is chained here, while the pie is the only trace of the figure,
# so the text settings are applied before the bar trace exists.
fig.add_trace(
     px.pie(val_counts, 
            names=val_counts.index, 
            values=val_counts.values,
            hole=0.4,
            opacity=0.9
    ).data[0],
     row=1, col=1
    
).update_traces(
    textposition='inside', textinfo='percent+label'
).update_layout(
    title=dict(text='<b>Sentiment distribution<b>', x=0.48), 
    legend=dict(x=0, y=-0.05, orientation='h')
)

# Same counts as bars in the right panel.
fig.add_trace(
     go.Bar(x=val_counts.index, 
            y=val_counts.values,
            marker_color=px.colors.qualitative.Set2,
            showlegend=False
    ),
     row=1, col=2
)

# The first annotation is the "Pie" subplot title: it is re-purposed as the
# label displayed in the hole of the donut.
fig['layout']['annotations'][0].update(text='Sentiment', x=0.225, y=0.47, showarrow=False, font_size=15, opacity=0.9)

fig.show()

As we can see, this dataset is **unbalanced**.

<a id='var_vis'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   Variables visualization
</h2>

### Location

In [None]:
# The 20 locations that appear the most
counts = train["Location"].value_counts().sort_values(ascending=False)[:20]

# Keep only the tweets from those locations, then count tweets per (Location, Sentiment).
df = train.loc[train["Location"].isin(counts.index)]
df = df.groupby(by=["Location","Sentiment"])["OriginalTweet"].count().reset_index(name="Count")

# Stacked bars: category_orders fixes both the location order (most frequent first)
# and the sentiment order, which the colour sequence follows (red -> green).
fig = px.bar(df, x='Location', y='Count', color='Sentiment', 
             title="Top 20 <b>locations that appear the most<b>", 
             color_discrete_sequence=["#F96C6C", "#F9B06C", "#F9F96C", "#AFF96C", "#3DF961"],
             category_orders={
                 "Location": counts.index, 
                 "Sentiment": ["Extremely Negative","Negative","Neutral","Positive","Extremely Positive"]
             }
        )

fig.show()

Tweets are generally from **London** and the **United States**.

### TweetAt

In [None]:
# Number of tweets per calendar day.
# NOTE(review): TweetAt appears to be stored day-first (DD-MM-YYYY) - confirm against the CSV.
# Without dayfirst=True pandas reads ambiguous values such as "05-04-2020" month-first
# (5 May instead of 5 April), which scatters the tweets over the wrong months.
counts = pd.to_datetime(train["TweetAt"], dayfirst=True).value_counts().sort_index()
# The bold tag is now properly closed (</b>) in the title.
fig = px.bar(counts, x=counts.index, y=counts.values, title="<b>Number of tweets by date</b>")
fig.update_layout(xaxis_title="Date", yaxis_title="Number of tweets")
fig.show()

We can see that many tweets date from **May**.

### OriginalTweet

In [None]:
# Work on a copy of the training frame and add two size features per tweet:
# the number of characters and the number of space-separated chunks.
df = train.copy().assign(
    tweet_len=lambda frame: frame["OriginalTweet"].map(len),
    tweet_word_count=lambda frame: frame["OriginalTweet"].map(lambda tweet: len(tweet.split(' '))),
)
df.head()

In [None]:
# Two side-by-side panels: characters per tweet (left), words per tweet (right).
fig = make_subplots(rows=1, cols=2, 
                    subplot_titles=("Distribution of <b>tweets length<b>", "Distribution of the <b>number of words<b>"))

# Figure-factory distplots are only used as a source of traces:
# trace 0 is re-wrapped as a histogram and trace 1 as a line below.
displot1 = ff.create_distplot(
    [df['tweet_len']], 
    ['tweet_len'], 
    bin_size=10, 
    show_rug=False,
    colors=['#7FA6EE']
)

displot2 = ff.create_distplot(
    [df['tweet_word_count']], 
    ['tweet_word_count'], 
    bin_size=10, 
    show_rug=False,
    colors=['#EFD07F']
)

# tweet_len distribution plot
fig.add_trace(
    go.Histogram(displot1['data'][0]), 
    row=1, col=1
)

fig.add_trace(
    go.Scatter(displot1['data'][1], line=dict(color='blue', width=0.5)), 
    row=1, col=1
)

# tweet_word_count distribution plot
fig.add_trace(
    go.Histogram(displot2['data'][0]), 
    row=1, col=2
)

fig.add_trace(
    go.Scatter(displot2['data'][1], line=dict(color='orange', width=0.5)), 
    row=1, col=2
)

# Axis titles per panel; the legend is hidden.
fig.update_xaxes(title_text = "Length", row=1, col=1)
fig.update_xaxes(title_text = "Number", row=1, col=2)
fig.update_layout(showlegend=False)

fig.show()

[Back to table of contents](#table_contents)

<a id='data_preprocessing'></a>
<h1 style="text-align:center; background-color:#4DBDE3; color:white; border-radius: 50px 15px">
   Data Preprocessing
</h1>

<a id='pretreatment'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   Pretreatment / Corpus cleaning
</h2>

In [None]:
def pretreatment(text, lang='english', keepStopWords=set(), stem=True, lemma=False, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Clean one raw text and return it as a space-separated string of tokens.

    Steps, in order: lowercase, drop text in square brackets, drop links,
    replace punctuation / newlines / tabs by spaces, drop digits, drop every
    non-ASCII character (accents are stripped first), optional lemmatization,
    stop-word removal, optional stemming.

    Parameters
    ----------
    text : str
        Raw text to clean.
    lang : str
        Language name passed to the NLTK stop-word list and SnowballStemmer.
    keepStopWords : iterable of str
        Stop words that must NOT be removed (e.g. negations).
    stem : bool
        If True, every remaining token is stemmed with SnowballStemmer.
    lemma : bool
        If True, tokens are lemmatized and only those whose part of speech is
        in `allowed_postags` are kept. This branch uses the module-level
        spaCy pipeline `nlp`, which must have been loaded before the call.
    allowed_postags : sequence of str
        Part-of-speech tags kept by the lemmatization branch.

    Returns
    -------
    str
        The cleaned text.

    Notes
    -----
    The mutable defaults are kept for interface compatibility; they are only
    read, never modified.
    """
    # Make text lowercase
    text = text.lower()
    # Remove text in square brackets (raw strings: '\[' and '\S' are not valid str escapes)
    text = re.sub(r'\[.*?\]', '', text)
    # Remove links
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Replace punctuation and line breaks / tabs by spaces
    punc = string.punctuation + '\n\r\t'  # !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
    text = text.translate(str.maketrans(punc, ' ' * len(punc)))
    # Remove numbers
    text = re.sub(r'[0-9]+', '', text)
    # Removal of accents and non-standard characters
    text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")

    # set() around keepStopWords lets callers pass any iterable, not only a set
    stopwords_list = set(stopwords.words(lang)) - set(keepStopWords)

    # Lemmatization
    if lemma:
        tokens = word_tokenize(text)
        doc = nlp(" ".join(tokens))
        text = ' '.join([token.lemma_ for token in doc if token.pos_ in allowed_postags])

    # Stemming
    if stem:
        stemmer = SnowballStemmer(lang)
        # Stop words are removed BEFORE stemming: the stop-word list holds full
        # words, so once "because" / "very" / "only" have been stemmed to
        # "becaus" / "veri" / "onli" they no longer match it and would be kept.
        tokens = [word for word in word_tokenize(text) if word not in stopwords_list]
        text = ' '.join([stemmer.stem(item) for item in tokens])

    # Remove stop words (also catches stems that collapse onto a stop word)
    words = word_tokenize(text)
    text = ' '.join([word for word in words if word not in stopwords_list])

    return text

### Lemmatizer
* **[spacy.load()](https://spacy.io/api/top-level#spacy.load)**
> Load a pipeline using the name of an installed package, a string path or a Path-like object.

**Useful links :** 
* https://spacy.io/usage/processing-pipelines
* https://spacy.io/api/token#attributes

In [None]:
import spacy
!python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner']) # for lemmatization

### Test function

In [None]:
# Hand-written sample covering the cases handled by pretreatment():
# leading whitespace, upper case, square brackets, a link, punctuation,
# digits, accented and non-Latin characters.
text = """   \tHi I will test my CLEANING FUNCTION.\n\n
[I want to thank you for taking your time to read this notebook] \n
* Here is the link to this notebook : https://www.kaggle.com/pascalzg/coronavirus-tweets-nlp !!
What do you think of this notebook? ?? =)\n
It's 7:09 pm, I have to take a break to go eat.\n
Here are some French words with accents: J'ai mangé une baguette et c'était très bon !! miaaam
α β γ δ ε  ζη θ
I ate cheese.
Is what I write amazing, incredible, unmissable ? I hope so because that is the end of this text HAHA.
"""

# Show the sample before and after cleaning (stemming on, lemmatization off).
print(f"{print_color.BOLD}{print_color.UNDERLINE}Original text :{print_color.END}\n\n{text}\n")
print(f"{print_color.BOLD}{print_color.UNDERLINE}Cleaned text :{print_color.END}\n\n{pretreatment(text, stem=True, lemma=False)}")

**It seems that our function works !**

### Cleaning tweets

In [None]:
# Stop words that are deliberately kept by the cleaning function.
keep_words = {'not', 'could', 'would'} # Can be useful in Sentiment Analysis

# Clean every tweet of both frames with the same options (stemming, no lemmatization);
# the extra keyword arguments are forwarded by Series.apply to pretreatment().
df['tweet_clean'] = df['OriginalTweet'].apply(pretreatment, keepStopWords=keep_words, stem=True, lemma=False)
test['tweet_clean'] = test['OriginalTweet'].apply(pretreatment, keepStopWords=keep_words, stem=True, lemma=False)

# Compare raw and cleaned text on the first rows.
df[["OriginalTweet", "tweet_clean"]].head()

<a id='target_process'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   Target Processing
</h2>

To make the prediction easier, we will reduce the number of classes for our target by renaming some classes as :
* Extremely Positive -> Positive
* Extremely Negative -> Negative

In [None]:
# Mapping (per column) that folds the two "Extremely ..." labels into their base label.
replace_map = {
    "Sentiment": {
        "Extremely Positive": "Positive",
        "Extremely Negative": "Negative",
    }
}

# Apply the Sentiment mapping to both frames; other labels are left untouched.
sentiment_merge = replace_map["Sentiment"]
df["target"] = df["Sentiment"].replace(sentiment_merge)
test["target"] = test["Sentiment"].replace(sentiment_merge)

# Label counts before and after the merge.
print(f"{df['Sentiment'].value_counts()}\n\n{df['target'].value_counts()}")

In [None]:
# Label counts before (5 classes) and after (3 classes) merging the extreme labels.
sentiment_counts = df["Sentiment"].value_counts()
target_counts = df["target"].value_counts()

fig = make_subplots(
        rows=1, cols=2,
        specs=[[{"type": "bar"}, {"type": "bar"}]],
        subplot_titles=("Sentiment distribution", "Target distribution")
    )

# NOTE: marker colours are positional - bar i gets colour i in value_counts() order,
# so the colour/label pairing depends on the ranking of the classes.
fig.add_trace(
     go.Bar(x=sentiment_counts.index, 
            y=sentiment_counts.values,
            marker_color=['#37E984','#F89898','#F4F77F','#4DC215','#E73F3F'],
            showlegend=False
    ),
     row=1, col=1
)

fig.add_trace(
     go.Bar(x=target_counts.index, 
            y=target_counts.values,
            marker_color=['#37E984','#F89898','#F4F77F'],
            showlegend=False
    ),
     row=1, col=2
)

fig.show()

* The target column is still **not balanced**.
* Now, let's **encode** our target column.

In [None]:
# Integer code of each of the three merged classes.
target_map = {"Negative" : 0, "Neutral" : 1, "Positive" : 2}

# Series.map yields NaN for any label that is not a key of target_map.
df["encoded_target"] = df['target'].map(target_map)
test["encoded_target"] = test['target'].map(target_map)

df[["target", "encoded_target"]].head()

In [None]:
# Features are the cleaned tweets, labels are the integer-encoded targets.
X_train = df["tweet_clean"]
y_train = df["encoded_target"]
X_test = test["tweet_clean"]
y_test = test["encoded_target"]

[Back to table of contents](#table_contents)

<a id='tokens_vis'></a>
<h1 style="text-align:center; background-color:#4DBDE3; color:white; border-radius: 50px 15px">
   Tokens visualization
</h1>

<a id='top_words'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   Top Words 
</h2>

In [None]:
def get_top_grams(corpus, ngram_range=(1,1), nbwords=10):
    """Return the `nbwords` most frequent n-grams of a corpus.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count n-grams in.
    ngram_range : tuple (min_n, max_n)
        Passed as-is to CountVectorizer.
    nbwords : int
        Number of n-grams to return.

    Returns
    -------
    names : numpy.ndarray of str
        The n-grams, most frequent first.
    counts_sorted : numpy.ndarray of int
        Their total number of occurrences in the corpus, in the same order.
    """
    vect = CountVectorizer(ngram_range=ngram_range)
    # Sum the columns directly on the sparse matrix: .toarray() would allocate a
    # dense (n_documents x n_ngrams) array, which is huge for bi- and trigrams.
    counts = np.asarray(vect.fit_transform(corpus).sum(axis=0)).ravel()

    # Sort the index of the nbwords most frequent words (note : argsort() is an ascending sort)
    argsort_descending = counts.argsort()[::-1][:nbwords]

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
    # versions that do not provide get_feature_names_out() yet.
    if hasattr(vect, "get_feature_names_out"):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()

    names = np.asarray(feature_names, dtype=str)[argsort_descending]
    counts_sorted = counts[argsort_descending]
    return names, counts_sorted

def plot_bar(names, counts_sorted, color_groups, title):
    """Show a horizontal bar chart of n-gram counts, highest bar on top.

    `color_groups` holds three integers: how many consecutive bars receive the
    dark, medium and light shade of blue respectively.
    """
    n_dark, n_medium, n_light = color_groups[0], color_groups[1], color_groups[2]
    bar_colors = ['#097394'] * n_dark + ['#52B1CE'] * n_medium + ['#A8E0F2'] * n_light

    bars = go.Bar(
        x=counts_sorted,
        y=names,
        orientation='h',
        marker_color=bar_colors,
    )
    fig = go.Figure(bars)

    fig.update_traces(marker_line_color='#077092', opacity=0.8)
    # First entry of `names` is drawn at the top instead of the bottom.
    fig.layout.yaxis.autorange = "reversed"

    fig.update_layout(title=title)
    fig.show()

### Unigrams

In [None]:
# 15 most frequent single stems; colour groups: 1 dark bar, 4 medium, the rest light.
nb_words = 15
names, counts = get_top_grams(df["tweet_clean"], ngram_range=(1,1), nbwords=nb_words)
plot_bar(names, counts, (1,4,nb_words), f"Top {nb_words} <b>Unigram<b>")

### Bigrams

In [None]:
# 15 most frequent pairs of consecutive stems; colour groups: 1 dark bar, 5 medium, the rest light.
nb_words = 15
names, counts = get_top_grams(df["tweet_clean"], ngram_range=(2,2), nbwords=nb_words)
plot_bar(names, counts, (1,5,nb_words), f"Top {nb_words} <b>Bigrams<b>")

### Trigrams

In [None]:
# 15 most frequent triples of consecutive stems; colour groups: 2 dark bars, 4 medium, the rest light.
nb_words = 15
names, counts = get_top_grams(df["tweet_clean"], ngram_range=(3,3), nbwords=nb_words)
plot_bar(names, counts, (2,4,nb_words), f"Top {nb_words} <b>Trigrams<b>")

<a id='wordcloud'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   WordCloud 
</h2>

In [None]:
# https://amueller.github.io/word_cloud/generated/wordcloud.WordCloud.html

# Image whose shape constrains where the words may be drawn.
mask = np.array(Image.open('../input/wordcloudmasks/mask.jpg'))

# One long string containing every cleaned tweet.
corpus_text = ' '.join(df['tweet_clean'])

# generate() returns the WordCloud itself, so construction and fitting can be chained.
wordcloud = WordCloud(
    background_color='white',
    max_words=100,
    mask=mask,
).generate(corpus_text)

fig, ax = plt.subplots(figsize=(20,10))
ax.set_title("Top words", fontdict={'size': 20})
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

[Back to table of contents](#table_contents)

<a id='clustering'></a>
<h1 style="text-align:center; background-color:#4DBDE3; color:white; border-radius: 50px 15px">
   Clustering
</h1>

In [None]:
# TF-IDF Vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2)) # Unigrams + Bigrams
vectors = vectorizer.fit_transform(df['tweet_clean'])

# Size and sparsity of the document-term matrix.
print("Number of texts :",vectors.shape[0],"\nNumber of words (Unigrams + Bigrams) :", vectors.shape[1])
print("Number of non-zero entries :",vectors.nnz)
print("Sparsity measurement :",int(vectors.nnz/float(vectors.shape[0])),"active words per text out of", vectors.shape[1],"!")

# To find the words
# The vocabulary is fetched ONCE (it was rebuilt for every sampled index before), and
# through get_feature_names_out() when available: get_feature_names() was removed in
# scikit-learn 1.2.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Five random (column index, n-gram) pairs; the draw is not seeded.
print('\n',[(i,feature_names[i]) for i in np.random.randint(vectors.shape[1], size=5)])

<a id='kmeans'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   KMeans
</h2>

In [None]:
n_clusters = 3
n_words = 6
seed = 0

# fit() returns the estimator, so it can be chained on the constructor.
kmeans = KMeans(n_clusters=n_clusters, random_state=seed, max_iter=10).fit(vectors)

# Find the cluster number for each tweet:
# cluster id -> row positions of the tweets assigned to it
labels = {k: np.flatnonzero(kmeans.labels_ == k).tolist() for k in range(n_clusters)}

# Join all texts of the same cluster
text_clusters = {k: ' '.join(df['tweet_clean'].iloc[labels[k]]) for k in range(n_clusters)}

# Most common words for each cluster
for k, text in text_clusters.items():
    print(f"{n_words} most common words for cluster {k}:  {Counter(text.split()).most_common(n_words)}")
print()

# WordClouds: 2x2 grid, one panel per cluster, the unused fourth panel is removed
fig, axs = plt.subplots(2, 2, figsize=(15,8))
fig.suptitle("WordClouds", fontweight='bold', fontsize=18)
for i, ax in zip(range(n_clusters), axs.flat):
    wordcloud = WordCloud(background_color='white', max_words=100).generate(text_clusters[i])
    ax.set_title(f"Cluster {i}", fontdict = {'fontsize':15, 'fontweight':'bold'})
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
fig.delaxes(axs[1][1])
plt.show()

<a id='lsa'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   Latent semantic analysis (LSA)
</h2>

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

n_clusters = 3
n_words = 6
seed = 0

# Project the TF-IDF vectors on n_clusters latent components.
svd = TruncatedSVD(n_components=n_clusters, n_iter=10, random_state=seed)
reduc_svd = svd.fit_transform(vectors)

# Each tweet is assigned to the component on which its projection is the largest.
doc_labels = np.argmax(reduc_svd, axis=1)

# Find the cluster number for each tweet.
# The per-tweet assignment and this dict must have different names: when both were
# called `labels`, the dict overwrote the argmax result and the loop iterated over
# the dict keys (0, 1, 2), putting exactly one tweet in each cluster.
labels = {k:[] for k in range (n_clusters)}
for i, label in enumerate(doc_labels):
    labels[label].append(i)

# Join all texts of the same cluster
text_clusters = {k:"" for k in range (n_clusters)}
for k in range(n_clusters):
    text_clusters[k] = ' '.join(df['tweet_clean'].iloc[labels[k]])
    
# Most common words for each cluster
for k, text in text_clusters.items():
    print(f"{n_words} most common words for cluster {k}:  {Counter(text.split()).most_common(n_words)}")
print()   

# WordClouds
fig, axs = plt.subplots(2, 2, figsize=(15,8))
fig.suptitle("WordClouds", fontweight='bold', fontsize=18)
for i in range(n_clusters):
    ax = axs[i//2][i%2]
    ax.set_title(f"Cluster {i}", fontdict = {'fontsize':15, 'fontweight':'bold'})
    # A component may end up with no tweet at all; WordCloud cannot be generated
    # from a text without words, so that panel is simply left empty.
    if text_clusters[i].strip():
        wordcloud = WordCloud(background_color='white', max_words=100).generate(text_clusters[i])
        ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
fig.delaxes(axs[1][1])
plt.show()

<a id='lda'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   Latent Dirichlet Allocation (LDA)
</h2>

### **[Latent Dirichlet Allocation : sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html)**

In [None]:
n_clusters = 3
n_words = 6
seed = 0

# Document-topic distribution of every tweet (one column per topic).
lda = LatentDirichletAllocation(n_components=n_clusters, max_iter=10, random_state=seed)
reduc_lda = lda.fit_transform(vectors)

# Each tweet is assigned to its most probable topic.
doc_labels = np.argmax(reduc_lda, axis=1)

# Find the cluster number for each tweet.
# The per-tweet assignment and this dict must have different names: when both were
# called `labels`, the dict overwrote the argmax result and the loop iterated over
# the dict keys (0, 1, 2), putting exactly one tweet in each cluster.
labels = {k:[] for k in range (n_clusters)}
for i, label in enumerate(doc_labels):
    labels[label].append(i)

# Join all texts of the same cluster
text_clusters = {k:"" for k in range (n_clusters)}
for k in range(n_clusters):
    text_clusters[k] = ' '.join(df['tweet_clean'].iloc[labels[k]])
    
# Most common words for each cluster
for k, text in text_clusters.items():
    print(f"{n_words} most common words for cluster {k}:  {Counter(text.split()).most_common(n_words)}")
print()   

# WordClouds
fig, axs = plt.subplots(2, 2, figsize=(15,8))
fig.suptitle("WordClouds", fontweight='bold', fontsize=18)
for i in range(n_clusters):
    ax = axs[i//2][i%2]
    ax.set_title(f"Cluster {i}", fontdict = {'fontsize':15, 'fontweight':'bold'})
    # A topic may end up with no tweet at all; WordCloud cannot be generated
    # from a text without words, so that panel is simply left empty.
    if text_clusters[i].strip():
        wordcloud = WordCloud(background_color='white', max_words=100).generate(text_clusters[i])
        ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
fig.delaxes(axs[1][1])
plt.show()

### **[Latent Dirichlet Allocation - Gensim](https://radimrehurek.com/gensim/models/ldamodel.html)**

> Step-by-step explanation : https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

* **gensim.utils.simple_preprocess(doc, deacc=False, min_len=2, max_len=15)**
> Convert a document into a **list of tokens**.<br/>
> This lowercases, tokenizes, de-accents (optional).

* **gensim.corpora.Dictionary().doc2bow(document, allow_update=False, return_missing=False)**
> Convert document into the **bag-of-words (BoW)** format = list of (token_id, token_count) tuples.

In [None]:
def tokenization(texts):
    """Lazily turn every text into a list of tokens with gensim's simple_preprocess (default options)."""
    yield from (simple_preprocess(str(document)) for document in texts)

# List of tokenized tweets
data_words = list(tokenization(df['tweet_clean']))
print("Tokenization of the first tweet :",data_words[0])

# Create Dictionary (token <-> integer id)
id2word = corpora.Dictionary(data_words)

# Create Corpus
texts = data_words

# Term Document Frequency: every tweet as a list of (token_id, count) pairs
corpus = list(map(id2word.doc2bow, texts))
print("and his Term Document Frequency :",corpus[0])

# Human readable format of corpus (term-frequency), first tweet only
[[(id2word[token_id], freq) for token_id, freq in bow] for bow in corpus[:1]]

* **print_topics(num_topics=20, num_words=10)**
> Get the most significant topics (alias for show_topics() method)

In [None]:
seed = 0
# Build LDA model (3 topics, 10 passes over the corpus in chunks of 100 documents)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=3, 
                                           random_state=seed,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# Print the most significant topics.
# pprint() is given the list of topics itself: pretty-printing an f-string only
# displays the repr of one long string (quotes and escaped "\n" included).
print("The most significant topics :")
pprint(lda_model.print_topics())
# Topic assignment of every document of the corpus.
doc_lda = lda_model[corpus]

In [None]:
# WordClouds
# One (topic_id, [(word, weight), ...]) entry per topic.
topics = lda_model.show_topics(formatted=False)

fig, axs = plt.subplots(2, 2, figsize=(15,8))
fig.suptitle("WordClouds", fontweight='bold', fontsize=18)
# NOTE: n_clusters comes from an earlier cell; it has to match the num_topics
# the gensim model was built with.
for i in range(n_clusters):
    # The (word, weight) pairs of the topic are used as word frequencies.
    wordcloud = WordCloud(background_color='white', max_words=100).generate_from_frequencies(dict(topics[i][1]))
    axs[i//2][i%2].set_title(f"Cluster {i}", fontdict = {'fontsize':15, 'fontweight':'bold'})
    axs[i//2][i%2].imshow(wordcloud, interpolation='bilinear')
    axs[i//2][i%2].axis("off")     
# Remove the unused fourth panel of the 2x2 grid.
fig.delaxes(axs[1][1])
plt.show()

* **log_perplexity(chunk, total_docs=None)**
> Calculate and return per-word likelihood bound, using a chunk of documents as evaluation corpus.

* **[CoherenceModel()](https://radimrehurek.com/gensim/models/coherencemodel.html).get_coherence()**
> Get coherence value based on pipeline parameters.

In [None]:
# Compute Perplexity
# log_perplexity() does not return the perplexity but the per-word likelihood
# bound (higher is better); the perplexity is 2 ** (-bound) (lower is better).
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', np.exp2(-per_word_bound))

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

* **[pyLDAvis.gensim_models](https://github.com/bmabey/pyLDAvis/blob/master/pyLDAvis/gensim_models.py).prepare(topic_model, corpus, dictionary, doc_topic_dist=None, **kwargs)**
> Transforms the Gensim TopicModel and related corpus and dictionary into the data structures needed for the visualization.

In [None]:
# Visualize the topics
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word) # sort_topics=False, to not reorder by size
# vis

[Back to table of contents](#table_contents)

<a id='vectorization'></a>
<h1 style="text-align:center; background-color:#4DBDE3; color:white; border-radius: 50px 15px">
   Vectorization
</h1>

There are different ways to vectorize a text, for example, by frequency or by prediction :

* **by frequency :**
    * **CountVectorizer (scikit-learn) :**
        * counts the number of times the words appear
        * favors the most frequent words
        * tends to overlook rare words that could have been informative<br/><br/>
    * **TfidfVectorizer (scikit-learn) :**
        * considers the global weighting of words
        * penalizes the most frequent words
        * does not understand the semantic meaning of words <br/><br/>
* **by prediction :**
    * Solves the problem of the **semantic meaning** of words
    * Uses **embeddings** to vectorize texts before classification<br/>
    * **Continuous Bag of Words (CBOW) :**
        * predicts a **word given its context**<br/>
    * **Skip-Gram (SG) :**
        * predicts a **context given a word**



<a id='countvect'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   Count Vectorizer
</h2>

**[Count vectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)** counts the number of times the words appear.

In [None]:
%%time

# Pipelines: raw n-gram counts followed by a linear SVM, one pipeline per n-gram range
pip_uni = Pipeline([ ('countUni', CountVectorizer(ngram_range=(1, 1))), ('clf', LinearSVC()) ])
pip_bi = Pipeline([ ('countBi', CountVectorizer(ngram_range=(2, 2))), ('clf', LinearSVC()) ])
pip_tri = Pipeline([ ('countTri', CountVectorizer(ngram_range=(3, 3))), ('clf', LinearSVC()) ])
pip = Pipeline([ ('countUniBi', CountVectorizer(ngram_range=(1, 2))), ('clf', LinearSVC()) ])

# Lists (names[i] is the label of pipelines[i])
names = ["Unigrams","Bigrams","Trigrams","Uni+Bigrams"]
pipelines = [pip_uni, pip_bi, pip_tri, pip]
res = []

# Results: mean cross-validation score of every pipeline on the training tweets
for i, p in enumerate(pipelines) :
    val_score = cross_val_score(p, df['tweet_clean'], df["encoded_target"]).mean()
    res.append({'n_grams':names[i],'Score':val_score})

# Best configuration first, scores shaded in blue.
results = pd.DataFrame(res).sort_values(by='Score', ascending=False)
results.style.background_gradient("Blues")

We can see that we get the best score by taking into account **Unigrams + Bigrams**.<br/>
Taking just the unigrams is also good.

<a id='tfidfvect'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   TF-IDF Vectorizer
</h2>

**[Tf-idf vectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)** considers the global weighting of words.

In [None]:
%%time

# Pipelines: TF-IDF weighted n-grams followed by a linear SVM, one pipeline per n-gram range.
# The trigram pipeline is bound to pip_tri: it used to be assigned to pip_bi a second
# time, so the "Bigrams" row was scored with trigrams and the "Trigrams" row with the
# CountVectorizer pipeline left over from the previous section.
pip_uni = Pipeline([ ('tfidfUni', TfidfVectorizer(ngram_range=(1, 1))), ('clf', LinearSVC()) ])
pip_bi = Pipeline([ ('tfidfBi', TfidfVectorizer(ngram_range=(2, 2))), ('clf', LinearSVC()) ])
pip_tri = Pipeline([ ('tfidfTri', TfidfVectorizer(ngram_range=(3, 3))), ('clf', LinearSVC()) ])
pip = Pipeline([ ('tfidfUniBi', TfidfVectorizer(ngram_range=(1, 2))), ('clf', LinearSVC()) ])

# Lists (names[i] is the label of pipelines[i])
names = ["Unigrams","Bigrams","Trigrams","Uni+Bigrams"]
pipelines = [pip_uni, pip_bi, pip_tri, pip]
res = []

# Results: mean cross-validation score of every pipeline on the training tweets

for i, p in enumerate(pipelines) :
    val_score = cross_val_score(p, df['tweet_clean'], df["encoded_target"]).mean()
    res.append({'n_grams':names[i],'Score':val_score})

# Best configuration first, scores shaded in blue.
results = pd.DataFrame(res).sort_values(by='Score', ascending=False)
results.style.background_gradient("Blues")

As before, **unigrams** and **unigrams+bigrams** are the most efficient.

<a id='cbow'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   Continuous Bag of Words (CBOW)
</h2>

> **Word2Vec** is composed of two distinct language models (CBOW and SG), optimized to quickly learn word vectors.<br/>
> **[Gensim](https://radimrehurek.com/gensim/models/word2vec.html)** has one of the fastest Word2Vec implementations.

**CBOW** predicts a word given its context.

In [None]:
# Every cleaned tweet as a list of tokens (whitespace split).
texts = [text.split() for text in df["tweet_clean"]]

# the following configuration is the default configuration
# NOTE: no seed is passed, so the learnt vectors can differ from one run to the next.
w2v_cbow = gensim.models.word2vec.Word2Vec(sentences=texts,
                                        vector_size=100, window=5,     # 100-dimensional vectors, context of 5 words on each side
                                        min_count=5,                   # words seen fewer than 5 times are ignored
                                        sample=0.001, workers=3,
                                        sg=0, hs=0, negative=5,        # sg=0 trains a CBOW model (sg=1 would train skip-gram)
                                        cbow_mean=1,
                                        epochs=5)

# print(w2v_cbow.wv.key_to_index) # get vocabulary

### Most similar words

* **most_similar(positive=None, negative=None, topn=10, ...)**
> Find the top-N most similar keys. Positive keys contribute positively towards the similarity, negative keys negatively.<br/>
> This method computes **cosine similarity** between a simple mean of the projection weight vectors of the given keys and the vectors for each key in the model.<br/>
The most famous example is: `vec(king) - vec(man) + vec(woman) => vec(queen)`.

In [None]:
# Query words are written in their stemmed form ('famili'), because the model
# was trained on the stemmed tweets.
words = ['covid', 'market', 'famili']
n = 5
# Nearest neighbours of every query word in the CBOW embedding space.
for word in words :
    print(f"Top {n} similar words with {word} :\n{w2v_cbow.wv.most_similar(word, topn=n)}\n\n")

In [None]:
# Vector arithmetic covid + futur - increas (stemmed words); `n` is defined in the previous cell.
print(w2v_cbow.wv.most_similar(positive=['covid','futur'], negative=['increas'], topn=n))

### Predict output word

* **predict_output_word(context_words_list, topn=10)**
> Get the probability distribution of the center word given context words.<br/>
> **Note :** this performs a CBOW-style propagation, even in SG models, and doesn’t quite weight the surrounding words the same as in training – so it’s just one crude way of using a trained model as a predictor.

In [None]:
# Most probable centre words for the one-word context ['impact']; `n` is defined in an earlier cell.
print(w2v_cbow.predict_output_word(['impact'],topn=n))

<a id='sg'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   Skip-Gram (SG)
</h2>

**SG** predicts a context given a word.

In [None]:
# Every cleaned tweet as a list of tokens (whitespace split).
texts = [text.split() for text in df["tweet_clean"]]

# Same hyper-parameters as the CBOW model, except sg=1.
# NOTE: no seed is passed, so the learnt vectors can differ from one run to the next.
w2v_sg = gensim.models.word2vec.Word2Vec(sentences=texts,
                                vector_size=100, window=5,               # 100-dimensional vectors, context of 5 words on each side
                                min_count=5,                             # words seen fewer than 5 times are ignored
                                sample=0.001, workers=3,
                                sg=1, hs=0, negative=5,        # sg=1 trains a skip-gram model
                                cbow_mean=1,
                                epochs=5)

# print(w2v_sg.wv.key_to_index) # get vocabulary

### Most similar words

* **most_similar(positive=None, negative=None, topn=10, ...)**
> Find the top-N most similar keys. Positive keys contribute positively towards the similarity, negative keys negatively.<br/>
> This method computes **cosine similarity** between a simple mean of the projection weight vectors of the given keys and the vectors for each key in the model.<br/>
The most famous example is: `vec(king) - vec(man) + vec(woman) => vec(queen)`

In [None]:
# Query words are written in their stemmed form ('famili'), because the model
# was trained on the stemmed tweets.
words = ['covid', 'market', 'famili']
n = 5
# Nearest neighbours of every query word in the skip-gram embedding space.
for word in words :
    print(f"Top {n} similar words with {word} :\n{w2v_sg.wv.most_similar(word, topn=n)}\n\n")

In [None]:
# Vector arithmetic covid + futur - increas (stemmed words); `n` is defined in the previous cell.
print(w2v_sg.wv.most_similar(positive=['covid','futur'], negative=['increas'], topn=n))

### Predict output word

* **predict_output_word(context_words_list, topn=10)**
> Get the probability distribution of the center word given context words.<br/>
> **Note :** this performs a CBOW-style propagation, even in SG models, and doesn’t quite weight the surrounding words the same as in training – so it’s just one crude way of using a trained model as a predictor.

In [None]:
# Most probable centre words for the one-word context ['impact']; `n` is defined in an earlier cell.
print(w2v_sg.predict_output_word(['impact'],topn=n))

<a id='learn_embeddings'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   Testing the learned embeddings
</h2>

Is **great** really closer to **good** than to **bad** ?

In [None]:
# Similarity of "great" with "good" and with "bad", for both models.
# ("Modèle" in the printed labels is French for "model".)
print("Modèle CBOW, great and good:",w2v_cbow.wv.similarity("great","good"))
print("Modèle CBOW, great and bad:",w2v_cbow.wv.similarity("great","bad"))
print("Modèle SG, great and good:",w2v_sg.wv.similarity("great","good"))
print("Modèle SG, great and bad:",w2v_sg.wv.similarity("great","bad"))

### Sentiment classification

Since we only have **word vectors** and **sentences are made of multiple words**, we need to **aggregate** them.

In [None]:
def vectorize(text, model, mean=False):
    """Turn one tweet into a single vector by aggregating its word vectors.

    The tweet is split on whitespace; every token found in
    ``model.wv.key_to_index`` contributes its vector, unknown tokens are
    ignored.

    Parameters
    ----------
    text : str
        The tweet to vectorize.
    model : object
        Embedding model exposing ``vector_size`` and ``wv`` (with
        ``key_to_index`` and ``get_vector``).
    mean : bool, default False
        If True, return the average of the known word vectors instead of
        their sum.

    Returns
    -------
    numpy.ndarray
        Array of length ``model.vector_size``; all zeros when no token of
        the tweet is in the vocabulary.
    """
    tokens = text.split()
    total = np.zeros(model.vector_size)
    n_known = 0

    for token in tokens:
        if token not in model.wv.key_to_index:
            continue  # out-of-vocabulary token: contributes nothing
        total += model.wv.get_vector(token)
        n_known += 1

    # Comparison kept as an equality test so the accepted flag values are unchanged;
    # the n_known guard avoids dividing by zero when no token was recognised.
    want_mean = (mean == True)
    if want_mean and n_known != 0:
        return total / n_known
    return total

def evaluation(models, models_name, X_train, X_test, y_train, y_test):
    """Score each embedding model with a LinearSVC, for both aggregation modes.

    For every model, the tweets are vectorized with ``vectorize`` twice
    (summed word vectors, then averaged word vectors); a LinearSVC is fitted
    on the training vectors and its accuracy is measured on the test vectors.

    Parameters
    ----------
    models : sequence
        Embedding models passed to ``vectorize``.
    models_name : sequence of str
        Display name of each model, aligned with ``models``.
    X_train, X_test : iterable of str
        Training / test tweets.
    y_train, y_test : array-like
        Training / test labels.

    Returns
    -------
    list of dict
        One ``{'Model', 'Aggregation', 'Score'}`` record per
        (model, aggregation) pair, 'sum' first then 'mean' for each model.
    """
    classifier = LinearSVC(dual=False)
    aggregations = (('sum', False), ('mean', True))
    scores = []

    for idx, w2v in enumerate(models):
        for agg_label, use_mean in aggregations:
            train_vectors = [vectorize(tweet, w2v, use_mean) for tweet in X_train]
            test_vectors = [vectorize(tweet, w2v, use_mean) for tweet in X_test]

            # The same estimator object is refitted for every configuration
            classifier.fit(train_vectors, y_train)
            predictions = classifier.predict(test_vectors)

            scores.append({
                'Model': models_name[idx],
                'Aggregation': agg_label,
                'Score': accuracy_score(y_test, predictions),
            })

    return scores

# Quick look at the vector produced for the first training tweet (summed word vectors).
print(vectorize(text=X_train.iloc[0], model=w2v_sg))

In [None]:
%%time
# Embedding models to compare and their display names (kept in the same order)
names = ["w2v_cbow", "w2v_sg"]
models = [w2v_cbow, w2v_sg]

res = evaluation(models, names, X_train, X_test, y_train, y_test)

# Best configuration first, shaded by score
(
    pd.DataFrame(res)
    .sort_values(by='Score', ascending=False)
    .style.background_gradient("Blues")
)

[Back to table of contents](#table_contents)

<a id='modeling'></a>
<h1 style="text-align:center; background-color:#4DBDE3; color:white; border-radius: 50px 15px">
   Modeling
</h1>

<a id='class_rebalancing'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   Class rebalancing
</h2>

> **[Distribution of classes](#target_process) : { 0 : 15398 , 1 : 7713 , 2 : 18046 }**

Since our **classes are not balanced**, we have to try to remedy this so that the models can learn properly.<br/>

___

To deal with imbalanced datasets, we can use **undersampling** or **oversampling** techniques, such as:

* **[Undersampling](https://imbalanced-learn.org/stable/references/under_sampling.html) :** *RandomUnderSampler, NearMiss, EditedNearestNeighbours, AllKNN, TomekLinks, ...*
* **[Oversampling](https://imbalanced-learn.org/stable/references/over_sampling.html) :** *RandomOverSampler, SMOTE, ADASYN, SMOTENC, BorderlineSMOTE, ...*
* **[Combine under and over sampling](https://imbalanced-learn.org/stable/references/combine.html) :** *SMOTEENN, SMOTETomek*
> *Useful article : https://medium.com/analytics-vidhya/re-sampling-imbalanced-training-corpus-for-sentiment-analysis-c9dc97f9eae1*

* For **data augmentation in NLP**, there are also : *Synonym Replacement, Random Deletion, Random Swap, Random Insertion, ...*
> *You can take a look at this notebook : https://www.kaggle.com/swarajshinde/eda-data-augmentation-techniques-for-text-nlp*

---

So, we will try to see which techniques will work best for our case with LinearSVC.

In [None]:
%%time

# Evaluation function
def eval_model(clf, names, list_xtrain, list_ytrain, x_test, y_test) :
    """Fit `clf` on each training set and rank the training sets by micro F1-score.

    Parameters
    ----------
    clf : estimator
        Classifier exposing ``fit`` and ``predict``; it is refitted for every training set.
    names : sequence of str
        Display name of each training set.
    list_xtrain, list_ytrain : sequences
        Features / labels of each training set, aligned with ``names``.
    x_test, y_test :
        Common test features / labels used for scoring.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name`` and ``F1-score``, sorted by decreasing score. The index
        still holds each row's original position in ``names``.
    """
    scores = []
    for label, features, targets in zip(names, list_xtrain, list_ytrain):
        t0 = time.time()

        clf.fit(features, targets)
        predictions = clf.predict(x_test)
        micro_f1 = f1_score(y_test, predictions, average='micro')
        scores.append({'Name': label, 'F1-score': micro_f1})

        elapsed = time.time() - t0
        print(f"{print_color.BOLD}{label}{print_color.END} execution time : {print_color.DARKCYAN}{elapsed} seconds{print_color.END}")

    ranking = pd.DataFrame(scores)
    return ranking.sort_values(by='F1-score', ascending=False)

# TF-IDF Vectorizer (unigrams + bigrams); fitted on the training tweets only,
# the test tweets are projected onto the same vocabulary.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train, y_train)
X_test_tfidf = vectorizer.transform(X_test)

# Target class sizes are derived from y_train rather than hard-coded, so the cell
# keeps working if the training set changes:
#   * under-sampling brings every class down to the smallest class size,
#   * over-sampling brings every class up to the largest class size.
class_counts = pd.Series(y_train).value_counts()
minority_count = int(class_counts.min())
majority_count = int(class_counts.max())
under_strategy = {label: minority_count for label in class_counts.index}
over_strategy = {label: majority_count for label in class_counts.index}
seed = 142

# List of re-sampling techniques (names[k] is the display label of techniques[k])
names = ["RandUnderSampler", "NearMiss", "AllKNN", "RandOverSampler", "BorderSMOTE", "SMOTE", "ADASYN", "SMOTEENN", "SMOTETomek"]

techniques = [RandomUnderSampler(sampling_strategy=under_strategy, random_state=seed), NearMiss(version=3), AllKNN(), #Undersampling
             RandomOverSampler(sampling_strategy=over_strategy, random_state=seed), BorderlineSMOTE(random_state=seed), #Oversampling
             SMOTE(random_state=seed), ADASYN(sampling_strategy='minority', random_state=seed), #Oversampling
             SMOTEENN(random_state=seed), SMOTETomek(random_state=seed)] #Combined techniques

# Guard against the two lists drifting apart when a technique is added or removed
assert len(names) == len(techniques), "names and techniques must stay aligned"

# Re-sampling: one resampled copy of the TF-IDF training set per technique
list_xtrain = []
list_ytrain = []

for name, tech in zip(names, techniques):
    start = time.time()
    x, y = tech.fit_resample(X_train_tfidf, y_train)
    list_xtrain.append(x)
    list_ytrain.append(y)
    print(f"{print_color.BOLD}{name}{print_color.END} execution time : {print_color.GREEN}{time.time() - start} seconds{print_color.END}")

**Let's add some other datasets:**
* **Original** (not balanced data)
* **Undersampling :** Random undersampling of **majority class to middle class**
* **Under+SMOTE :** Random undersampling of **majority class to middle class** + Oversampling (*BorderlineSMOTE*) of **minority class to middle class**

In [None]:
# Random Undersampling: only the majority class is reduced, down to the size of the
# second-largest ("middle") class. Sizes are read from y_train instead of being hard-coded.
class_counts = pd.Series(y_train).value_counts()   # sorted from largest to smallest class
middle_count = int(class_counts.iloc[1])
sampling_strategy = {label: min(int(count), middle_count) for label, count in class_counts.items()}
undersample = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=seed)
X_under, y_under = undersample.fit_resample(X_train_tfidf, y_train)

# Borderline SMOTE (Oversampling) applied on top of the undersampled data.
# Seeded like the other samplers so the benchmark is reproducible.
sm = BorderlineSMOTE(random_state=seed)
X_under_sm, y_under_sm = sm.fit_resample(X_under, y_under)

# More data: extend the benchmark lists in place.
# The guard keeps the cell idempotent: re-running it does not append duplicates.
extra_names = ["Original", "Undersampling", "Under+SMOTE"]
if names[-len(extra_names):] != extra_names:
    names += extra_names
    list_xtrain += [X_train_tfidf, X_under, X_under_sm]
    list_ytrain += [y_train, y_under, y_under_sm]

#### **Results (F1-score)**

In [None]:
%%time
# Benchmark every (re-sampled) training set with the same seeded LinearSVC
linSVC = LinearSVC(random_state=seed)
res = eval_model(
    clf=linSVC,
    names=names,
    list_xtrain=list_xtrain,
    list_ytrain=list_ytrain,
    x_test=X_test_tfidf,
    y_test=y_test,
)
res.style.background_gradient("Greens")

In [None]:
# Best data: `res` is sorted by decreasing F1-score and its index still holds the
# original position of each training set, so the first index label points at the winner.
best_idx = res.index[0]
xtrain_best = list_xtrain[best_idx]
ytrain_best = list_ytrain[best_idx]
print(names[best_idx])

The data augmentation with **RandomOverSampler** looks good.

<a id='xgboost'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   XGBoost
</h2>

In [None]:
%%time

# Gradient-boosted trees trained on the best re-sampled TF-IDF training set
xgboost = xgb.XGBClassifier(num_class=3, learning_rate=0.1, max_depth=10,
                            use_label_encoder=False, eval_metric='mlogloss')

xgboost.fit(xtrain_best, ytrain_best)

# Predictions (computed once and reused for both the report and the confusion matrix)
y_pred = xgboost.predict(X_test_tfidf)

# Print Classification report
print(f"\nClassification Report :\n{classification_report(y_test, y_pred)}")

# Plot confusion matrix from the existing predictions.
# `plot_confusion_matrix` was removed from scikit-learn in 1.2 and ran a second
# prediction pass; the heatmap below mirrors the one used for the LSTM model.
ax = sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap="YlGnBu")
ax.set(title="XGBoost - confusion matrix", xlabel="Predicted label", ylabel="True label")
plt.show()

<a id='linsvc'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   Linear Support Vector Classification
</h2>

In [None]:
%%time

linSVC = LinearSVC(random_state=seed)

# Fit the classifier on the best re-sampled TF-IDF training set
linSVC.fit(xtrain_best, ytrain_best)

# Predictions (computed once and reused for both the report and the confusion matrix)
y_pred = linSVC.predict(X_test_tfidf)

# Print Classification report
print(f"\nClassification Report :\n{classification_report(y_test, y_pred)}")

# Plot confusion matrix from the existing predictions.
# `plot_confusion_matrix` was removed from scikit-learn in 1.2 and ran a second
# prediction pass; the heatmap below mirrors the one used for the LSTM model.
ax = sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap="YlGnBu")
ax.set(title="LinearSVC - confusion matrix", xlabel="Predicted label", ylabel="True label")
plt.show()

<a id='lstm'></a>
<h2 style="text-align:center; background-color:#3CD8A5; color:white; border-radius: 50px 15px">
   LSTM
</h2>

Thanks to **@AndresHG** for the following code.

* **[tf.keras.preprocessing.text.Tokenizer](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer)**
> Text tokenization utility class.

* **[tf.keras.preprocessing.sequence.pad_sequences](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences)**
> Pads sequences to the same length.

In [None]:
texts = df["tweet_clean"]
target = df["encoded_target"]

# Map every word to an integer id and encode each tweet as a sequence of ids
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(texts)
sequences = tokenizer_obj.texts_to_sequences(texts)

# No maxlen is given, so every sequence is right-padded with zeros
# up to the length of the longest sequence.
tweet_pad = pad_sequences(sequences,
                          truncating='post',
                          padding='post')

# +1 because index 0 is reserved for padding
vocab_length = len(tokenizer_obj.word_index) + 1

# The sequence length is taken from the padded matrix that is actually fed to the
# network. Measuring it with a different tokenizer (NLTK `word_tokenize`) could give
# a value that disagrees with `tweet_pad.shape[1]`.
sequence_lengths = [len(seq) for seq in sequences]
longest_train = texts.iloc[int(np.argmax(sequence_lengths))]
length_long_sentence = tweet_pad.shape[1]

In [None]:
# Split data into train and test sets.
# The split is seeded for reproducibility and stratified on the target so that the
# (imbalanced) class proportions are the same in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    tweet_pad, 
    target, 
    test_size=0.25,
    random_state=seed,
    stratify=target
)

In [None]:
embeddings_dictionary = dict()
embedding_dim = 100

# Load GloVe 100D embeddings: each line is "<word> <v1> ... <v100>".
# The encoding is given explicitly because the file contains non-ASCII tokens,
# and the file is streamed line by line instead of being loaded with readlines().
with open('../input/glove6b100dtxt/glove.6B.100d.txt', encoding='utf-8') as fp:
    for line in fp:
        records = line.split()
        if not records:
            continue  # skip blank lines
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Words without a GloVe vector (and the padding row 0) stay all-zero.
embedding_matrix = np.zeros((vocab_length, embedding_dim))

for word, index in tokenizer_obj.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

embedding_matrix

In [None]:
# Model from https://www.kaggle.com/mariapushkareva/nlp-disaster-tweets-with-glove-and-lstm/data

def glove_lstm():
    """Build and compile the GloVe-initialised bidirectional LSTM classifier.

    Reads ``embedding_matrix`` and ``length_long_sentence`` from the notebook's
    global scope: the matrix provides the initial weights of the embedding
    layer, the length is used both as the input sequence length and as the
    number of units of the LSTM and of the two hidden dense layers.

    Returns
    -------
    A compiled Sequential model with a 3-unit softmax output, trained with
    sparse categorical cross-entropy and Adam (learning rate 1e-3), reporting
    accuracy.
    """
    layers = [
        # Token ids -> vectors, initialised from the pre-computed embedding matrix
        Embedding(
            input_dim=embedding_matrix.shape[0],
            output_dim=embedding_matrix.shape[1],
            weights=[embedding_matrix],
            input_length=length_long_sentence,
        ),
        # Sequence encoder: one output per time step, in both directions
        Bidirectional(
            LSTM(length_long_sentence, return_sequences=True, recurrent_dropout=0.2)
        ),
        # Collapse the time axis, then a regularised dense head
        GlobalMaxPool1D(),
        BatchNormalization(),
        Dropout(0.5),
        Dense(length_long_sentence, activation="relu"),
        Dropout(0.5),
        Dense(length_long_sentence, activation="relu"),
        Dropout(0.5),
        # One probability per class
        Dense(3, activation="softmax"),
    ]
    model = Sequential(layers)

    model.compile(
        optimizer=Adam(learning_rate=1e-3),
        loss=SparseCategoricalCrossentropy(),
        metrics=['accuracy'],
    )
    return model

# Build the network once and print its layer-by-layer summary
model = glove_lstm()
model.summary()

In [None]:
# Build a fresh model and train it

model = glove_lstm()

# Keep on disk only the weights of the epoch with the lowest validation loss
checkpoint = ModelCheckpoint(
    'model.h5', 
    monitor = 'val_loss', 
    verbose = 1, 
    save_best_only = True
)
# Multiply the learning rate by `factor` when the validation loss has not improved
# for `patience` epochs. `min_lr` must be BELOW the optimizer's initial learning
# rate (1e-3 in glove_lstm); with min_lr = 0.001 the rate could never be lowered
# and the callback had no effect.
reduce_lr = ReduceLROnPlateau(
    monitor = 'val_loss', 
    factor = 0.2, 
    verbose = 1, 
    patience = 5,                        
    min_lr = 1e-5
)
history = model.fit(
    X_train, 
    y_train, 
    epochs = 10,
    batch_size = 32,
    validation_data = (X_test, y_test),
    verbose = 1,
    callbacks = [reduce_lr, checkpoint]
)


In [None]:
# Training vs. validation curves, one panel per metric
fig, ax = plt.subplots(1,2,figsize=(15,5))
ax[0].set_title("Loss")
ax[0].plot(history.history['loss'], label="Training loss")
ax[0].plot(history.history['val_loss'], label="validation loss")
ax[0].set_xlabel("Epoch")
ax[0].set_ylabel("Loss")
ax[0].legend(loc='best')

ax[1].set_title("Accuracy")
ax[1].plot(history.history['accuracy'], label="Training accuracy")
ax[1].plot(history.history['val_accuracy'],label="Validation accuracy")
ax[1].set_xlabel("Epoch")
ax[1].set_ylabel("Accuracy")
ax[1].legend(loc='best')

# Each axis already has its own legend; the extra pyplot-level legend call
# (which only re-created the legend of the last axis) is dropped.
plt.show()

In [None]:
# Class probabilities -> predicted class (index of the highest probability per row)
class_probabilities = model.predict(X_test)
pred = np.argmax(class_probabilities, axis = 1)

report = classification_report(y_test, pred)
print(f"\nClassification Report :\n{report}")

# Confusion matrix of true vs. predicted classes
cm = confusion_matrix(y_test, pred)
sns.heatmap(cm, annot=True, fmt='d', cmap="YlGnBu")
plt.show()