In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the dataset paths used in the cells below can be checked.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## What will I do?

I am going to visualize tweets by location, by date and, above all, by sentiment.
This period is very hard, and I think it is interesting and important to visualize
people's feelings and moods over time. Finally, I'll try to build a model
that can classify these tweets.

## Import libraries 📚

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
plt.style.use('ggplot')

import cufflinks as cf
import plotly.express as px
import plotly.offline as py
from plotly.offline import plot
import plotly.graph_objects as go
import plotly.graph_objs as go

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.utils.multiclass import type_of_target

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

## Reading Data 📝

In [None]:
#Df
# Load the train and test splits of the COVID-19 tweet dataset.
# NOTE(review): only the train file is read with encoding='latin1'; confirm that the
# test file decodes correctly with the default encoding.
train_df = pd.read_csv("../input/covid-19-nlp-text-classification/Corona_NLP_train.csv", encoding='latin1')
test_df = pd.read_csv("../input/covid-19-nlp-text-classification/Corona_NLP_test.csv")
train_df.head()

In [None]:
# Summary of the training frame (columns, dtypes, non-null counts).
train_df.info()

In [None]:
# Report how many tweets each split contains.
n_train_tweets = len(train_df)
n_test_tweets = len(test_df)
print("We have :", n_train_tweets, "Tweets in the Train set")
print("We have :", n_test_tweets, "Tweets in the Test set")

In [None]:
# The two user-identifier columns are not used by the analysis; drop them.
train_df = train_df.drop(columns=["ScreenName", "UserName"])
train_df.head()

In [None]:
# Same clean-up for the test split: drop the two user-identifier columns.
test_df = test_df.drop(columns=["ScreenName", "UserName"])
test_df.head()

## Data Exploration 📊 | Feature Engineering 🏷️

In [None]:
# Reference table of world cities, used below to map tweet locations to countries.
# The geonameid column is not needed and is dropped right after loading.
wc = (
    pd.read_csv("../input/world-cities/world-cities_csv.csv")
    .drop(columns=["geonameid"])
)
wc.head()

In [None]:
# Per-country slices of the world-cities table `wc` loaded in the previous cell.
# The CSV is deliberately NOT re-read here: doing so only repeated the I/O and
# silently brought back the `geonameid` column that the previous cell dropped.
# `encoding_location` reads these frames as globals (their "name" and
# "subcountry" columns), so the variable names must stay as they are.
wc_uae = wc[wc["country"] == "United Arab Emirates"]
wc_usa = wc[wc["country"] == "United States"]
wc_uk = wc[wc["country"] == "United Kingdom"]
wc_can = wc[wc["country"] == "Canada"]
# "AFR" groups the two countries below under a single code.
wc_afr = wc[wc["country"].isin(["South Africa", "Central African Republic"])]
wc_pak = wc[wc["country"] == "Pakistan"]
wc_ind = wc[wc["country"] == "India"]
wc_fra = wc[wc["country"] == "France"]
wc_ger = wc[wc["country"] == "Germany"]
wc_aus = wc[wc["country"] == "Australia"]
wc_chi = wc[wc["country"] == "China"]
wc_nig = wc[wc["country"] == "Nigeria"]
wc_spa = wc[wc["country"] == "Spain"]
wc_arg = wc[wc["country"] == "Argentina"]

In [None]:
def encoding_location(item):
    """Map a free-text tweet location to a coarse country/region code.

    Matching is by case-sensitive substring search on ``str(item)``:

    1. For each tracked country (in the fixed priority order below), the city
       names and then the subcountry names of that country are searched for in
       the text; the first hit returns the country's code (e.g. ``"USA"``).
    2. Text containing ``"?"`` is returned as ``"Other"``.
    3. Text containing a "whole world" keyword is returned as ``"GLO"``.
    4. Text naming France, Australia or Canada directly gets that country's code.
    5. Text that names no country of the world-cities table at all is ``"Other"``.
    6. Anything else (it names some other country) is returned unchanged.

    The previous implementation returned ``"Other"`` as soon as it met the first
    country name that was *absent* from the text, which made steps 2-4 and 6
    effectively unreachable. The check is now "no country name appears in the
    text" and runs after the explicit keyword rules.

    Relies on the module-level frames ``wc`` and ``wc_<country>`` (columns
    ``name``, ``subcountry``, ``country``).

    Parameters
    ----------
    item : object
        Raw location value; it is converted with ``str`` before matching, so a
        missing value is matched as the text ``"nan"``.

    Returns
    -------
    object
        A short code string, ``"Other"``, or ``item`` itself.
    """
    text = str(item)

    # (cities of one country, code) pairs; the order is the matching priority.
    country_tables = (
        (wc_uae, "UAE"),
        (wc_usa, "USA"),
        (wc_uk, "UK"),
        (wc_can, "CAN"),
        (wc_afr, "AFR"),
        (wc_ind, "IND"),
        (wc_pak, "PAK"),
        (wc_fra, "FRA"),
        (wc_ger, "GER"),
        (wc_aus, "AUS"),
        (wc_chi, "CHI"),
        (wc_nig, "NIG"),
        (wc_spa, "SPA"),
        (wc_arg, "ARG"),
    )
    for cities, code in country_tables:
        for column in ("name", "subcountry"):
            # dropna(): a missing place name would otherwise be stringified to
            # "nan" and match any location that merely contains those letters.
            if any(str(place) in text for place in cities[column].dropna()):
                return code

    if "?" in text:
        return "Other"

    global_keywords = ("World", "Global", "Everywhere", "Earth", "Planet ")
    if any(keyword in text for keyword in global_keywords):
        return "GLO"

    for country_name, code in (("France", "FRA"), ("Australia", "AUS"), ("Canada", "CAN")):
        if country_name in text:
            return code

    # No known country is mentioned anywhere in the text: not a usable location.
    known_countries = wc["country"].dropna().unique()
    if not any(str(country) in text for country in known_countries):
        return "Other"

    return item
    
# Replace the raw location text with the value returned by encoding_location, in both splits.
# This overwrites the original "Location" column, so re-running the cell re-encodes
# the already-encoded values.
train_df["Location"] = train_df["Location"].apply(encoding_location)
test_df["Location"] = test_df["Location"].apply(encoding_location)

# The 50 most frequent encoded locations in the training set.
train_df["Location"].value_counts()[:50]

In [None]:
# Graph: number of tweets for the ten most frequent locations
location_counts = train_df["Location"].value_counts()[:10]

fig = px.bar(
    location_counts,
    orientation="v",
    color=location_counts,
    color_continuous_scale=px.colors.sequential.Plasma,
    log_x=False,
    labels={'value':'Count', 'index':'Country', 'color':'None'},
)
fig.update_layout(
    title_text="Country by tweets",
    font_color="black",
    title_font_color="red",
    legend_title_font_color="green",
)
fig.show()

In [None]:
# Graph: number of tweets per sentiment label
sentiment_counts = train_df["Sentiment"].value_counts()

fig = px.bar(
    sentiment_counts,
    orientation="v",
    color=sentiment_counts,
    color_continuous_scale=px.colors.sequential.Plasma,
    log_x=False,
    labels={'value':'Count', 'index':'Sentiment', 'color':'None'},
)
fig.update_layout(
    title_text="Sentiment by count",
    font_color="black",
    title_font_color="red",
    legend_title_font_color="green",
)
fig.show()

In [None]:
def encoding_sentiment(item):
    """Collapse the five sentiment labels into three integer classes.

    Returns 0 for the two negative labels, 1 for "Neutral", 2 for the two
    positive labels, and None for any other value.
    """
    if item in ("Extremely Negative", "Negative"):
        return 0
    if item == "Neutral":
        return 1
    if item in ("Positive", "Extremely Positive"):
        return 2
    return None
    
# Replace the sentiment labels with their integer class in both splits.
# NOTE: not idempotent — on a second run the already-encoded integers no longer match
# any label, so encoding_sentiment returns None for every row.
train_df["Sentiment"] = train_df["Sentiment"].apply(encoding_sentiment)
test_df["Sentiment"] = test_df["Sentiment"].apply(encoding_sentiment)

# Class balance of the training set after encoding.
train_df["Sentiment"].value_counts()

In [None]:
import re

def hash_finder(text):
    """Return the hashtags found in ``text`` (without the '#'), joined by single spaces.

    An empty string is returned when the text contains no hashtag.
    """
    hashtags = (match.group(0) for match in re.finditer(r'(?<=#)\w+', text))
    return " ".join(hashtags)

# One space-separated string of hashtags per tweet.
train_df['hash'] = train_df['OriginalTweet'].apply(hash_finder)

In [None]:
# Graph: most frequent hashtag strings. The first entry is skipped — presumably the
# empty string produced by tweets without any hashtag.
hashtag_counts = train_df['hash'].value_counts()[1:20]

fig = px.bar(
    hashtag_counts,
    orientation="v",
    color=hashtag_counts,
    color_continuous_scale=px.colors.sequential.Plasma,
    log_y=True,
    labels={'value':'Count', 'index':'Hashtags', 'color':'None'},
)
fig.update_layout(
    title_text="Hashtags by count",
    font_color="black",
    title_font_color="red",
    legend_title_font_color="green",
)
fig.show()

In [None]:
def mentions_finder(text):
    """Return the names that follow an '@' in ``text``, joined by single spaces.

    An empty string is returned when the text contains no '@' followed by a word.
    """
    mentions = (match.group(0) for match in re.finditer(r'(?<=@)\w+', text))
    return " ".join(mentions)
# One space-separated string of mentioned accounts per tweet.
train_df['mention'] = train_df['OriginalTweet'].apply(mentions_finder)

In [None]:
# Graph: most frequent mention strings. The first entry is skipped — presumably the
# empty string produced by tweets without any @mention.
mention_counts = train_df['mention'].value_counts()[1:20]

fig = px.bar(
    mention_counts,
    orientation="v",
    color=mention_counts,
    color_continuous_scale=px.colors.sequential.Plasma,
    log_y=True,
    labels={'value':'Count', 'index':'Mentions', 'color':'None'},
)
fig.update_layout(
    title_text="Mentions by count",
    font_color="black",
    title_font_color="red",
    legend_title_font_color="green",
)
fig.show()

## Cleaning tweets 🧹

In [None]:
def clean_text(df, text_field):
    """Lower-case the text column ``text_field`` of ``df`` and strip noise from it.

    After lower-casing, every match of the noise pattern is deleted: an '@'
    followed by letters/digits, URLs of the form ``scheme://...``, a leading
    "rt", "http" plus the character after it, and any character that is not an
    ASCII letter, a digit, a space or a tab.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the tweets. It is modified in place.
    text_field : str
        Name of the text column to clean. (It used to be ignored in favour of a
        hard-coded "OriginalTweet".)

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be used in an assignment.
    """
    noise_pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
    # The vectorised .str methods leave missing values untouched, whereas
    # re.sub inside apply() raises a TypeError on them.
    df[text_field] = (
        df[text_field]
        .str.lower()
        .str.replace(noise_pattern, "", regex=True)
    )
    return df

# Clean each split and assign it back to ITS OWN name: the second call used to be
# assigned to train_df, which replaced the training set with the test set.
train_df = clean_text(train_df, "OriginalTweet")
test_df = clean_text(test_df, "OriginalTweet")

In [None]:
# Preview the first tweets after cleaning.
train_df["OriginalTweet"].head()

## Most used words 👀

In [None]:
from collections import Counter

# All tweets concatenated into one long string (also the input of the word cloud).
most_used_word_for_cloud = " ".join(train_df["OriginalTweet"])

# The 100 most frequent whitespace-separated tokens, as (word, frequency) pairs.
word_counter = Counter(most_used_word_for_cloud.split())
most_used_word = word_counter.most_common(100)
most_used_word_df = pd.DataFrame(most_used_word, columns=["Words", "Frequency"])

most_used_word_df.head(3)

In [None]:
# Graph: frequency of the 30 most used words
top_words = most_used_word_df["Words"][:30]
top_frequencies = most_used_word_df["Frequency"][:30]

fig = px.bar(
    x=top_words,
    y=top_frequencies,
    orientation="v",
    color=top_words,
    color_continuous_scale=px.colors.sequential.Plasma,
    log_y=False,
    labels={
        'value':'Count',
        'index':'Words',
        'color':'Word',
        'x':'Words',
        'y':'Frequency',
    },
)
fig.update_layout(
    title_text="Words by count",
    font_color="black",
    title_font_color="red",
    legend_title_font_color="green",
)
fig.show()

In [None]:
# Build the word cloud from the concatenated tweet text.
cloud_settings = dict(background_color='black', colormap="Blues", width=600, height=400)
wordcloud = WordCloud(**cloud_settings).generate(most_used_word_for_cloud)

# Show the resulting image on a large figure, without axes.
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

## Correlation 🔄

In [None]:
#Correlation
# Correlate the numeric columns only: the frame still holds text columns (e.g. the
# tweets), and DataFrame.corr() raises on non-numeric data with pandas >= 2.0.
# Older versions dropped those columns silently, so the result there is unchanged.
numeric_test_df = test_df.select_dtypes(include="number")

plt.figure(figsize=(12, 8))
sns.heatmap(numeric_test_df.corr(), annot=True)
plt.show()

### Modelling 🟩

## Not finished! Work in progress.