# Importing Packages

In [1]:
#Data management
import pandas as pd
import numpy as np
np.random.seed(0)
from pandas_profiling import ProfileReport

#TextBlob Features
from textblob import TextBlob

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns
#SciKit-Learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [2]:
#nltk
import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')

#Test
from collections import Counter

In [3]:
#Tensorflow / Keras
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer


In [4]:
# Display the installed TensorFlow version so results can be tied to a specific release.
tf.__version__

'2.5.0'

In [5]:
# Apply seaborn's default theme; this affects every matplotlib figure drawn afterwards.
sns.set()

# Importing The Data

In [6]:
# Training data.
# NOTE: this first read relies on pandas' default header inference. The preview
# further down shows that the first record ended up as the column labels, which
# is why the file is read again with explicit column names in the
# "Re-import the data" section.
path = "data/twitter_training.csv"
trainDf = pd.read_csv(path)


In [7]:
# Test data (not to be used until the full model has been trained).
# NOTE: read with default header inference, like the training file; the preview
# further down shows the first record being used as the column labels, so this
# frame is also re-read with explicit names in the "Re-import the data" section.
testPath = "data/twitter_validation.csv"
testDf = pd.read_csv(testPath)

# Data Exploration

In [8]:
# Preview the first rows of the training frame to check how the CSV was parsed.
trainDf.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [9]:
# Preview the first rows of the test frame to check how the CSV was parsed.
testDf.head()

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


## Re-import the data

In [10]:
# Both files share the same four-column layout and carry no header row, so the
# labels are supplied explicitly and the first record is kept as data.
tweetColumns = ["Tweet_ID", "Entity", "Sentiment", "Tweet_Content"]

trainDf = pd.read_csv(path, header=None, names=tweetColumns)
testDf = pd.read_csv(testPath, header=None, names=tweetColumns)

In [11]:
# Confirm the training frame now has the named columns and keeps its first record as data.
trainDf.head()

Unnamed: 0,Tweet_ID,Entity,Sentiment,Tweet_Content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [12]:
# Confirm the test frame now has the named columns and keeps its first record as data.
testDf.head()

Unnamed: 0,Tweet_ID,Entity,Sentiment,Tweet_Content
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [13]:
# (rows, columns) of the training frame followed by the test frame.
tuple(frame.shape for frame in (trainDf, testDf))

((74682, 4), (1000, 4))

In [14]:
# Summary statistics for the numeric columns of the training frame
# (the output below only lists Tweet_ID).
trainDf.describe()

Unnamed: 0,Tweet_ID
count,74682.0
mean,6432.586165
std,3740.42787
min,1.0
25%,3195.0
50%,6422.0
75%,9601.0
max,13200.0


In [15]:
# Summary statistics for the numeric columns of the test frame
# (the output below only lists Tweet_ID).
testDf.describe()

Unnamed: 0,Tweet_ID
count,1000.0
mean,6432.088
std,3728.310569
min,6.0
25%,3247.75
50%,6550.0
75%,9661.75
max,13197.0


## Missing Values

In [16]:
# Heatmap of missing values in the training data: every cell of the frame is
# coloured by whether it is NaN, so columns with gaps stand out.
fig, ax = plt.subplots(figsize=(20, 6))
# The title is set before plotting so the heatmap call remains the cell's last
# expression; it distinguishes this figure from the matching test-data one.
ax.set_title("Missing values: training data")
sns.heatmap(trainDf.isna(),
            cmap="magma",
            cbar_kws={'label': 'Missing Data'},
            ax=ax)

<AxesSubplot:>

In [17]:
# Count of missing values per column in the training frame.
trainDf.isna().sum()

Tweet_ID           0
Entity             0
Sentiment          0
Tweet_Content    686
dtype: int64

In [18]:
# Heatmap of missing values in the test data: every cell of the frame is
# coloured by whether it is NaN, so columns with gaps stand out.
fig, ax = plt.subplots(figsize=(20, 6))
# The title is set before plotting so the heatmap call remains the cell's last
# expression; it distinguishes this figure from the matching training-data one.
ax.set_title("Missing values: test data")
sns.heatmap(testDf.isna(),
            cmap="magma",
            cbar_kws={'label': 'Missing Data'},
            ax=ax)

<AxesSubplot:>

In [19]:
# Count of missing values per column in the test frame.
testDf.isna().sum()

Tweet_ID         0
Entity           0
Sentiment        0
Tweet_Content    0
dtype: int64

In [20]:
# Discard every training row that has a missing value in any column
# (the per-column counts above show gaps only in Tweet_Content).
trainDf = trainDf.dropna(axis=0, how="any")

## Profiling

In [21]:
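# Optional step: uncomment the two lines below to build and display a pandas-profiling report of the training data.
# profile = ProfileReport(trainDf, title="Pandas Profiling Report", explorative=True)
# profile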