## Importing Libs, Loading Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
from google.colab import files
#files.upload()

In [None]:
# Load the labeled tweets; the CSV is expected in the working directory
# (in Colab it can be uploaded first via files.upload()).
df = pd.read_csv("labeled_data.csv")

## Preprocessing

In [None]:
# Peek at the first row to check which columns were loaded.
df.head(1)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...


In [None]:
# Drop the columns that are not used later in the notebook (only 'class' and
# 'tweet' are needed for modelling).
# Reassigning with errors='ignore' instead of inplace=True keeps this cell
# re-runnable: a second execution no longer raises KeyError for columns that
# were already removed.
df = df.drop(
    columns=['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'],
    errors='ignore',
)

In [None]:
# Character count of each tweet, taken before the text is cleaned further down.
# The vectorised .str.len() replaces the row-by-row Python loop; unlike the
# label lookup df['tweet'][i] it does not depend on the index being 0..n-1.
df['text length'] = df['tweet'].str.len()

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import string
#nltk.download('stopwords')
#nltk.download('punkt')


In [None]:
# Lazily-built resources shared by every call to normalize_opinion.
_NORMALIZE_RESOURCES = {}


def _get_normalize_resources():
    """Build the stop-word set, stemmer and punctuation set once and reuse them."""
    if not _NORMALIZE_RESOURCES:
        # English stop words list from NLTK (loaded once instead of per tweet)
        _NORMALIZE_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NORMALIZE_RESOURCES['stemmer'] = PorterStemmer()
        _NORMALIZE_RESOURCES['punctuation'] = frozenset(string.punctuation)
    return _NORMALIZE_RESOURCES


def normalize_opinion(text):
    """Normalize one piece of text into a space-separated string of stemmed tokens.

    Steps: tokenize with ``nltk.word_tokenize``, drop tokens made up only of
    punctuation characters, drop English stop words (compared in lowercase),
    then lowercase and Porter-stem what remains.

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if nothing survives).
    """
    resources = _get_normalize_resources()
    stop_words = resources['stop_words']
    stemmer = resources['stemmer']
    punctuation = resources['punctuation']

    filtered_text = []
    for token in nltk.word_tokenize(text):
        # Per-character test: `token not in string.punctuation` would be a
        # substring check and lets multi-character tokens such as '...' or
        # "''" through.
        if all(ch in punctuation for ch in token):
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        filtered_text.append(stemmer.stem(lowered))

    # Return the list of tokens converted into a string
    return ' '.join(filtered_text)

In [None]:
# Keep the original tweet in case it is needed later.
# Guarded so that re-running this cell after 'tweet' has been cleaned does not
# overwrite the preserved raw text with the cleaned version.
if 'tweet2' not in df.columns:
    df['tweet2'] = df['tweet']


In [None]:
# Clean using NLTK: tokenizing, lowercasing, stop-word and punctuation removal, stemming.
# The input is the untouched copy in 'tweet2', so re-running this cell does not
# normalize already-stemmed text a second time.
df['tweet'] = df['tweet2'].apply(normalize_opinion)

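## Note: the text may be tokenized twice — once by NLTK inside `normalize_opinion` and again by the Keras `Tokenizer` further down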

In [None]:
# Re-inspect the first row after cleaning: 'tweet' now holds the normalized
# text and 'tweet2' the original.
df.head(1)

Unnamed: 0,class,tweet,text length,tweet2
0,2,rt mayasolov woman n't complain clean hous amp...,140,!!! RT @mayasolovely: As a woman you shouldn't...


##  Getting Ready to Model

In [None]:
# 70/30 train/test split with a fixed seed for reproducibility.
# The labels are imbalanced, so stratify on 'class' to keep the same class
# proportions in both splits; without it the minority class share can drift
# between train and test.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'],
    df['class'],
    test_size=0.3,
    random_state=42,
    stratify=df['class'],
)

In [None]:
# Preview a few training tweets (still plain strings at this point).
X_train.head()

8426     charli jacob got confus ice bucket challeng in...
3698     juanndacut sharea40ounc playin pussi hole bing...
10054    tell mcgirt music ai n't enough.y got ta non m...
3451     ihatestevens ai n't show bout dem color diamon...
18673    rt blessedarti y'all claim want good amp loyal...
Name: tweet, dtype: object

In [None]:
from keras.preprocessing.text import Tokenizer

# Length every sequence is padded to, and the vocabulary cap for the tokenizer.
max_len = 200
num_words = 100000

# Character length of each training tweet.
# NOTE(review): len_vec is not read anywhere else in the visible notebook.
len_vec = list(map(len, X_train))

# Fit the tokenizer on the training data
t = Tokenizer(
    num_words=num_words,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
)
t.fit_on_texts(X_train)

In [None]:
## CONVERT TRAIN INTO SEQUENCES FOR NN
# Map each tweet to its list of word indices, then pad at the end up to max_len.
X_train = pad_sequences(
    t.texts_to_sequences(X_train),
    maxlen=max_len,
    padding='post',
)
print(X_train)

[[  98 3098   18 ...    0    0    0]
 [3099 4475  898 ...    0    0    0]
 [  75 8849  420 ...    0    0    0]
 ...
 [ 443  644  429 ...    0    0    0]
 [   2 8703  227 ...    0    0    0]
 [   1  105   21 ...    0    0    0]]


## Building the Neural Net & Training

In [None]:
# Hyper-parameters
embedding_size = 16  # width of each learned word vector
n_classes = 3        # number of target classes (width of the softmax output)
epochs = 20          # training epochs used by model.fit

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(num_words, embedding_size, input_shape=(max_len,)))
# NOTE(review): these Dense layers sit before Flatten, so they are applied to
# each of the max_len positions independently (see the (None, 200, 64) shape in
# the summary) — confirm this is intended.
# NOTE(review): the Embedding does not set mask_zero, so the zero padding is
# embedded like any other token.
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dropout(0.6))
# Output width follows n_classes instead of a second hard-coded 3.
model.add(tf.keras.layers.Dense(n_classes, activation='softmax'))
# Integer class labels -> sparse categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 200, 16)           1600000   
_________________________________________________________________
dense_12 (Dense)             (None, 200, 64)           1088      
_________________________________________________________________
dense_13 (Dense)             (None, 200, 16)           1040      
_________________________________________________________________
flatten_4 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 3200)              0         
_________________________________________________________________
dense_14 (Dense)             (None, 3)                 9603      
Total params: 1,611,731
Trainable params: 1,611,731
Non-trainable params: 0
____________________________________________

In [None]:
# Train on the padded sequences; labels are passed as a plain NumPy array.
model.fit(
    x=X_train,
    y=np.asarray(y_train),
    epochs=epochs,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f00dbe96550>

In [None]:
# Encode the test tweets with the tokenizer fitted on the training data and
# pad them at the end to the same length as the training sequences.
X_test = pad_sequences(
    t.texts_to_sequences(X_test),
    maxlen=max_len,
    padding='post',
)

## Evaluation

In [None]:
# Loss and accuracy on the held-out test split.
model.evaluate(
    x=X_test,
    y=np.asarray(y_test),
)



[1.0907082557678223, 0.8407531976699829]

In [None]:
##CLASSIFICATION REPORT##

from sklearn.metrics import classification_report, f1_score

# Per-class scores from the softmax output layer, one row per test tweet.
y_pred = model.predict(X_test, batch_size=64, verbose=1)
# Despite its name, y_pred_bool holds the index of the highest-scoring class.
y_pred_bool = y_pred.argmax(axis=1)

# Per-class precision / recall / F1 against the true test labels.
print(classification_report(y_test, y_pred_bool))

              precision    recall  f1-score   support

           0       0.38      0.16      0.22       427
           1       0.86      0.96      0.91      5747
           2       0.79      0.55      0.65      1261

    accuracy                           0.84      7435
   macro avg       0.68      0.55      0.59      7435
weighted avg       0.82      0.84      0.82      7435

