In [1]:
!pip install -q -U tensorflow-text
!pip install -q tf-models-official
!pip install tensorflow_hub



In [2]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import wordcloud
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from keras import backend as K
from transformers import AutoTokenizer,TFBertModel
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import CategoricalAccuracy, BinaryAccuracy
from tensorflow.keras.losses import CategoricalCrossentropy,BinaryCrossentropy


sns.set_style("whitegrid")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sanke\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sanke\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
train = train.drop(['keyword','location'], axis = 1)
test = test.drop(['keyword','location'], axis = 1)
train.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Checking Shape of Train and Test sets:
print("Shape of Train set:", train.shape)
print("Shape of Test set:", test.shape)

Shape of Train set: (7613, 3)
Shape of Test set: (3263, 2)


# Labels are as follows:
label '1' ---> racist/sexist tweet           
label '0' ---> not racist/sexist tweet

In [6]:
df = train.copy()
df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

# 1. Model without removing any feature:

### Splitting data into Train and Test sets

In [7]:
y = tf.keras.utils.to_categorical(df['target'], num_classes=2)
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.2, random_state=3)

# BERT 
### Base Model with Neural Networks:

In [8]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_cased_L-24_H-1024_A-16/4")

In [9]:
# Checking array created using BERT:
def get_sentence_embedding(sentences):
  preprocessed_text = bert_preprocess(sentences)
  return bert_encoder(preprocessed_text)['pooled_output']

get_sentence_embedding(["You are noob.","What are you looking at?"])

<tf.Tensor: shape=(2, 1024), dtype=float32, numpy=
array([[ 0.99035466,  0.9815679 ,  0.9975458 , ..., -0.9994887 ,
        -0.5053125 ,  0.9433913 ],
       [ 0.9989152 ,  0.11918571,  0.8990445 , ..., -0.63001037,
        -0.9233927 ,  0.9344142 ]], dtype=float32)>

In [10]:
# Bert layers:
num_classes = 2
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers:
l = tf.keras.layers.Dropout(0.2, name='dropout')(outputs['pooled_output'])
l = tf.keras.layers.Dense(num_classes, activation='sigmoid', name='output')(l)

# Construct final model:
model = tf.keras.Model(inputs=[text_input], outputs=[l])

model.summary()

METRICS = [
           tf.keras.metrics.BinaryCrossentropy(name='accuracy'),
           tf.keras.metrics.Precision(name='precision'),
           tf.keras.metrics.Recall(name='recall')
]
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=METRICS)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [11]:
#Ploting Model Architecture:
tf.keras.utils.plot_model(model)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


### Training model:

In [12]:
history = model.fit(X_train, y_train, epochs=4)

# Evaluating results with test set:
model.evaluate(X_test, y_test, verbose=1)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


[0.5631595849990845, 0.610754668712616, 0.7018204927444458, 0.7340774536132812]

### Results:

In [13]:
y_test_arg = np.argmax(y_test, axis=1)
y_test_arg[1]
y_pred = np.argmax(model.predict(X_test),axis=1)
print('Confusion Matrix')
print(confusion_matrix(y_test_arg, y_pred))
print(metrics.classification_report(y_test_arg, y_pred))

Confusion Matrix
[[602 239]
 [187 495]]
              precision    recall  f1-score   support

           0       0.76      0.72      0.74       841
           1       0.67      0.73      0.70       682

    accuracy                           0.72      1523
   macro avg       0.72      0.72      0.72      1523
weighted avg       0.72      0.72      0.72      1523



# 2. Model after removing stopwords:

In [14]:
df = train.copy()

### Removing Stopwords:

In [15]:
sw = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw)]))

### Splitting data into Train and Test sets

In [16]:
y = tf.keras.utils.to_categorical(df['target'], num_classes=2)
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.2, random_state=3)

### Training model:

In [17]:
history = model.fit(X_train, y_train, epochs=3)

# Evaluating results with test set:
model.evaluate(X_test, y_test, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.5318173170089722,
 0.5857071876525879,
 0.7382592558860779,
 0.7741299867630005]

### Results:

In [18]:
y_test_arg = np.argmax(y_test, axis=1)
y_test_arg[1]
y_pred = np.argmax(model.predict(X_test),axis=1)
print('Confusion Matrix')
print(confusion_matrix(y_test_arg, y_pred))
print(metrics.classification_report(y_test_arg, y_pred))

Confusion Matrix
[[697 144]
 [230 452]]
              precision    recall  f1-score   support

           0       0.75      0.83      0.79       841
           1       0.76      0.66      0.71       682

    accuracy                           0.75      1523
   macro avg       0.76      0.75      0.75      1523
weighted avg       0.75      0.75      0.75      1523



# 3. Model after removing repeating characters:

In [19]:
df = train.copy()

### Removing repeating characteres:

In [20]:
tokens = (word_tokenize(i) for i in df.text)
df['text'] = df['text'].apply(nltk.word_tokenize)

pattern = re.compile(r'(.)\1*')

def reduce_sequence_word(word):
    return ''.join([match.group()[:2] if len(match.group()) > 2 else match.group() for match in pattern.finditer(word)])

def reduce_sequence_tweet(tweet):
    return [reduce_sequence_word(word) for word in tweet]

df.text = df.text.apply(lambda tweet: reduce_sequence_tweet(tweet))

# Detokenizing tweets:

def listToString(s): 
    
    # initialize an empty string
    str1 = " " 
    
    # return string  
    return (str1.join(s))

df['text'] = df['text'].apply(lambda x: listToString(x))

### Splitting data into Train and Test sets

In [21]:
y = tf.keras.utils.to_categorical(df['target'], num_classes=2)
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.2, random_state=3)

### Training model:

In [22]:
history = model.fit(X_train, y_train, epochs=3)

# Evaluating results with test set:
model.evaluate(X_test, y_test, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.5394353866577148,
 0.5913888812065125,
 0.7182741165161133,
 0.7432698607444763]

### Results:

In [23]:
y_test_arg = np.argmax(y_test, axis=1)
y_test_arg[1]
y_pred = np.argmax(model.predict(X_test),axis=1)
print('Confusion Matrix')
print(confusion_matrix(y_test_arg, y_pred))
print(metrics.classification_report(y_test_arg, y_pred))

Confusion Matrix
[[613 228]
 [179 503]]
              precision    recall  f1-score   support

           0       0.77      0.73      0.75       841
           1       0.69      0.74      0.71       682

    accuracy                           0.73      1523
   macro avg       0.73      0.73      0.73      1523
weighted avg       0.74      0.73      0.73      1523



# 4. Model after removing Punctuations:

In [24]:
df = train.copy()

### Removing Punctuations:

In [25]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [26]:
punctuations_list = string.punctuation
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

df['text'] = df['text'].apply(lambda x: cleaning_punctuations(x))

### Splitting data into Train and Test sets

In [27]:
y = tf.keras.utils.to_categorical(df['target'], num_classes=2)
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.2, random_state=3)

### Training model:

In [28]:
history = model.fit(X_train, y_train, epochs=3)

# Evaluating results with test set:
model.evaluate(X_test, y_test, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.5291005969047546,
 0.5861385464668274,
 0.7641445398330688,
 0.7360472679138184]

### Results:

In [29]:
y_test_arg = np.argmax(y_test, axis=1)
y_test_arg[1]
y_pred = np.argmax(model.predict(X_test),axis=1)
print('Confusion Matrix')
print(confusion_matrix(y_test_arg, y_pred))
print(metrics.classification_report(y_test_arg, y_pred))

Confusion Matrix
[[748  93]
 [260 422]]
              precision    recall  f1-score   support

           0       0.74      0.89      0.81       841
           1       0.82      0.62      0.71       682

    accuracy                           0.77      1523
   macro avg       0.78      0.75      0.76      1523
weighted avg       0.78      0.77      0.76      1523



# 5. Model after removing numbers:

In [30]:
df = train.copy()

### Removing numbers:

In [31]:
def cleaning_numbers(text):
    return re.sub('[0-9]+', '', text)

df['text'] = df['text'].apply(lambda text: cleaning_numbers(text))

### Splitting data into Train and Test sets

In [32]:
y = tf.keras.utils.to_categorical(df['target'], num_classes=2)
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.2, random_state=3)

### Training model:

In [33]:
history = model.fit(X_train, y_train, epochs=3)

# Evaluating results with test set:
model.evaluate(X_test, y_test, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.5323783159255981, 0.5822197794914246, 0.75, 0.770190417766571]

### Results:

In [34]:
y_test_arg = np.argmax(y_test, axis=1)
y_test_arg[1]
y_pred = np.argmax(model.predict(X_test),axis=1)
print('Confusion Matrix')
print(confusion_matrix(y_test_arg, y_pred))
print(metrics.classification_report(y_test_arg, y_pred))

Confusion Matrix
[[749  92]
 [272 410]]
              precision    recall  f1-score   support

           0       0.73      0.89      0.80       841
           1       0.82      0.60      0.69       682

    accuracy                           0.76      1523
   macro avg       0.78      0.75      0.75      1523
weighted avg       0.77      0.76      0.75      1523



# Setup 5: Applying Stemming and Lemmatization:

In [35]:
df = train.copy()

### Applying Stemming: 

In [36]:
# Tokenizing tweets:
tokens = (word_tokenize(i) for i in df.text)
df['text'] = df['text'].apply(nltk.word_tokenize)

stemm = SnowballStemmer('english')
df['text'] = df['text'].apply(lambda x: [stemm.stem(y) for y in x])

### Splitting data into Train and Test :

In [37]:
y = tf.keras.utils.to_categorical(df['target'].astype(str), num_classes=2)
X_train, X_test, y_train, y_test = train_test_split(df['text'].astype(str), y, test_size=0.2, random_state=3)

### Training model:

In [38]:
history = model.fit(X_train, y_train, epochs=4)

# Evaluating results with test set:
model.evaluate(X_test, y_test, verbose=1)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


[0.6003526449203491, 0.6659488081932068, 0.732594907283783, 0.3040052652359009]

# Results:

In [39]:
y_test_arg = np.argmax(y_test, axis=1)
y_test_arg[1]
y_pred = np.argmax(model.predict(X_test),axis=1)
print('Confusion Matrix')
print(confusion_matrix(y_test_arg, y_pred))
print(metrics.classification_report(y_test_arg, y_pred))

Confusion Matrix
[[683 158]
 [319 363]]
              precision    recall  f1-score   support

           0       0.68      0.81      0.74       841
           1       0.70      0.53      0.60       682

    accuracy                           0.69      1523
   macro avg       0.69      0.67      0.67      1523
weighted avg       0.69      0.69      0.68      1523



# 7. Models after removing all the features:

In [40]:
df = train.copy()

### Removing Punctuations:

In [41]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [42]:
punctuations_list = string.punctuation
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

df['text'] = df['text'].apply(lambda x: cleaning_punctuations(x))

### Removing Stopwords:

In [43]:
sw = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw)]))

### Removing Numbers:

In [44]:
def cleaning_numbers(text):
    return re.sub('[0-9]+', '', text)

df['text'] = df['text'].apply(lambda text: cleaning_numbers(text))

### Removing repeating characters:

In [45]:
tokens = (word_tokenize(i) for i in df.text)
df['text'] = df['text'].apply(nltk.word_tokenize)

pattern = re.compile(r'(.)\1*')

def reduce_sequence_word(word):
    return ''.join([match.group()[:2] if len(match.group()) > 2 else match.group() for match in pattern.finditer(word)])

def reduce_sequence_tweet(tweet):
    return [reduce_sequence_word(word) for word in tweet]

df.text = df.text.apply(lambda tweet: reduce_sequence_tweet(tweet))

# Detokenizing tweets:

def listToString(s): 
    
    # initialize an empty string
    str1 = " " 
    
    # return string  
    return (str1.join(s))

df['text'] = df['text'].apply(lambda x: listToString(x))

### Applying Stemming: 

In [46]:
# Tokenizing tweets:
tokens = (word_tokenize(i) for i in df.text)
df['text'] = df['text'].apply(nltk.word_tokenize)

stemm = SnowballStemmer('english')
df['text'] = df['text'].apply(lambda x: [stemm.stem(y) for y in x])

### Splitting data into Train and Test sets

In [47]:
y = tf.keras.utils.to_categorical(df['target'].astype(str), num_classes=2)
X_train, X_test, y_train, y_test = train_test_split(df['text'].astype(str), y, test_size=0.2, random_state=3)

### Training model:

In [48]:
history = model.fit(X_train, y_train, epochs=4)

# Evaluating results with test set:
model.evaluate(X_test, y_test, verbose=1)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


[0.6317766904830933,
 0.6799976229667664,
 0.7013177275657654,
 0.3145108222961426]

### Results:

In [49]:
y_test_arg = np.argmax(y_test, axis=1)
y_test_arg[1]
y_pred = np.argmax(model.predict(X_test),axis=1)
print('Confusion Matrix')
print(confusion_matrix(y_test_arg, y_pred))
print(metrics.classification_report(y_test_arg, y_pred))

Confusion Matrix
[[393 448]
 [109 573]]
              precision    recall  f1-score   support

           0       0.78      0.47      0.59       841
           1       0.56      0.84      0.67       682

    accuracy                           0.63      1523
   macro avg       0.67      0.65      0.63      1523
weighted avg       0.68      0.63      0.62      1523

