In [1]:
%pip install contractions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import contractions
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.utils as ku

Collecting contractions
  Downloading contractions-0.0.58-py2.py3-none-any.whl (8.0 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.2.tar.gz (321 kB)
     |████████████████████████████████| 321 kB 902 kB/s            
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
[?25hCollecting anyascii
  Downloading anyascii-0.3.0-py3-none-any.whl (284 kB)
     |████████████████████████████████| 284 kB 13.5 MB/s            
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l- \ | / done
[?25h  Created wheel for pyahocorasick: filename=pyahocorasick-1.4.2-cp37-cp37m-linux_x86_64.whl size=106887 sha256=de3b434ea32db23cf2f92e12007d9858124e0a67b1ad86cf581a549044f92055
  Stored in directory: /root/.cache/pip/wheels/25/19/a6/8f363d9939162782bb8439d886469756271abc01f76fbd790f
Successfully built pyahoc

In [2]:
train = pd.read_csv("../input/twitter-sentiment-analysis-hatred-speech/train.csv")
display(train.info())
print(train["tweet"][22])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


None

product of the day: happy man #wine tool  who's   it's the #weekend? time to open up &amp; drink up!


In [3]:
train.drop("id", axis=1, inplace = True)
train.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


# Preprocessing the data

In [4]:
def expand_contractions(df_series):
    
    for i in range(len(df_series)):
        
        df_series[i]= contractions.fix(df_series[i])
        
    return df_series

def get_pos(token):
    
    pos_tag = nltk.pos_tag(token)[0][1][0].upper()
    pos_tag_dict = {"J": wordnet.ADJ,
                    "V": wordnet.VERB,
                    "N": wordnet.NOUN,
                    "R": wordnet.ADV}
    return pos_tag_dict.get(pos_tag, wordnet.NOUN)

def lemmatize_series(df_series, remove_stopwords=False):
    
    if remove_stopwords:
        stop_words = set(stopwords.words("english"))
        lm = WordNetLemmatizer()
        for i in range(len(df_series)):
            df_series[i] = ' '.join([lm.lemmatize(word, get_pos(word)) 
                                     for word in df_series[i].split() 
                                      if not word.lower() in stop_words])
    
    else:
        lm = WordNetLemmatizer()
        for i in range(len(df_series)):
            df_series[i] = ' '.join([lm.lemmatize(word, get_pos(word)) 
                                     for word in df_series[i].split()])
    
    return df_series



In [5]:
def preprocess_tweets(df_series, remove_stopwords=True):
    
    # Expand contractions (Eg: can't --> cannot)
    df_series = expand_contractions(df_series)
    
    # Removes '@user' tags
    df_series = df_series.str.replace("@user", "", regex=False)
    
    # Removes '&amp' tags
    df_series = df_series.str.replace("&amp", " ")
    
    # Removes non alphanumeric characters
    df_series = df_series.str.replace("[^a-zA-Z0-9 ]", " ")
    
    # Lemmatize tweets
    df_series = lemmatize_series(df_series, remove_stopwords=remove_stopwords)
    
    return df_series

In [6]:
train["tweet"] = preprocess_tweets(train["tweet"])
display(train.head())
print(train["tweet"][22])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
  del sys.path[0]


Unnamed: 0,label,tweet
0,0,father dysfunctional selfish drag kid dysfunct...
1,0,thanks lyft credit cannot use offer wheelchair...
2,0,bihday majesty
3,0,model love take time ur
4,0,factsguide society motivation


product day happy man wine tool weekend time open drink


In [7]:
## Creating Corpus for tokenization
corpus = []
for tweet in train['tweet']:
    corpus.append(tweet)       

In [8]:
# Creating unique key value pair with tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1



In [9]:
# Preparing the data for training by padding the sequence uniformly

max_seq_len = max([len(i) for i in corpus])
input_sequences = []
for tweet in corpus:
    token_list = tokenizer.texts_to_sequences([tweet])[0] # converting words to corresponding values from 'word_index'
    input_sequences.append(token_list)
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')) # padding

In [10]:
print(input_sequences)

[[    0     0     0 ...   123  7110   295]
 [    0     0     0 ...  7112 13996  9094]
 [    0     0     0 ...     0    17  3027]
 ...
 [    0     0     0 ...  6870    24    65]
 [    0     0     0 ...  1449  1450   459]
 [    0     0     0 ...     0   103    86]]


In [11]:
labels = np.array(train['label'])
print(len(labels))

31962


# Train and Test Split

In [12]:
# Splitting the data into training and validation

split = int(.2 * len(corpus)) # 20% for validation
rand_row_num = np.random.randint(0, len(corpus), split)

X_test = np.array([input_sequences[i] for i in rand_row_num])
y_test = np.array([labels[i] for i in rand_row_num])

X_train = np.delete(input_sequences, rand_row_num, axis=0)
y_train = np.delete(labels, rand_row_num, axis=0)

In [13]:
y_train

array([0, 0, 0, ..., 0, 1, 0])

In [14]:
y_train = ku.to_categorical(y_train, num_classes=2)
y_test = ku.to_categorical(y_test, num_classes=2)

In [15]:
print(y_train)

[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [0. 1.]
 [1. 0.]]


# Model 

In [16]:
# Model and Arcitecture

model = tf.keras.Sequential([
    layers.Embedding(total_words, 5, input_length = max_seq_len),
    layers.Bidirectional(layers.LSTM(8, return_sequences= True)),
    layers.Dropout(0.2),
    layers.LSTM(8),
    layers.Dense(8, activation = "relu"),
    layers.Dense(2, activation = "softmax")])


User settings:

   KMP_AFFINITY=granularity=fine,verbose,compact,1,0
   KMP_BLOCKTIME=0
   KMP_DUPLICATE_LIB_OK=True
   KMP_INIT_AT_FORK=FALSE
   KMP_SETTINGS=1

Effective settings:

   KMP_ABORT_DELAY=0
   KMP_ADAPTIVE_LOCK_PROPS='1,1024'
   KMP_ALIGN_ALLOC=64
   KMP_ALL_THREADPRIVATE=128
   KMP_ATOMIC_MODE=2
   KMP_BLOCKTIME=0
   KMP_CPUINFO_FILE: value is not defined
   KMP_DETERMINISTIC_REDUCTION=false
   KMP_DEVICE_THREAD_LIMIT=2147483647
   KMP_DISP_NUM_BUFFERS=7
   KMP_DUPLICATE_LIB_OK=true
   KMP_ENABLE_TASK_THROTTLING=true
   KMP_FORCE_REDUCTION: value is not defined
   KMP_FOREIGN_THREADS_THREADPRIVATE=true
   KMP_FORKJOIN_BARRIER='2,2'
   KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper'
   KMP_GTID_MODE=3
   KMP_HANDLE_SIGNALS=false
   KMP_HOT_TEAMS_MAX_LEVEL=1
   KMP_HOT_TEAMS_MODE=0
   KMP_INIT_AT_FORK=true
   KMP_LIBRARY=throughput
   KMP_LOCK_KIND=queuing
   KMP_MALLOC_POOL_INCR=1M
   KMP_NUM_LOCKS_IN_BLOCK=1
   KMP_PLAIN_BARRIER='2,2'
   KMP_PLAIN_BARRIER_PATTERN='hyper,hype

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 125, 5)            183555    
_________________________________________________________________
bidirectional (Bidirectional (None, 125, 16)           896       
_________________________________________________________________
dropout (Dropout)            (None, 125, 16)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 8)                 800       
_________________________________________________________________
dense (Dense)                (None, 8)                 72        
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 18        
Total params: 185,341
Trainable params: 185,341
Non-trainable params: 0
__________________________________________________

In [18]:
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-6 * 10 ** epoch)

In [19]:
model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.BinaryCrossentropy(),
             metrics=['accuracy'])

In [20]:
history = model.fit(X_train, y_train, epochs=5, batch_size=64, callbacks=[lr_scheduler])

2022-01-06 06:08:56.631746: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [21]:
# Evaluating the validation set

model.evaluate(X_test, y_test, batch_size=64)



[0.10491311550140381, 0.9640175104141235]

# Predicting on test data

In [22]:
# Loading the test data

test = pd.read_csv("../input/twitter-sentiment-analysis-hatred-speech/test.csv")
test.head()


Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [23]:
test1 = test.copy()

In [24]:
test_tweets = []
for tweet in test['tweet']:
    token_list = tokenizer.texts_to_sequences([tweet])[0]
    test_tweets.append(token_list)
test_tweets = np.array(pad_sequences(test_tweets, maxlen=max_seq_len, padding='pre'))

In [25]:
print(test_tweets)

[[    0     0     0 ...  1165  6704   105]
 [    0     0     0 ...    21     9   284]
 [    0     0     0 ...   110    31   187]
 ...
 [    0     0     0 ...    35 11600   200]
 [    0     0     0 ...   602    24   263]
 [    0     0     0 ...  1427  2827  8327]]


In [26]:
pred = model.predict(test_tweets)
print(pred)

[[9.6991694e-01 3.0083051e-02]
 [6.4476269e-01 3.5523722e-01]
 [9.9998903e-01 1.1015830e-05]
 ...
 [7.4142301e-01 2.5857702e-01]
 [9.9999344e-01 6.4978190e-06]
 [9.9993551e-01 6.4458371e-05]]


In [27]:
pre = []
for p in pred:
    pre.append(np.argmax(p))
pre = np.array(pre)

In [28]:
print(pre)

[0 0 0 ... 0 0 0]


In [29]:
result = pd.DataFrame(pre, columns = ['label'])

In [30]:
result['label'].value_counts()

0    16349
1      848
Name: label, dtype: int64

In [31]:
result['tweet']= test1['tweet']
result

Unnamed: 0,label,tweet
0,0,#studiolife #aislife #requires #passion #dedic...
1,0,@user #white #supremacists want everyone to s...
2,0,safe ways to heal your #acne!! #altwaystohe...
3,0,is the hp and the cursed child book up for res...
4,0,"3rd #bihday to my amazing, hilarious #nephew..."
...,...,...
17192,1,thought factory: left-right polarisation! #tru...
17193,0,feeling like a mermaid ð #hairflip #neverre...
17194,0,#hillary #campaigned today in #ohio((omg)) &am...
17195,0,"happy, at work conference: right mindset leads..."


In [32]:
print(test['tweet'][0])

#studiolife #aislife #requires #passion #dedication #willpower   to find #newmaterialsâ¦ 


In [33]:
result["id"] = test1["id"]

In [34]:
result.loc[result["label"]==1]

Unnamed: 0,label,tweet,id
19,1,thought factory: bbc neutrality on right wing ...,31982
26,1,chick gets fucked hottest naked lady,31989
33,1,suppo the #taiji fisherman! no bullying! no ra...,31996
34,1,i say we because i'm speaking collectively. i'...,31997
110,1,hey @user - a $14000 ivanka bracelet? do you f...,32073
...,...,...,...
17128,1,@user the uk governmentâs new #anti-semitism...,49091
17148,1,we grew up fucked upð¤ its fucked upð¥ i'm...,49111
17176,1,@user @user are the most racist pay ever!!!!!,49139
17188,1,"black professor demonizes, proposes nazi style...",49151


In [35]:
print(test['tweet'][75])

 @user thrilled to be working with @user over the coming months - more announcements very soon #2faceddancecompany 


In [36]:
import re
import string
from nltk.corpus import stopwords
from keras.preprocessing import sequence
stopword=set(stopwords.words('english'))
stemmer = nltk.SnowballStemmer("english")

In [37]:
test_data = 'I love my country'
def clean_text(text):
    print(text)
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    print(text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text
test_data=[clean_text(test_data)]
print(test_data)
seq = tokenizer.texts_to_sequences(test_data)
padded = sequence.pad_sequences(seq, maxlen=300)
print(seq)
predi = model.predict(padded)
print("predi", predi)
if predi[0][1]<0.5:
    print("no hate")
else:
    print("hate and abusive")

I love my country
i love my country
['love countri']
[[145]]
predi [[0.5565662  0.44343385]]
no hate
