# Importing the libraries

In [92]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from keras.preprocessing.text import one_hot
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Reading dataset as a dataframe

In [2]:
# Load the training data. keep_default_na=False turns off pandas' default
# NaN markers, so text such as "NA"/"null" and empty fields are kept as strings.
# NOTE(review): the recorded output reports nulls in user_description, which
# suggests it came from a run without this flag - re-run the cell to refresh it.
dataset = pd.read_csv('train.csv', keep_default_na=False)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tweet_id          3000 non-null   float64
 1   user_id           3000 non-null   float64
 2   tweet_text        3000 non-null   object 
 3   followers_count   3000 non-null   int64  
 4   following_count   3000 non-null   int64  
 5   tweet_count       3000 non-null   int64  
 6   listed_count      3000 non-null   int64  
 7   hashtags          3000 non-null   int64  
 8   mentions          3000 non-null   int64  
 9   user_description  2722 non-null   object 
 10  retweet_count     3000 non-null   int64  
 11  reply_count       3000 non-null   int64  
 12  like_count        3000 non-null   int64  
 13  quote_count       3000 non-null   int64  
 14  created_at        3000 non-null   object 
 15  label             3000 non-null   int64  
dtypes: float64(2), int64(11), object(3)
memory

# Removing irrelevant features
**TODO:** Explain why `tweet_id`, `user_id`, `created_at`, `hashtags` and `mentions` are removed from the dataset.

---





In [3]:
# Drop the listed columns and rebind `dataset` to the trimmed frame.
# NOTE(review): because `dataset` is rebound, running this cell a second time
# raises KeyError (the columns are already gone); reload the CSV first.
dataset = dataset.drop(columns=['tweet_id', 'user_id', 'created_at', 'hashtags', 'mentions'])

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   tweet_text        3000 non-null   object
 1   followers_count   3000 non-null   int64 
 2   following_count   3000 non-null   int64 
 3   tweet_count       3000 non-null   int64 
 4   listed_count      3000 non-null   int64 
 5   hashtags          3000 non-null   int64 
 6   mentions          3000 non-null   int64 
 7   user_description  2722 non-null   object
 8   retweet_count     3000 non-null   int64 
 9   reply_count       3000 non-null   int64 
 10  like_count        3000 non-null   int64 
 11  quote_count       3000 non-null   int64 
 12  label             3000 non-null   int64 
dtypes: int64(11), object(2)
memory usage: 304.8+ KB


In [4]:
# Convert null values in dataset.user_description to empty strings
def replace_nan_with_empty_str(s):
    """Return `s` unchanged when it is a str; return '' for any other value
    (NaN, None, numbers, ...)."""
    return s if isinstance(s, str) else ''

dataset['user_description'] = dataset['user_description'].apply(replace_nan_with_empty_str)
# Series.info() prints its own report and returns None, so passing it to
# print() emitted a stray "None" in front of the series. Call it directly and
# let the cell's last expression display the column.
dataset['user_description'].info()
dataset['user_description']

<class 'pandas.core.series.Series'>
RangeIndex: 3000 entries, 0 to 2999
Series name: user_description
Non-Null Count  Dtype 
--------------  ----- 
3000 non-null   object
dtypes: object(1)
memory usage: 23.6+ KB
None 0                 i believe in pixy and kingdom supremacy
1       bot made by @GNCbinary ! I am programmed to re...
2       author | rep. @MaximusLiterary |\nomnist | Pan...
3                 loving #StrayKids is undeniable \n👁️👄👁️
4       I am a happy-go-lucky kind of man. Happy to ch...
                              ...                        
2995    natasha romanoff with a lesbian flag wrapped a...
2996                           #우주소녀’s psych ward escapee
2997    🇧🇷 🇨🇦 • Achillean 💚 • Proship 🌈🛳 • QUEER & TRA...
2998                                                     
2999    📚🏒 🏉 🥃 💪 Daisies are my favorite flower. Humor...
Name: user_description, Length: 3000, dtype: object


In [5]:
def apply(cols, func, df):
    """Overwrite each listed column of `df` with `func` applied to its values.

    cols: iterable of column labels to transform.
    func: callable taking a single value and returning a single value.
    df:   DataFrame that is modified in place; nothing is returned.
    """
    for column_name in cols:
        transformed = df[column_name].apply(func)
        df[column_name] = transformed

def make_csv(cols, file_name, frame):
    """Write the selected columns of `frame` to '<file_name>.csv'.

    cols:      sequence of column labels to export; when empty, no file is written.
    file_name: output path without the '.csv' extension.
    frame:     source DataFrame (left unmodified).

    The frame's index is written as the first CSV column.
    """
    if len(cols) == 0:
        return

    # dict.fromkeys drops repeated labels while keeping first-seen order, so a
    # label listed twice is exported once.
    selected = list(dict.fromkeys(cols))
    frame[selected].to_csv(f'{file_name}.csv')

# Text Preprocessing
**TODO:** Describe each text-preprocessing technique used below in its own text block.

 ## Removing URLs

In [6]:
def remove_urls(text):
    """Return `text` with every URL-like substring deleted.

    The case-insensitive pattern recognises strings starting with http(s)://,
    with www., or with a bare 'domain.tld/' prefix.
    """
    url_pattern = re.compile(
        r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    )
    return url_pattern.sub('', text)


# Strip URLs from both free-text columns (apply() overwrites the columns of
# `dataset` in place), then show the two cleaned columns.
apply(['tweet_text', 'user_description'], remove_urls, dataset)
display(dataset['tweet_text'], dataset['user_description'])

0                                         i hate my life 
1       There are notable differences between us, but ...
2       I keep having this reoccurring bad dream. It a...
3       I dont want to make yall think im seeking for ...
4       @Andyb56 @ashlea_robyn People like you dont de...
                              ...                        
2995       colin gets to come home to her i hate my life 
2996    exy pm:\nbut the fans who subscribe to WJSN\ny...
2997    Radical inclusivity made me a genuinely better...
2998    #JUSTICEforNEETUG we want justice! @DG_NTA you...
2999    Surviving #Droughtlander ~ Day 44\n\nLet’s dis...
Name: tweet_text, Length: 3000, dtype: object

0                 i believe in pixy and kingdom supremacy
1       bot made by @GNCbinary ! I am programmed to re...
2       author | rep. @MaximusLiterary |\nomnist | Pan...
3                 loving #StrayKids is undeniable \n👁️👄👁️
4       I am a happy-go-lucky kind of man. Happy to ch...
                              ...                        
2995    natasha romanoff with a lesbian flag wrapped a...
2996                           #우주소녀’s psych ward escapee
2997    🇧🇷 🇨🇦 • Achillean 💚 • Proship 🌈🛳 • QUEER & TRA...
2998                                                     
2999    📚🏒 🏉 🥃 💪 Daisies are my favorite flower. Humor...
Name: user_description, Length: 3000, dtype: object

<h2> Removing mentions </h2>

In [84]:
def remove_mentions(text):
    """Return `text` without @-mentions.

    A mention is '@' followed by 4 to 15 letters, digits or underscores.
    Shorter handles are left untouched; for longer runs only the '@' and the
    first 15 characters are removed.
    """
    mention_pattern = re.compile(r'@([A-Za-z0-9_]{4,15})')
    return mention_pattern.sub('', text)

# Remove @-mentions from both free-text columns in place, then show tweet_text.
apply(['tweet_text', 'user_description'], remove_mentions, dataset)
display(dataset['tweet_text'])

<h2> Removing hashtags </h2>

In [85]:
def remove_hashtags(text):
    """Return `text` with every hashtag ('#' plus the word characters that
    follow it) deleted; a lone '#' is kept."""
    hashtag_pattern = re.compile(r'#\w+')
    return hashtag_pattern.sub('', text)

# Remove hashtags from both free-text columns in place, then show tweet_text.
apply(['tweet_text', 'user_description'], remove_hashtags, dataset)
display(dataset['tweet_text'])

<h2> Emoji Filtering </h2>

In [88]:
import advertools as adv

def filter_emojis(col, new_col, df):
    """Move the emojis found in `df[col]` into a companion column.

    col:     name of the text column to clean (overwritten in place).
    new_col: name of the column that receives, per row, the distinct emojis
             of that row joined in sorted order ('' when the row has none).
    df:      DataFrame, modified in place; nothing is returned.
    """
    emoji_lists = adv.extract_emoji(df[col])['emoji']
    # No duplicates and always sorted, so the emoji field is deterministic.
    distinct_per_row = [sorted(set(found)) for found in emoji_lists]

    # Assign plain lists (positional) instead of a freshly indexed Series so
    # rows cannot be misaligned when `df` does not have a 0..n-1 index.
    df[new_col] = [''.join(found) for found in distinct_per_row]

    cleaned = []
    for text, found in zip(df[col], distinct_per_row):
        if found:
            # Remove each emoji as a whole, escaped alternative (longest first).
            # A character class built from the joined emojis would split
            # multi-codepoint emojis into single characters, so e.g. the
            # keycap "1\ufe0f\u20e3" would also delete every plain "1".
            alternatives = sorted(found, key=len, reverse=True)
            pattern = '|'.join(re.escape(emoji) for emoji in alternatives)
            text = re.sub(pattern, '', text)
        cleaned.append(text)
    df[col] = cleaned


# Strip emojis from each text column and keep them in a companion *_emoji column.
filter_emojis('tweet_text', 'tweet_text_emoji', dataset)
filter_emojis('user_description', 'user_description_emoji', dataset)

<h2> Translating non-english text to english </h2>

In [10]:
from deep_translator import GoogleTranslator

# source='auto' / target='en': translate whatever language is detected into English.
translator = GoogleTranslator(source='auto', target='en')
# translator.translate is invoked once per value of both columns.
# NOTE(review): presumably one web request per value, which makes this cell slow
# and network-dependent on a full re-run - consider caching the result. Confirm.
apply(['tweet_text', 'user_description'], translator.translate, dataset)
display(dataset)

Unnamed: 0,tweet_text,followers_count,following_count,tweet_count,listed_count,hashtags,mentions,user_description,retweet_count,reply_count,like_count,quote_count,label,tweet_text_emoji,user_description_emoji
0,i hate my life,605,474,6715,6,0,0,i believe in pixy and kingdom supremacy,33,7,123,10,1,,
1,"There are notable differences between us, but ...",5,1,23148,0,0,0,bot made by ! I am programmed to remind you t...,0,0,0,0,0,,🤖
2,I keep having this reoccurring bad dream. It a...,4324,4987,3039,30,0,0,author | rep. |\nomnist | PanWitch | \naddict...,1,1,8,0,1,🙃,✌🏼🔆
3,I dont want to make yall think im seeking for ...,7,56,995,0,0,0,loving is undeniable,0,1,0,0,1,,👁️👄
4,People like you dont deserve the country you w...,2918,4967,10043,3,0,1,I am a happy-go-lucky kind of man. Happy to ch...,0,0,3,0,0,,🇬🇧🇺🇸
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,colin gets to come home to her i hate my life,5247,2272,49356,51,0,0,natasha romanoff with a lesbian flag wrapped a...,0,0,24,0,0,,
2996,exy pm:\nbut the fans who subscribe to WJSN\ny...,2552,288,2545,9,0,0,’s psych ward escapee,87,0,306,24,0,,
2997,Radical inclusivity made me a genuinely better...,5529,311,95020,29,0,0,• Achillean • Proship • QUEER & TRANSGRESSIV...,49,1,177,0,0,,☢️🇧🇷🇨🇦🌈💚🔞🛳
2998,we want justice! you can't escape now !,7,6,319,0,1,1,,15,0,2,0,0,🔥,


<h2> Other text preprocessing </h2>

In [39]:
# Converts all letters to lowercase, stopword removal, stemming, removes everything except letters
from nltk.stem.porter import PorterStemmer

def preprocess_text(txt):
    """Normalise one text: keep letters only, lowercase, drop English stop
    words and stem the remaining tokens.

    txt: input string.
    Returns the surviving stemmed tokens joined by single spaces ('' if none).
    """
    stemmer = PorterStemmer()
    # Load the stop-word list once per call and keep it as a set. Looking it up
    # inside the comprehension reloaded the list and scanned it linearly for
    # every single token.
    stop_words = set(stopwords.words('english'))
    tokens = re.sub('[^a-zA-Z]', ' ', txt).lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

# Normalise both free-text columns in place, then show the whole frame.
apply(['tweet_text', 'user_description'], preprocess_text, dataset)
display(dataset)

Unnamed: 0,tweet_text,followers_count,following_count,tweet_count,listed_count,hashtags,mentions,user_description,retweet_count,reply_count,like_count,quote_count,label,tweet_text_emoji,user_description_emoji
0,hate life,605,474,6715,6,0,0,believ pixi kingdom supremaci,33,7,123,10,1,,
1,notabl differ us noth asham talent possibl save,5,1,23148,0,0,0,bot made program remind take care everi minut,0,0,0,0,0,,🤖
2,keep reoccur bad dream alway make sooo piss wa...,4324,4987,3039,30,0,0,author rep omnist panwitch addict life,1,1,8,0,1,🙃,✌🏼🔆
3,dont want make yall think im seek attent ye in...,7,56,995,0,0,0,love undeni,0,1,0,0,1,,👁️👄
4,peopl like dont deserv countri rais peopl like...,2918,4967,10043,3,0,1,happi go lucki kind man happi chat discuss iss...,0,0,3,0,0,,🇬🇧🇺🇸
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,colin get come home hate life,5247,2272,49356,51,0,0,natasha romanoff lesbian flag wrap around prid...,0,0,24,0,0,,
2996,exi pm fan subscrib wjsn subscrib artist free ...,2552,288,2545,9,0,0,psych ward escape,87,0,306,24,0,,
2997,radic inclus made genuin better person spend h...,5529,311,95020,29,0,0,achillean proship queer transgress art nsfw ch...,49,1,177,0,0,,☢️🇧🇷🇨🇦🌈💚🔞🛳
2998,want justic escap,7,6,319,0,1,1,,15,0,2,0,0,🔥,


<h1> Converting textual attributes to one hot representation vectors </h1>

In [71]:
# might need to be increased
voc_size = 10_000
# Only tweet_text feeds the corpus; user_description is left out for now.
corpus = dataset['tweet_text'].tolist()
corpus

['hate life',
 'notabl differ us noth asham talent possibl save',
 'keep reoccur bad dream alway make sooo piss wake instead morn b unreason one love like sit garag feel better',
 'dont want make yall think im seek attent ye inde overdramat ye insecur lack love hate admit want declar talk mom know unstabl yet ignor fact',
 'peopl like dont deserv countri rais peopl like turn blind eye industri scale sexual assault children place like rotherham sake divers million pound day spend hotel bill pfffft',
 'royan institut provid comprehens servic infertil treatment regen medicin cell therapi product recombin protein develop biolog product mighti scientist list world best',
 'modern calcul emoji feel numer discrimin make rad ical like texa instrument degre deg ener',
 'life updat think bro doesnt like much tire school feel un athlet hopeless think bout text someon text unemploy',
 'take moment appreci dissapoint thing life',
 'sad misogyni allow compass women lowest less r pist prosecut even w

In [72]:
# Hash every word of each document to an integer index below voc_size.
# NOTE(review): per the Keras docs, one_hot relies on Python's built-in hash(),
# which is not stable across interpreter sessions - the indices (and any model
# trained on them) are presumably only valid within the current kernel. Confirm.
onehot_repr=[one_hot(txt,voc_size) for txt in corpus]
onehot_repr

[[5972, 5746],
 [6315, 637, 1009, 1944, 3062, 882, 2552, 2334],
 [9885,
  9373,
  8590,
  2036,
  4870,
  7560,
  7869,
  7779,
  323,
  4834,
  7006,
  7116,
  5661,
  6170,
  7862,
  605,
  6342,
  2528,
  9106,
  6269],
 [8753,
  2072,
  7560,
  9534,
  5740,
  539,
  8161,
  9042,
  2320,
  760,
  7881,
  2320,
  9481,
  3446,
  7862,
  5972,
  4978,
  2072,
  3188,
  3518,
  9426,
  2825,
  9455,
  6517,
  7596,
  737],
 [8571,
  605,
  8753,
  8951,
  5261,
  5285,
  8571,
  605,
  1168,
  8762,
  1345,
  2067,
  9036,
  883,
  5645,
  4560,
  1620,
  605,
  6516,
  9854,
  9339,
  2892,
  3269,
  325,
  4084,
  8347,
  3171,
  4914],
 [6252,
  5385,
  823,
  7674,
  8704,
  2594,
  3227,
  159,
  6561,
  7648,
  7077,
  5581,
  7741,
  1796,
  1307,
  3345,
  5581,
  5610,
  13,
  3439,
  3517,
  5226],
 [8974,
  6327,
  7039,
  9106,
  1707,
  168,
  7560,
  9280,
  5749,
  605,
  5250,
  6266,
  5625,
  7554,
  6581],
 [5746,
  8627,
  5740,
  2150,
  4307,
  605,
  9277,
  70

In [73]:
# Bring every index sequence to a fixed length of sent_length entries;
# padding='pre' puts the filler values in front of shorter sequences.
sent_length=70
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ...    0 5972 5746]
 [   0    0    0 ...  882 2552 2334]
 [   0    0    0 ... 2528 9106 6269]
 ...
 [   0    0    0 ... 9340 4855 2072]
 [   0    0    0 ... 2072 6894 8813]
 [   0    0    0 ... 3778 9478 3594]]


<h1> Implementing and training the model </h1>

In [81]:
## Creating model
# Embedding -> bidirectional LSTM -> one sigmoid unit, compiled for binary
# classification with binary cross-entropy.
embedding_vector_feature_cnt=40    # might need to be adjusted
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_feature_cnt,input_length=sent_length))
model.add(Bidirectional(LSTM(100)))    # might need to be adjusted
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# appended a stray "None" line to the output.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 70, 40)            400000    
                                                                 
 bidirectional (Bidirectiona  (None, 200)              112800    
 l)                                                              
                                                                 
 dense_1 (Dense)             (None, 1)                 201       
                                                                 
Total params: 513,001
Trainable params: 513,001
Non-trainable params: 0
_________________________________________________________________
None


In [82]:
# Model training
# Hold out 10% of the rows (fixed random_state for a repeatable split) and use
# that hold-out as the validation data reported after each epoch.
X_train, X_test, y_train, y_test = train_test_split(np.array(embedded_docs), np.array(dataset['label']), test_size=0.1, random_state=42)
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=15,batch_size=64)
# The output layer is a single sigmoid unit trained with binary cross-entropy
# (not softmax): it yields a continuous score in [0, 1] for the 0/1 label, so a
# class is only obtained by thresholding that score.

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x1dcf463bf10>

<h1> Generating test data </h1>

In [89]:
# Load the test file and run tweet_text through the same cleaning steps, in the
# same order, as the training data.
test_data = pd.read_csv('test.csv', keep_default_na=False)

apply(['tweet_text'], replace_nan_with_empty_str, test_data)
apply(['tweet_text'], remove_urls, test_data)
apply(['tweet_text'], remove_mentions, test_data)
apply(['tweet_text'], remove_hashtags, test_data)
filter_emojis('tweet_text', 'tweet_text_emoji', test_data)
apply(['tweet_text'], translator.translate, test_data)
apply(['tweet_text'], preprocess_text, test_data)

# NOTE(review): these two assignments overwrite the training-time onehot_repr /
# embedded_docs, so the training arrays are no longer available after this cell.
onehot_repr=[one_hot(txt,voc_size) for txt in test_data['tweet_text']]
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
embedded_docs

array([[   0,    0,    0, ..., 2542,  325,  325],
       [   0,    0,    0, ..., 6304, 5197, 9924],
       [   0,    0,    0, ..., 2225, 4386, 9241],
       ...,
       [   0,    0,    0, ..., 8096, 7374, 2225],
       [   0,    0,    0, ..., 4371, 4593, 3428],
       [   0,    0,    0, ..., 4182, 5972, 3456]])

In [102]:
# Score the test sequences, then threshold at 0.5:
# scores below 0.5 become class 0, everything else class 1.
y_pred = model.predict(embedded_docs)
predictions = np.where(y_pred.ravel() < 0.5, 0, 1).tolist()

confusion_matrix(np.array(test_data['label']), predictions)



array([[192,  11],
       [ 61,  36]], dtype=int64)

In [104]:
# Share of test rows whose thresholded prediction matches the label.
accuracy_score(np.array(test_data['label']),predictions)

0.76