# Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from keras.utils import pad_sequences
from keras.layers import Input
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Bidirectional
from keras.layers import concatenate
from keras.layers import Dropout
from keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Reading dataset as a dataframe

In [2]:
dataset = pd.read_csv('train.csv', keep_default_na=False)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tweet_id          3000 non-null   float64
 1   user_id           3000 non-null   float64
 2   tweet_text        3000 non-null   object 
 3   followers_count   3000 non-null   int64  
 4   following_count   3000 non-null   int64  
 5   tweet_count       3000 non-null   int64  
 6   listed_count      3000 non-null   int64  
 7   hashtags          3000 non-null   int64  
 8   mentions          3000 non-null   int64  
 9   user_description  3000 non-null   object 
 10  retweet_count     3000 non-null   int64  
 11  reply_count       3000 non-null   int64  
 12  like_count        3000 non-null   int64  
 13  quote_count       3000 non-null   int64  
 14  created_at        3000 non-null   object 
 15  label             3000 non-null   int64  
dtypes: float64(2), int64(11), object(3)
memory

# Removing irrelevant features
Write an explanation for removing these features.

---





In [3]:
dataset = dataset.drop(columns=['tweet_id', 'user_id', 'created_at', 'hashtags', 'mentions'])

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   tweet_text        3000 non-null   object
 1   followers_count   3000 non-null   int64 
 2   following_count   3000 non-null   int64 
 3   tweet_count       3000 non-null   int64 
 4   listed_count      3000 non-null   int64 
 5   user_description  3000 non-null   object
 6   retweet_count     3000 non-null   int64 
 7   reply_count       3000 non-null   int64 
 8   like_count        3000 non-null   int64 
 9   quote_count       3000 non-null   int64 
 10  label             3000 non-null   int64 
dtypes: int64(9), object(2)
memory usage: 257.9+ KB


In [4]:
# Convert null values in dataset.user_description to empty strings
def replace_nan_with_empty_str(s):
    if isinstance(s, str): return s
    else: return ''

dataset['user_description'] = dataset['user_description'].apply(replace_nan_with_empty_str)
print(dataset['user_description'].info(), dataset['user_description'])

<class 'pandas.core.series.Series'>
RangeIndex: 3000 entries, 0 to 2999
Series name: user_description
Non-Null Count  Dtype 
--------------  ----- 
3000 non-null   object
dtypes: object(1)
memory usage: 23.6+ KB
None 0                 i believe in pixy and kingdom supremacy
1       bot made by @GNCbinary ! I am programmed to re...
2       author | rep. @MaximusLiterary |\nomnist | Pan...
3                 loving #StrayKids is undeniable \n👁️👄👁️
4       I am a happy-go-lucky kind of man. Happy to ch...
                              ...                        
2995    natasha romanoff with a lesbian flag wrapped a...
2996                           #우주소녀’s psych ward escapee
2997    🇧🇷 🇨🇦 • Achillean 💚 • Proship 🌈🛳 • QUEER & TRA...
2998                                                     
2999    📚🏒 🏉 🥃 💪 Daisies are my favorite flower. Humor...
Name: user_description, Length: 3000, dtype: object


In [5]:
# func is a function with single input and a single ouput
def apply(cols, func, df):
    for col in cols:
        df[col] = df[col].apply(func)

def make_csv(cols, file_name, frame):
    df = None
    if len(cols)!=0:
        df = frame[cols[0]].to_frame()
        # for col in cols:
        #     df[col] = frame[col]
        for i in range(1, len(cols)):
            df[cols[i]] = frame[cols[i]]

        df.to_csv(f'{file_name}.csv')

# Text Preprocessing
Write about various techniques used in it by using various text blocks

 ## Removing URLs

In [6]:
def remove_urls(text):
    url = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

    return re.sub(url,'', text)


apply(['tweet_text', 'user_description'], remove_urls, dataset)
display(dataset['tweet_text'], dataset['user_description'])

0                                         i hate my life 
1       There are notable differences between us, but ...
2       I keep having this reoccurring bad dream. It a...
3       I dont want to make yall think im seeking for ...
4       @Andyb56 @ashlea_robyn People like you dont de...
                              ...                        
2995       colin gets to come home to her i hate my life 
2996    exy pm:\nbut the fans who subscribe to WJSN\ny...
2997    Radical inclusivity made me a genuinely better...
2998    #JUSTICEforNEETUG we want justice! @DG_NTA you...
2999    Surviving #Droughtlander ~ Day 44\n\nLet’s dis...
Name: tweet_text, Length: 3000, dtype: object

0                 i believe in pixy and kingdom supremacy
1       bot made by @GNCbinary ! I am programmed to re...
2       author | rep. @MaximusLiterary |\nomnist | Pan...
3                 loving #StrayKids is undeniable \n👁️👄👁️
4       I am a happy-go-lucky kind of man. Happy to ch...
                              ...                        
2995    natasha romanoff with a lesbian flag wrapped a...
2996                           #우주소녀’s psych ward escapee
2997    🇧🇷 🇨🇦 • Achillean 💚 • Proship 🌈🛳 • QUEER & TRA...
2998                                                     
2999    📚🏒 🏉 🥃 💪 Daisies are my favorite flower. Humor...
Name: user_description, Length: 3000, dtype: object

<h2> Removing mentions </h2>

In [7]:
def remove_mentions(text):
    return re.sub(r'@([A-Za-z0-9_]{4,15})', '', text)

apply(['tweet_text', 'user_description'], remove_mentions, dataset)
display(dataset['tweet_text'])

0                                         i hate my life 
1       There are notable differences between us, but ...
2       I keep having this reoccurring bad dream. It a...
3       I dont want to make yall think im seeking for ...
4         People like you dont deserve the country you...
                              ...                        
2995       colin gets to come home to her i hate my life 
2996    exy pm:\nbut the fans who subscribe to WJSN\ny...
2997    Radical inclusivity made me a genuinely better...
2998    #JUSTICEforNEETUG we want justice!  you can't ...
2999    Surviving #Droughtlander ~ Day 44\n\nLet’s dis...
Name: tweet_text, Length: 3000, dtype: object

<h2> Removing hashtags </h2>

In [8]:
def remove_hashtags(text):
    return re.sub(r'#\w+', '', text)

apply(['tweet_text', 'user_description'], remove_hashtags, dataset)
display(dataset['tweet_text'])

0                                         i hate my life 
1       There are notable differences between us, but ...
2       I keep having this reoccurring bad dream. It a...
3       I dont want to make yall think im seeking for ...
4         People like you dont deserve the country you...
                              ...                        
2995       colin gets to come home to her i hate my life 
2996    exy pm:\nbut the fans who subscribe to WJSN\ny...
2997    Radical inclusivity made me a genuinely better...
2998           we want justice!  you can't escape now ! 🔥
2999    Surviving  ~ Day 44\n\nLet’s discuss this scen...
Name: tweet_text, Length: 3000, dtype: object

<h2> Removing emojis </h2>

In [9]:
import advertools as adv

def remove_emojis(text):
    emojis = adv.extract_emoji([text])['emoji'][0]
    if len(emojis) == 0: return text
    else:
        return re.compile(''.join(set(emojis))).sub('', text)

apply(['tweet_text', 'user_description'], remove_emojis, dataset)
display(dataset)

Unnamed: 0,tweet_text,followers_count,following_count,tweet_count,listed_count,user_description,retweet_count,reply_count,like_count,quote_count,label
0,i hate my life,605,474,6715,6,i believe in pixy and kingdom supremacy,33,7,123,10,1
1,"There are notable differences between us, but ...",5,1,23148,0,bot made by ! I am programmed to remind you t...,0,0,0,0,0
2,I keep having this reoccurring bad dream. It a...,4324,4987,3039,30,author | rep. |\nomnist | PanWitch | \naddict...,1,1,8,0,1
3,I dont want to make yall think im seeking for ...,7,56,995,0,loving is undeniable \n👁️,0,1,0,0,1
4,People like you dont deserve the country you...,2918,4967,10043,3,I am a happy-go-lucky kind of man. Happy to ch...,0,0,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2995,colin gets to come home to her i hate my life,5247,2272,49356,51,natasha romanoff with a lesbian flag wrapped a...,0,0,24,0,0
2996,exy pm:\nbut the fans who subscribe to WJSN\ny...,2552,288,2545,9,’s psych ward escapee,87,0,306,24,0
2997,Radical inclusivity made me a genuinely better...,5529,311,95020,29,🇧🇷 🇨🇦 • Achillean 💚 • Proship 🌈🛳 • QUEER & TRA...,49,1,177,0,0
2998,we want justice! you can't escape now !,7,6,319,0,,15,0,2,0,0


<h2> Translating non-english text to english </h2>

In [16]:
from deep_translator import GoogleTranslator

translator = GoogleTranslator(source='auto', target='en')
apply(['tweet_text', 'user_description'], translator.translate, dataset)
display(dataset)

Unnamed: 0,tweet_text,followers_count,following_count,tweet_count,listed_count,user_description,retweet_count,reply_count,like_count,quote_count,label
0,i hate my life,605,474,6715,6,i believe in pixy and kingdom supremacy,33,7,123,10,1
1,"There are notable differences between us, but ...",5,1,23148,0,bot made by ! I am programmed to remind you t...,0,0,0,0,0
2,I keep having this reoccurring bad dream. It a...,4324,4987,3039,30,author | rep. |\nomnist | PanWitch | \naddict...,1,1,8,0,1
3,I dont want to make yall think im seeking for ...,7,56,995,0,loving is undeniable,0,1,0,0,1
4,People like you dont deserve the country you w...,2918,4967,10043,3,I am a happy-go-lucky kind of man. Happy to ch...,0,0,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2995,colin gets to come home to her i hate my life,5247,2272,49356,51,natasha romanoff with a lesbian flag wrapped a...,0,0,24,0,0
2996,exy pm:\nbut the fans who subscribe to WJSN\ny...,2552,288,2545,9,’s psych ward escapee,87,0,306,24,0
2997,Radical inclusivity made me a genuinely better...,5529,311,95020,29,• Achillean • Proship • QUEER & TRANSGRESSIV...,49,1,177,0,0
2998,we want justice! you can't escape now !,7,6,319,0,,15,0,2,0,0


<h2> Other text preprocessing </h2>

In [17]:
# Converts all letters to lowercase, stopword removal, stemming, removes everything except letters
from nltk.stem.porter import PorterStemmer

def preprocess_text(txt):
    if not txt: return txt
    ps = PorterStemmer()
    review = re.sub('[^a-zA-Z]', ' ', txt).lower().split()
    return ' '.join([ps.stem(word) for word in review if not word in stopwords.words('english')])

apply(['tweet_text', 'user_description'], preprocess_text, dataset)
display(dataset)

Unnamed: 0,tweet_text,followers_count,following_count,tweet_count,listed_count,user_description,retweet_count,reply_count,like_count,quote_count,label
0,hate life,605,474,6715,6,believ pixi kingdom supremaci,33,7,123,10,1
1,notabl differ us noth asham talent possibl save,5,1,23148,0,bot made program remind take care everi minut,0,0,0,0,0
2,keep reoccur bad dream alway make sooo piss wa...,4324,4987,3039,30,author rep omnist panwitch addict life,1,1,8,0,1
3,dont want make yall think im seek attent ye in...,7,56,995,0,love undeni,0,1,0,0,1
4,peopl like dont deserv countri rais peopl like...,2918,4967,10043,3,happi go lucki kind man happi chat discuss iss...,0,0,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2995,colin get come home hate life,5247,2272,49356,51,natasha romanoff lesbian flag wrap around prid...,0,0,24,0,0
2996,exi pm fan subscrib wjsn subscrib artist free ...,2552,288,2545,9,psych ward escape,87,0,306,24,0
2997,radic inclus made genuin better person spend h...,5529,311,95020,29,achillean proship queer transgress art nsfw ch...,49,1,177,0,0
2998,want justic escap,7,6,319,0,,15,0,2,0,0


<h1> Creating a corpus </h1>

In [18]:
corpus, voc_size = {'': 0}, 1
sentences = list(dataset['tweet_text']) + list(dataset['user_description'])
for sentence in sentences:
    sentence = sentence.split()
    for word in sentence:
        if word not in corpus.keys():
            corpus[word] = voc_size
            voc_size+=1

print(corpus, voc_size)

{'': 0, 'hate': 1, 'life': 2, 'notabl': 3, 'differ': 4, 'us': 5, 'noth': 6, 'asham': 7, 'talent': 8, 'possibl': 9, 'save': 10, 'keep': 11, 'reoccur': 12, 'bad': 13, 'dream': 14, 'alway': 15, 'make': 16, 'sooo': 17, 'piss': 18, 'wake': 19, 'instead': 20, 'morn': 21, 'b': 22, 'unreason': 23, 'one': 24, 'love': 25, 'like': 26, 'sit': 27, 'garag': 28, 'feel': 29, 'better': 30, 'dont': 31, 'want': 32, 'yall': 33, 'think': 34, 'im': 35, 'seek': 36, 'attent': 37, 'ye': 38, 'inde': 39, 'overdramat': 40, 'insecur': 41, 'lack': 42, 'admit': 43, 'declar': 44, 'talk': 45, 'mom': 46, 'know': 47, 'unstabl': 48, 'yet': 49, 'ignor': 50, 'fact': 51, 'peopl': 52, 'deserv': 53, 'countri': 54, 'rais': 55, 'turn': 56, 'blind': 57, 'eye': 58, 'industri': 59, 'scale': 60, 'sexual': 61, 'assault': 62, 'children': 63, 'place': 64, 'rotherham': 65, 'sake': 66, 'divers': 67, 'million': 68, 'pound': 69, 'day': 70, 'spend': 71, 'hotel': 72, 'bill': 73, 'pfffft': 74, 'royan': 75, 'institut': 76, 'provid': 77, 'comp

<h1> Converting textual attributes to one hot representation </h1>

In [19]:
sentence_len = 70
def one_hot_encoder(sentence, corpus=corpus, sentence_len=70):
    sentence = sentence.split()
    one_hot_encoding = []
    for word in sentence:
        if word in corpus.keys():
            one_hot_encoding.append(corpus[word])
        else:
            one_hot_encoding.append(0)

    return pad_sequences([one_hot_encoding], padding='pre', maxlen=sentence_len)[0]

apply(['tweet_text', 'user_description'], one_hot_encoder, dataset)
display(dataset['tweet_text'], dataset['user_description'])

0       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                              ...                        
2995    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2996    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2997    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2998    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2999    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: tweet_text, Length: 3000, dtype: object

0       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                              ...                        
2995    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2996    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2997    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2998    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2999    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: user_description, Length: 3000, dtype: object

<h1> Implementing the model </h1>

In [20]:
embedding_vector_feature_cnt=40    # might need to be adjusted
# LSTM to process tweet_text attribute
tweet_text_inputs = Input(shape=(70,))
x = Embedding(voc_size,embedding_vector_feature_cnt,input_length=sentence_len)(tweet_text_inputs)
x = Dropout(0.5)(x)
x = Bidirectional(LSTM(100))(x)
x = Dropout(0.5)(x)
tweet_text_lstm = Model(inputs=tweet_text_inputs, outputs=x)

# LSTM to process user_description attribute
user_description_inputs = Input(shape=(70,))
x = Embedding(voc_size,embedding_vector_feature_cnt,input_length=sentence_len)(user_description_inputs)
x = Dropout(0.5)(x)
x = Bidirectional(LSTM(100))(x)
x = Dropout(0.5)(x)
user_description_lstm = Model(inputs=user_description_inputs, outputs=x)

# Final layers to handle rest of the inputs and lstm outputs
rest_inputs = Input(shape=(8,))
combined = concatenate([tweet_text_lstm.output, user_description_lstm.output, rest_inputs])
x = Dense(200, activation='relu')(combined)
x = Dropout(0.5)(x)
x = Dense(100, activation='relu')(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(25, activation='relu')(x)
x = Dense(10, activation='relu')(x)
x = Dense(5, activation='relu')(x)
x = Dense(1, activation='sigmoid')(x)
model = Model(inputs=[tweet_text_lstm.input, user_description_lstm.input, rest_inputs], outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 70)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 70)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 70, 40)       417240      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 70, 40)       417240      ['input_2[0][0]']                
                                                                                            

<h1> Performing validation split </h1>

In [21]:
text_data = dataset['tweet_text'].to_frame()
text_data['user_description'] = dataset['user_description']
X_train, X_test, text_train, text_test = train_test_split(dataset.drop(columns=['tweet_text', 'user_description']), text_data, test_size=0.1, random_state=42)
y_train, y_test = X_train['label'], X_test['label']
X_train, X_test = X_train.drop(columns=['label']), X_test.drop(columns=['label'])

def type_correction(col, df):
    l = []
    for val in df[col]:
        l.append(list(val))

    return np.array(l)

tweets_train = type_correction('tweet_text', text_train)
tweets_test = type_correction('tweet_text', text_test)
desc_train = type_correction('user_description', text_train)
desc_test = type_correction('user_description', text_test)

In [22]:
# X_train_copy, X_test_copy = X_train.copy(), X_test.copy()

for col in X_train.columns:
    X_train[col] = X_train[col] / X_train[col].max()

for col in X_test.columns:
    X_test[col] = X_test[col] / X_test[col].max()

X_train

Unnamed: 0,followers_count,following_count,tweet_count,listed_count,retweet_count,reply_count,like_count,quote_count
433,1.602484e-04,0.006196,9.518387e-03,0.000148,0.000344,0.002764,0.000339,0.000000
1151,1.478567e-04,0.011578,8.014144e-03,0.000000,0.000000,0.000184,0.000023,0.000000
73,1.128436e-05,0.004388,6.509078e-03,0.000074,0.000246,0.000184,0.000090,0.000000
1536,5.431257e-06,0.000089,2.582929e-03,0.000059,0.000049,0.000921,0.000113,0.000597
2709,1.036157e-04,0.000761,1.016248e-03,0.000162,0.000000,0.000000,0.000098,0.000000
...,...,...,...,...,...,...,...,...
1638,4.000674e-04,0.005019,9.959555e-03,0.000634,0.000098,0.000000,0.000113,0.000000
1095,2.109226e-07,0.003342,9.743490e-04,0.000000,0.000000,0.000000,0.000000,0.000000
1130,2.583802e-06,0.000238,8.100406e-04,0.000015,0.000000,0.000184,0.000000,0.000000
1294,1.370997e-06,0.000214,1.100866e-04,0.000000,0.000000,0.000000,0.000030,0.000000


<h1> Training the model </h1>

In [23]:
model.fit(x = [tweets_train, desc_train, np.array(X_train)],
          y = y_train,
          validation_data=([tweets_test, desc_test, np.array(X_test)],y_test),
          epochs=15,
          batch_size=64)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x199fd53fd10>

<h1> Generating test data </h1>

In [24]:
test_data = pd.read_csv('test.csv', keep_default_na=False)
test_data = test_data.drop(columns=['tweet_id', 'user_id', 'created_at', 'hashtags', 'mentions'])
apply(['tweet_text', 'user_description'], replace_nan_with_empty_str, test_data)
apply(['tweet_text', 'user_description'], remove_urls, test_data)
apply(['tweet_text', 'user_description'], remove_mentions, test_data)
apply(['tweet_text', 'user_description'], remove_hashtags, test_data)
apply(['tweet_text', 'user_description'], remove_emojis, test_data)
apply(['tweet_text', 'user_description'], translator.translate, test_data)
# b = test_data.copy()
apply(['tweet_text', 'user_description'], replace_nan_with_empty_str, test_data)
apply(['tweet_text', 'user_description'], preprocess_text, test_data)
apply(['tweet_text', 'user_description'], one_hot_encoder, test_data)
display(test_data)

Unnamed: 0,tweet_text,followers_count,following_count,tweet_count,listed_count,user_description,retweet_count,reply_count,like_count,quote_count,label
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",606,245,22694,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,3,0,0
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",97,93,62613,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,1,1,0,0
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",483,124,20602,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,0,4,0,1
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,96,16,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,1,0,0
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",669,693,9081,15,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
295,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2451,5007,353660,38,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,1,1,0
296,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,163,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0
297,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,0,1734,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0
298,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",262,118,3296,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7,2,53,0,0


In [25]:
tweets = type_correction('tweet_text', test_data)
desc = type_correction('user_description', test_data)

test_data = test_data.drop(columns=['tweet_text', 'user_description'])
for col in test_data.columns:
    test_data[col] = test_data[col] / test_data[col].max()

test_data

Unnamed: 0,followers_count,following_count,tweet_count,listed_count,retweet_count,reply_count,like_count,quote_count,label
0,7.588030e-06,0.012787,0.064169,0.000000,0.000000,0.000000,0.000043,0.000000,0.0
1,1.214586e-06,0.004854,0.177043,0.000035,0.000000,0.000285,0.000014,0.000000,0.0
2,6.047886e-06,0.006472,0.058254,0.000000,0.000255,0.000000,0.000058,0.000000,1.0
3,2.504300e-08,0.005010,0.000045,0.000000,0.000085,0.000000,0.000014,0.000000,0.0
4,8.376885e-06,0.036169,0.025677,0.000521,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...
295,3.069020e-05,0.261326,1.000000,0.001321,0.000000,0.000000,0.000014,0.001304,0.0
296,0.000000e+00,0.000000,0.000461,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
297,3.756451e-08,0.000000,0.004903,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
298,3.280634e-06,0.006159,0.009320,0.000035,0.000595,0.000569,0.000764,0.000000,0.0


<h1> Testing the model on the test set </h1>

In [26]:
predictions = model.predict([tweets, desc, np.array(test_data.drop(columns=['label']))]).flatten()
predictions



array([5.15516483e-08, 1.13726251e-01, 5.31293213e-01, 9.33024785e-05,
       4.80390303e-02, 6.55143094e-06, 9.12297666e-01, 1.12240194e-07,
       3.79856033e-06, 1.10348850e-07, 2.13025575e-09, 1.96582605e-05,
       8.71708214e-01, 3.29040995e-05, 4.28911975e-11, 4.98984919e-05,
       4.80927646e-01, 9.02323246e-01, 1.57521252e-04, 9.71900284e-01,
       3.73896997e-04, 3.25743628e-08, 1.44731027e-08, 1.44261467e-05,
       2.55795651e-08, 5.63227331e-05, 1.56956241e-02, 1.20416462e-05,
       1.12859323e-03, 2.76521653e-01, 4.76564921e-04, 9.99949932e-01,
       2.65743751e-02, 1.38824580e-05, 1.02589190e-07, 2.41329661e-04,
       1.78555457e-03, 8.79046321e-02, 9.21043038e-01, 2.52901443e-15,
       1.28935690e-05, 2.88256910e-03, 9.78811443e-01, 4.16222289e-02,
       1.22820519e-01, 1.80891533e-08, 1.20958667e-02, 2.31424235e-02,
       1.80531383e-01, 3.85069407e-11, 8.62379134e-01, 8.62345338e-01,
       5.74131124e-03, 7.47028110e-08, 9.69766732e-03, 9.80088771e-07,
      

<h1> Evaluating the performace of the model </h1

In [27]:
pred = []
for x in predictions:
    if x < 0.5:
        pred.append(0)
    else: pred.append(1)

predictions = pred
confusion_matrix(test_data['label'], predictions)

array([[194,   9],
       [ 67,  30]], dtype=int64)

In [28]:
accuracy_score(test_data['label'], predictions)

0.7466666666666667