In [1]:
import warnings
warnings.filterwarnings("ignore")

import folium
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
import os
import re
import contractions
from textblob import TextBlob
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from tqdm import tqdm
# Download every NLTK resource this notebook needs before first use.
# 'stopwords' must be present before stopwords.words('english') is called
# below; without it a fresh environment raises LookupError.
# NOTE(review): 'punkt' and 'wordnet' are presumably required by TextBlob's
# tokenisation/lemmatisation in clean_text — confirm.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\likhith\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\likhith\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the saved Keras model from disk. compile=False loads it without
# compiling, which is sufficient here because the notebook only calls predict.
model = load_model("models/ConvolutionalLongShortTermMemory_model.h5",compile=False)

In [3]:
# Human-readable label names. The prediction cell looks a label up with the
# argmax of the model output, so position i here must correspond to output
# index i.
# NOTE(review): the order must match the label encoding used at training time — confirm.
class_labels = ['age', 'ethnicity', 'gender', 'not_cyberbullying', 'religion']

In [4]:
import pickle

# Restore the pickled tokenizer object used later to turn text into integer
# sequences.
# NOTE(review): pickle.load can execute arbitrary code — only load files from
# a trusted source.
with open("models/tokens.pkl", "rb") as token_file:
    tok = pickle.load(token_file)

In [5]:
def clean_text(text):
    """Normalise a raw text string before it is tokenized.

    The steps run in this order: expand contractions, remove the emoticons
    ``:)``, ``:(`` and ``:P``, remove @mentions, #hashtags and http(s) URLs,
    strip every character that is neither a word character nor whitespace,
    collapse whitespace, lemmatize each word with TextBlob, and finally
    lowercase the result.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned, lemmatized, lowercased text, words joined by single
        spaces.
    """
    expanded = contractions.fix(text)

    # Patterns are removed strictly in this order; each one is replaced by
    # the empty string.
    removal_patterns = (
        '|'.join([r':\)', r':\(', r':P']),  # character emoticons
        r'@\w+',                             # mentions
        r'#\w+',                             # hashtags
        r'https?://\S+',                     # URLs
        r'[^\w\s]',                          # remaining punctuation/symbols
    )
    stripped = expanded
    for removal_pattern in removal_patterns:
        stripped = re.sub(removal_pattern, '', stripped)

    # Collapse runs of whitespace into one space and trim both ends.
    collapsed = re.sub(r'\s+', ' ', stripped).strip()

    # Lemmatize word by word, re-join, and lowercase last.
    lemmas = [word.lemmatize() for word in TextBlob(collapsed).words]
    return " ".join(lemmas).lower()

---

In [22]:
# Path of the CSV file that is read with pd.read_csv in the next cell.
user_input_path = "user_input/test_input_1.csv"

In [23]:
# Load the input CSV into a DataFrame and preview its first ten rows.
df = pd.read_csv(user_input_path)
df.head(10)

Unnamed: 0,City,Lat,Lng,Text
0,Delhi,28.61,77.23,"Wow retweeting a radical extremist,antisemite ..."
1,Mumbai,19.0761,72.8775,@waffle_gurl So you decide if they are true or...
2,Kolkāta,22.5675,88.37,Dumb niggers be giving fuck abt non existing p...
3,Bangalore,12.9789,77.5917,School matters are very slowly creeping back t...
4,Chennai,13.0825,80.275,@scamp_faridxx How does the US back the Shia m...
5,Hyderābād,17.385,78.4867,Krazy-eyes Kat needs a kick in the ku...
6,Pune,18.5203,73.8567,@amesfp10 xD We should swap :L
7,Ahmedabad,23.03,72.58,More people die of drug overdose/ perscription...
8,Sūrat,21.1702,72.8311,If school wasn't so damn early I wouldn't have...
9,Prayagraj,25.4358,81.8464,"Congratulations Kat and Andre, you came second..."


In [24]:
# (rows, columns) of the loaded input.
df.shape

(25, 4)

In [25]:
# Run every entry of the 'Text' column through clean_text, in row order,
# showing a progress bar while doing so.
cleaned_samples = [clean_text(raw_text) for raw_text in tqdm(df['Text'].values)]

100%|████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 1122.32it/s]


In [26]:
# Encode each cleaned text separately: the tokenizer maps it to an integer
# sequence, which is then padded/truncated at the end to exactly 40 entries.
# The result is a list holding one single-row array per input text.
useful_data = [
    pad_sequences(
        sequences=tok.texts_to_sequences([cleaned_text]),
        maxlen=40,
        padding="post",
        truncating="post",
    )
    for cleaned_text in cleaned_samples
]

In [27]:
# Inspect the first five encoded samples.
useful_data[:5]

[array([[ 392, 5216,   68,   88,  286,   35,  355,   40,  366, 1519,   40,
           39, 7830, 1099,   12, 1098,  294, 8491,   40,  369,  403,  405,
          441,   21,   64,    1,   91,   80,   61, 1038,   40,    1,  228,
           68,   43,    0,    0,    0,    0,    0]]),
 array([[  72,   12, 1536,   60,   73,  128,  304,  372,  888,  691,  532,
           60,   12,    4, 1099,  372,   80,   34,   76,   12,   91,   80,
          827, 1072,  924,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0]]),
 array([[   7,    8,  519,  527,    3,  684,  332, 3968, 3702,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0]]),
 array([[    2,   241,   128,  1172,  2829,  8145,    86,   145,    21,
         19432,   400,   116,   342,   712, 19433,   127,    51, 19434,
             0,    

# Model Prediction

In [28]:
# Predict all samples in a single batched call. Calling model.predict once per
# sample inside a Python loop pays the per-call overhead every time; stacking
# the single-row arrays gives the same per-sample labels with one call.
if useful_data:
    batch = np.vstack(useful_data)
    predictions = model.predict(batch)
    # Flatten each sample's output before argmax so the lookup matches the
    # flat np.argmax previously applied to every individual prediction.
    class_indices = np.argmax(predictions.reshape(len(batch), -1), axis=1)
    RESULTS = [class_labels[class_index] for class_index in class_indices]
else:
    # No input rows: nothing to predict.
    RESULTS = []

In [29]:
# Attach the predicted label of each row as a new 'RESULT' column
# (RESULTS was built in the same order as the rows of df).
df["RESULT"] = RESULTS
df.head()

Unnamed: 0,City,Lat,Lng,Text,RESULT
0,Delhi,28.61,77.23,"Wow retweeting a radical extremist,antisemite ...",religion
1,Mumbai,19.0761,72.8775,@waffle_gurl So you decide if they are true or...,not_cyberbullying
2,Kolkāta,22.5675,88.37,Dumb niggers be giving fuck abt non existing p...,ethnicity
3,Bangalore,12.9789,77.5917,School matters are very slowly creeping back t...,not_cyberbullying
4,Chennai,13.0825,80.275,@scamp_faridxx How does the US back the Shia m...,not_cyberbullying


In [30]:
# Marker colour per row: green for 'not_cyberbullying', red for any other label.
df['COLORS'] = [
    "green" if label == 'not_cyberbullying' else "red"
    for label in df['RESULT']
]
df.head()

Unnamed: 0,City,Lat,Lng,Text,RESULT,COLORS
0,Delhi,28.61,77.23,"Wow retweeting a radical extremist,antisemite ...",religion,red
1,Mumbai,19.0761,72.8775,@waffle_gurl So you decide if they are true or...,not_cyberbullying,green
2,Kolkāta,22.5675,88.37,Dumb niggers be giving fuck abt non existing p...,ethnicity,red
3,Bangalore,12.9789,77.5917,School matters are very slowly creeping back t...,not_cyberbullying,green
4,Chennai,13.0825,80.275,@scamp_faridxx How does the US back the Shia m...,not_cyberbullying,green


In [31]:
# Build one summary string per row (city name plus predicted label) and store
# it in the 'INFOS' column; the map cell uses it as the marker tooltip.
COMEPLETE_DETAILS = [
    f"CITY:{city_name} | RESULT:{label}"
    for city_name, label in zip(df["City"].values, df["RESULT"].values)
]
df['INFOS'] = COMEPLETE_DETAILS

In [32]:
# Preview the frame with the added RESULT, COLORS and INFOS columns.
df.head()

Unnamed: 0,City,Lat,Lng,Text,RESULT,COLORS,INFOS
0,Delhi,28.61,77.23,"Wow retweeting a radical extremist,antisemite ...",religion,red,CITY:Delhi | RESULT:religion
1,Mumbai,19.0761,72.8775,@waffle_gurl So you decide if they are true or...,not_cyberbullying,green,CITY:Mumbai | RESULT:not_cyberbullying
2,Kolkāta,22.5675,88.37,Dumb niggers be giving fuck abt non existing p...,ethnicity,red,CITY:Kolkāta | RESULT:ethnicity
3,Bangalore,12.9789,77.5917,School matters are very slowly creeping back t...,not_cyberbullying,green,CITY:Bangalore | RESULT:not_cyberbullying
4,Chennai,13.0825,80.275,@scamp_faridxx How does the US back the Shia m...,not_cyberbullying,green,CITY:Chennai | RESULT:not_cyberbullying


In [33]:
# Show the summary string of the row labelled 0.
df.loc[0, 'INFOS']

'CITY:Delhi | RESULT:religion'

In [34]:
# Base map, centred on a fixed coordinate at a low zoom level.
world_all_cities_colored = folium.Map(
    location=[13.133932434766733, 16.103938729508073],
    zoom_start=2,
)

# Add one marker per row: positioned at (Lat, Lng), coloured by 'COLORS',
# with the 'INFOS' string as its tooltip.
marker_fields = zip(df['Lat'], df['Lng'], df['INFOS'], df['COLORS'])
for lat, lng, tooltip_text, marker_color in marker_fields:
    marker_icon = folium.Icon(color=marker_color, prefix='fa', icon='circle')
    marker = folium.Marker(location=[lat, lng], tooltip=tooltip_text, icon=marker_icon)
    marker.add_to(world_all_cities_colored)

# Last expression of the cell: renders the map.
world_all_cities_colored

---