**Import Libraries**

In [67]:
import pandas as pd
import matplotlib.pyplot as plt
import re

**Load Dataset**

In [68]:
# Load the raw CSV from the working directory into a DataFrame.
dataset_path = "combined.csv"
df = pd.read_csv(dataset_path)

**Fill null values (empty string for the complaint text, 'unknown' for sub_category)**

In [69]:
# Assign the filled columns back instead of calling fillna(..., inplace=True) on a
# column selection: that form is chained assignment, which pandas 2.x warns about
# and which no longer updates the parent frame once Copy-on-Write is enabled.
df['crimeaditionalinfo'] = df['crimeaditionalinfo'].fillna('')   # missing text -> empty string
df['sub_category'] = df['sub_category'].fillna('unknown')        # missing label -> 'unknown'

**Removing all special characters (everything except letters and whitespace, including digits)**

In [70]:

# Delete every character that is not an ASCII letter or whitespace.
# Matches are removed outright (not replaced by a space).
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
df['crimeaditionalinfo'] = df['crimeaditionalinfo'].apply(
    lambda text: non_letter_pattern.sub('', text)
)

**Lower-casing all the sentences**

In [71]:

# Lower-case the cleaned text column, then display the frame.
lowered_text = df['crimeaditionalinfo'].str.lower()
df['crimeaditionalinfo'] = lowered_text
df

Unnamed: 0.1,Unnamed: 0,category,sub_category,crimeaditionalinfo
0,0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,i had continue received random calls and abusi...
1,1,Online Financial Fraud,Fraud CallVishing,the above fraudster is continuously messaging ...
2,2,Online Gambling Betting,Online Gambling Betting,he is acting like a police and demanding for m...
3,3,Online and Social Media Related Crime,Online Job Fraud,in apna job i have applied for job interview f...
4,4,Online Financial Fraud,Fraud CallVishing,i received a call from lady stating that she w...
...,...,...,...,...
124910,124910,Online and Social Media Related Crime,Online Matrimonial Fraud,a lady named rashmi probably a fake name had c...
124911,124911,Online Financial Fraud,Internet Banking Related Fraud,i am mr chokhe ram two pers mobile number wer...
124912,124912,Any Other Cyber Crime,Other,mai bibekbraj maine pahle ki complain kar chuk...
124913,124913,Online Financial Fraud,Internet Banking Related Fraud,received url link for updating kyc from mobile...


**Tokenization**

In [72]:

from nltk.tokenize import word_tokenize

# Replace each text string with the list of tokens produced by NLTK's word_tokenize.
tokenized_text = df['crimeaditionalinfo'].apply(word_tokenize)
df['crimeaditionalinfo'] = tokenized_text

**Custom stop word removal**

In [73]:

## The data also contains Hinglish and other Indian languages.
# Source: https://github.com/TrigonaMinima/HinglishNLP/blob/master/data/assets/stop_hinglish


# Read the stop words from the file: one word per non-blank line, surrounding
# whitespace removed.
file_path = 'stop_hinglish.txt'
words = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        stripped = line.strip()
        if stripped:
            words.append(stripped)

# Build a printable "set([...])" preview of the words. Note that this is a plain
# string, not a set object; `words` is the actual list of stop words.
formatted_set = "set([\n    '{}'\n])".format("', '".join(words))
print(formatted_set)

set([
    'a', 'aadi', 'aaj', 'aap', 'aapne', 'aata', 'aati', 'aaya', 'aaye', 'ab', 'abbe', 'abbey', 'abe', 'abhi', 'able', 'about', 'above', 'accha', 'according', 'accordingly', 'acha', 'achcha', 'across', 'actually', 'after', 'afterwards', 'again', 'against', 'agar', 'ain', 'aint', 'ain't', 'aisa', 'aise', 'aisi', 'alag', 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'andar', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'ap', 'apan', 'apart', 'apna', 'apnaa', 'apne', 'apni', 'appear', 'are', 'aren', 'arent', 'aren't', 'around', 'arre', 'as', 'aside', 'ask', 'asking', 'at', 'aur', 'avum', 'aya', 'aye', 'baad', 'baar', 'bad', 'bahut', 'bana', 'banae', 'banai', 'banao', 'banaya', 'banaye', 'banayi', 'banda', 'bande', 'bandi', 'bane', 'bani', 'bas', 'bata', 'batao', 'bc', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'bef

In [74]:
from nltk.corpus import stopwords

# NLTK's built-in English and Hinglish stop-word lists.
eng = set(stopwords.words('english'))
hin = set(stopwords.words('hinglish'))

# Merge in the words read from stop_hinglish.txt. `formatted_set` is only a printable
# string, so passing it to union() would add its individual characters (letters,
# quotes, brackets, newlines) instead of the stop words; `words` is the real list.
combined = eng.union(hin).union(words)

In [75]:
# Keep only the tokens that are not members of the combined stop-word set.
df['crimeaditionalinfo'] = df['crimeaditionalinfo'].apply(
    lambda tokens: [token for token in tokens if token not in combined]
)
     

**Stemming**

In [76]:

from nltk.stem import PorterStemmer

# Reduce every token of every row to its Porter stem, then display the frame.
stemmer = PorterStemmer()
df['crimeaditionalinfo'] = df['crimeaditionalinfo'].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)
df

Unnamed: 0.1,Unnamed: 0,category,sub_category,crimeaditionalinfo
0,0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,"[continu, receiv, random, call, abus, messag, ..."
1,1,Online Financial Fraud,Fraud CallVishing,"[fraudster, continu, messag, pay, money, send,..."
2,2,Online Gambling Betting,Online Gambling Betting,"[act, polic, demand, money, ad, section, text,..."
3,3,Online and Social Media Related Crime,Online Job Fraud,"[job, appli, job, interview, telecal, resourc,..."
4,4,Online Financial Fraud,Fraud CallVishing,"[receiv, call, ladi, state, send, phone, vivo,..."
...,...,...,...,...
124910,124910,Online and Social Media Related Crime,Online Matrimonial Fraud,"[ladi, name, rashmi, probabl, fake, call, day,..."
124911,124911,Online Financial Fraud,Internet Banking Related Fraud,"[mr, chokh, ram, per, mobil, number, found, go..."
124912,124912,Any Other Cyber Crime,Other,"[bibekbraj, pahl, complain, chuka, financi, fr..."
124913,124913,Online Financial Fraud,Internet Banking Related Fraud,"[receiv, url, link, updat, kyc, mobil, open, r..."


**Lemmatization**

In [77]:

from nltk.stem import WordNetLemmatizer

# Run each (already stemmed) token through the WordNet lemmatizer with its default settings.
lemmatizer = WordNetLemmatizer()
df['crimeaditionalinfo'] = df['crimeaditionalinfo'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

**Tokens to string**

In [78]:

def _tokens_to_text(tokens):
    """Join tokens with single spaces, collapse whitespace runs and trim both ends."""
    joined = ' '.join(tokens)
    return re.sub(r'\s+', ' ', joined).strip()


# Turn each token list back into one normalised string.
df['crimeaditionalinfo'] = df['crimeaditionalinfo'].apply(_tokens_to_text)


**Sorting by category, sub_category and "crimeaditionalinfo"**

In [79]:

# Columns to order by, from most to least significant.
sort_keys = ['category', 'sub_category', 'crimeaditionalinfo']

# Group by the two label columns (the groupby object is not used in the cells shown here).
group = df.groupby(sort_keys[:2])

# Sort the rows and renumber the index from 0, then display the result.
sort = df.sort_values(by=sort_keys).reset_index(drop=True)
sort

Unnamed: 0.1,Unnamed: 0,category,sub_category,crimeaditionalinfo
0,803,Any Other Cyber Crime,Other,
1,856,Any Other Cyber Crime,Other,
2,1282,Any Other Cyber Crime,Other,
3,1300,Any Other Cyber Crime,Other,
4,1544,Any Other Cyber Crime,Other,
...,...,...,...,...
124910,83811,Sexually Obscene material,unknown,yesterday pm girl account komal roy call faceb...
124911,80842,Sexually Obscene material,unknown,yesterday rd march whatsapp receiv edit sexual...
124912,122691,Sexually Obscene material,unknown,yesterday whatsapp video call watch min person...
124913,109017,Sexually Obscene material,unknown,yhe call kr pereshan phele telegram kra asa pl...


In [80]:
# Drop any rows that still contain missing values and print the frame's structure.
sorted_df = sort.dropna()
sorted_df.info()

# Work on an explicit copy: the training cells overwrite columns of `data`, and a
# plain alias would silently rewrite `sorted_df` as well (and can trigger
# SettingWithCopyWarning when dropna() returns a filtered selection).
data = sorted_df.copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124915 entries, 0 to 124914
Data columns (total 4 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Unnamed: 0          124915 non-null  int64 
 1   category            124915 non-null  object
 2   sub_category        124915 non-null  object
 3   crimeaditionalinfo  124915 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.8+ MB


**Training Models**

In [95]:
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, LSTM, GRU, Dense, Bidirectional
from tensorflow.keras.utils import to_categorical

In [96]:
# Make sure the text column holds only strings: missing values become "" and
# every remaining value is cast to str.
text_column = data['crimeaditionalinfo'].fillna("")
data['crimeaditionalinfo'] = text_column.astype(str)

In [97]:
# Fit one LabelEncoder per target column and overwrite that column with its integer codes.
category_encoder = LabelEncoder()
data['category'] = category_encoder.fit_transform(data['category'])

sub_category_encoder = LabelEncoder()
data['sub_category'] = sub_category_encoder.fit_transform(data['sub_category'])


In [98]:
# Tokenization: fit a Keras Tokenizer on the text, convert each text to a sequence
# of integer ids and pad to a fixed length. The values 10000 and 100 are the same
# ones used for the Embedding input_dim and the Input shape in build_multi_output_model.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(data['crimeaditionalinfo'])
X = pad_sequences(tokenizer.texts_to_sequences(data['crimeaditionalinfo']), maxlen=100)

In [99]:
# One-hot encode targets: row i of an identity matrix is the one-hot vector for class i,
# so indexing the identity matrix by the integer codes yields one row per sample.
num_category_classes = len(category_encoder.classes_)
num_sub_category_classes = len(sub_category_encoder.classes_)

category_identity = np.eye(num_category_classes)
sub_category_identity = np.eye(num_sub_category_classes)

y_category = category_identity[data['category']]
y_sub_category = sub_category_identity[data['sub_category']]

In [100]:
# Split the features and both target matrices with one call so the rows stay aligned;
# 20% of the samples are held out and the shuffle is seeded for repeatability.
from sklearn.model_selection import train_test_split

(
    X_train, X_test,
    y_category_train, y_category_test,
    y_sub_category_train, y_sub_category_test,
) = train_test_split(
    X, y_category, y_sub_category, test_size=0.2, random_state=42
)

In [101]:
# Save preprocessing components: pickle the fitted tokenizer and both label encoders,
# one file per object.
preprocessing_artifacts = {
    "tokenizer.pkl": tokenizer,
    "category_encoder.pkl": category_encoder,
    "sub_category_encoder.pkl": sub_category_encoder,
}
for artifact_path, artifact in preprocessing_artifacts.items():
    with open(artifact_path, "wb") as file:
        pickle.dump(artifact, file)

print("Preprocessing components saved successfully.")

Preprocessing components saved successfully.


In [102]:
# Define model builder
def build_multi_output_model(model_type, vocab_size=10000, max_len=100, embedding_dim=128, units=64):
    """Build and compile a two-headed recurrent text classifier.

    The network is Input -> Embedding -> one recurrent layer -> two softmax heads
    named 'category_output' and 'sub_category_output'. The width of each head is
    read from the notebook-level ``num_category_classes`` and
    ``num_sub_category_classes`` variables.

    Args:
        model_type: Recurrent layer to use; one of 'SimpleRNN', 'LSTM', 'GRU'
            or 'Bi-LSTM'.
        vocab_size: ``input_dim`` of the Embedding layer (default 10000).
        max_len: Length of every input sequence (default 100).
        embedding_dim: ``output_dim`` of the Embedding layer (default 128).
        units: Units in the recurrent layer (default 64). For 'Bi-LSTM' an LSTM
            of this size is wrapped in ``Bidirectional``.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer, categorical
        cross-entropy on both heads and an accuracy metric per head.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    supported_types = ('SimpleRNN', 'LSTM', 'GRU', 'Bi-LSTM')
    # Fail early with a clear message; otherwise an unknown name would leave the
    # recurrent output undefined and surface as an UnboundLocalError further down.
    if model_type not in supported_types:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of {supported_types}"
        )

    input_layer = Input(shape=(max_len,))
    # `input_length` is intentionally not passed: the sequence length is already
    # fixed by the Input layer, and Keras 3 reports the argument as deprecated.
    embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)

    if model_type == 'SimpleRNN':
        x = SimpleRNN(units)(embedding)
    elif model_type == 'LSTM':
        x = LSTM(units)(embedding)
    elif model_type == 'GRU':
        x = GRU(units)(embedding)
    else:  # 'Bi-LSTM' (the only remaining supported value)
        x = Bidirectional(LSTM(units))(embedding)

    # Both heads share the same recurrent representation.
    category_output = Dense(num_category_classes, activation='softmax', name='category_output')(x)
    sub_category_output = Dense(num_sub_category_classes, activation='softmax', name='sub_category_output')(x)

    model = Model(inputs=input_layer, outputs=[category_output, sub_category_output])
    model.compile(
        optimizer='adam',
        loss={
            'category_output': 'categorical_crossentropy',
            'sub_category_output': 'categorical_crossentropy'
        },
        metrics={
            'category_output': ['accuracy'],
            'sub_category_output': ['accuracy']
        }
    )
    return model

In [103]:
# Train and evaluate models
def _classification_scores(y_true, y_pred):
    """Return (accuracy, weighted precision, weighted recall) for class-index vectors.

    zero_division=0 keeps the value scikit-learn already reports for classes with
    no predicted (or no true) samples, but without emitting an
    UndefinedMetricWarning for every such class.
    """
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='weighted', zero_division=0),
        recall_score(y_true, y_pred, average='weighted', zero_division=0),
    )


results = []   # one dict of test metrics per architecture
models = {}    # trained model objects keyed by architecture name

# The true class indices do not depend on the model, so recover them once.
y_category_test_classes = np.argmax(y_category_test, axis=1)
y_sub_category_test_classes = np.argmax(y_sub_category_test, axis=1)

for model_type in ['SimpleRNN', 'LSTM', 'GRU', 'Bi-LSTM']:
    print(f"Training {model_type} model...")
    model = build_multi_output_model(model_type)
    # 30% of the training split is held out by Keras for per-epoch validation.
    history = model.fit(
        X_train,
        {'category_output': y_category_train, 'sub_category_output': y_sub_category_train},
        epochs=10,
        batch_size=32,
        validation_split=0.3,
        verbose=1
    )

    # Predict on test set; each head returns class probabilities, argmax picks the class.
    y_category_pred, y_sub_category_pred = model.predict(X_test)
    y_category_pred_classes = np.argmax(y_category_pred, axis=1)
    y_sub_category_pred_classes = np.argmax(y_sub_category_pred, axis=1)

    # Calculate metrics for each head
    category_accuracy, category_precision, category_recall = _classification_scores(
        y_category_test_classes, y_category_pred_classes
    )
    sub_category_accuracy, sub_category_precision, sub_category_recall = _classification_scores(
        y_sub_category_test_classes, y_sub_category_pred_classes
    )

    # Log results
    results.append({
        'Model': model_type,
        'Category_Accuracy': category_accuracy,
        'Sub_Category_Accuracy': sub_category_accuracy,
        'Category_Precision': category_precision,
        'Sub_Category_Precision': sub_category_precision,
        'Category_Recall': category_recall,
        'Sub_Category_Recall': sub_category_recall
    })

    # Save model to disk and keep the in-memory object for later use
    model.save(f"{model_type}_multi_output_model.h5")
    models[model_type] = model

Training SimpleRNN model...
Epoch 1/10




[1m2186/2186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 10ms/step - category_output_accuracy: 0.7053 - category_output_loss: 0.9820 - loss: 3.1007 - sub_category_output_accuracy: 0.3586 - sub_category_output_loss: 2.1187 - val_category_output_accuracy: 0.7344 - val_category_output_loss: 0.7916 - val_loss: 2.5188 - val_sub_category_output_accuracy: 0.4421 - val_sub_category_output_loss: 1.7272
Epoch 2/10
[1m2186/2186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 13ms/step - category_output_accuracy: 0.7565 - category_output_loss: 0.7291 - loss: 2.3380 - sub_category_output_accuracy: 0.4933 - sub_category_output_loss: 1.6089 - val_category_output_accuracy: 0.7273 - val_category_output_loss: 0.8035 - val_loss: 2.4724 - val_sub_category_output_accuracy: 0.4860 - val_sub_category_output_loss: 1.6690
Epoch 3/10
[1m2186/2186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 12ms/step - category_output_accuracy: 0.7699 - category_output_loss: 0.6999 - loss: 2.2397 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training LSTM model...
Epoch 1/10




[1m2186/2186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 24ms/step - category_output_accuracy: 0.7166 - category_output_loss: 0.9536 - loss: 2.9439 - sub_category_output_accuracy: 0.3855 - sub_category_output_loss: 1.9903 - val_category_output_accuracy: 0.7480 - val_category_output_loss: 0.7349 - val_loss: 2.2371 - val_sub_category_output_accuracy: 0.5291 - val_sub_category_output_loss: 1.5022
Epoch 2/10
[1m2186/2186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 41ms/step - category_output_accuracy: 0.7648 - category_output_loss: 0.6827 - loss: 2.1139 - sub_category_output_accuracy: 0.5504 - sub_category_output_loss: 1.4312 - val_category_output_accuracy: 0.7557 - val_category_output_loss: 0.7104 - val_loss: 2.1466 - val_sub_category_output_accuracy: 0.5480 - val_sub_category_output_loss: 1.4363
Epoch 3/10
[1m2186/2186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 30ms/step - category_output_accuracy: 0.7904 - category_output_loss: 0.6079 - loss: 1.8922 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training GRU model...
Epoch 1/10




[1m2186/2186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 35ms/step - category_output_accuracy: 0.7126 - category_output_loss: 0.9594 - loss: 2.9582 - sub_category_output_accuracy: 0.3965 - sub_category_output_loss: 1.9988 - val_category_output_accuracy: 0.7542 - val_category_output_loss: 0.7121 - val_loss: 2.1748 - val_sub_category_output_accuracy: 0.5452 - val_sub_category_output_loss: 1.4628
Epoch 2/10
[1m2186/2186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 34ms/step - category_output_accuracy: 0.7761 - category_output_loss: 0.6511 - loss: 2.0225 - sub_category_output_accuracy: 0.5695 - sub_category_output_loss: 1.3714 - val_category_output_accuracy: 0.7524 - val_category_output_loss: 0.7086 - val_loss: 2.1295 - val_sub_category_output_accuracy: 0.5475 - val_sub_category_output_loss: 1.4210
Epoch 3/10
[1m2186/2186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 17ms/step - category_output_accuracy: 0.8035 - category_output_loss: 0.5690 - loss: 1.7995 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training Bi-LSTM model...
Epoch 1/10




[1m2186/2186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 31ms/step - category_output_accuracy: 0.7213 - category_output_loss: 0.9061 - loss: 2.8289 - sub_category_output_accuracy: 0.4067 - sub_category_output_loss: 1.9228 - val_category_output_accuracy: 0.7478 - val_category_output_loss: 0.7269 - val_loss: 2.2220 - val_sub_category_output_accuracy: 0.5310 - val_sub_category_output_loss: 1.4951
Epoch 2/10
[1m2186/2186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 31ms/step - category_output_accuracy: 0.7663 - category_output_loss: 0.6722 - loss: 2.0851 - sub_category_output_accuracy: 0.5536 - sub_category_output_loss: 1.4129 - val_category_output_accuracy: 0.7555 - val_category_output_loss: 0.7009 - val_loss: 2.1386 - val_sub_category_output_accuracy: 0.5477 - val_sub_category_output_loss: 1.4378
Epoch 3/10
[1m2186/2186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 31ms/step - category_output_accuracy: 0.7930 - category_output_loss: 0.5974 - loss: 1.8749 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [104]:
# Save tokenizer and label encoders.
# This writes the same three pickle files as the earlier "Save preprocessing components" cell.
for pickle_path, component in (
    ('tokenizer.pkl', tokenizer),
    ('category_encoder.pkl', category_encoder),
    ('sub_category_encoder.pkl', sub_category_encoder),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(component, f)

In [105]:
# Build a table from the per-model metric dicts, rank it by category accuracy
# (ties broken by sub-category accuracy, best first) and print it.
ranking_columns = ['Category_Accuracy', 'Sub_Category_Accuracy']
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by=ranking_columns, ascending=False)
print(results_df)

       Model  Category_Accuracy  Sub_Category_Accuracy  Category_Precision  \
3    Bi-LSTM           0.721010               0.512909            0.711285   
1       LSTM           0.717528               0.508626            0.707772   
2        GRU           0.713405               0.489413            0.702869   
0  SimpleRNN           0.705840               0.475924            0.669412   

   Sub_Category_Precision  Category_Recall  Sub_Category_Recall  
3                0.494072         0.721010             0.512909  
1                0.491373         0.717528             0.508626  
2                0.475337         0.713405             0.489413  
0                0.435268         0.705840             0.475924  
