**Import Libraries**

In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk

**Load Dataset**

In [15]:
df = pd.read_csv("combined.csv")

**Fill the NULL values with Unknown**

In [16]:
df['crimeaditionalinfo'].fillna('',inplace=True)
df['sub_category'].fillna('unknown', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['crimeaditionalinfo'].fillna('',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sub_category'].fillna('unknown', inplace=True)


**Removing all special characters**

In [17]:

df['crimeaditionalinfo'] = df['crimeaditionalinfo'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

**Lower-casing all the sentences**

In [18]:

df['crimeaditionalinfo']=df['crimeaditionalinfo'].str.lower()
df

Unnamed: 0.1,Unnamed: 0,category,sub_category,crimeaditionalinfo
0,0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,i had continue received random calls and abusi...
1,1,Online Financial Fraud,Fraud CallVishing,the above fraudster is continuously messaging ...
2,2,Online Gambling Betting,Online Gambling Betting,he is acting like a police and demanding for m...
3,3,Online and Social Media Related Crime,Online Job Fraud,in apna job i have applied for job interview f...
4,4,Online Financial Fraud,Fraud CallVishing,i received a call from lady stating that she w...
...,...,...,...,...
124910,124910,Online and Social Media Related Crime,Online Matrimonial Fraud,a lady named rashmi probably a fake name had c...
124911,124911,Online Financial Fraud,Internet Banking Related Fraud,i am mr chokhe ram two pers mobile number wer...
124912,124912,Any Other Cyber Crime,Other,mai bibekbraj maine pahle ki complain kar chuk...
124913,124913,Online Financial Fraud,Internet Banking Related Fraud,received url link for updating kyc from mobile...


**Tokenization**

In [25]:

from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chand\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\chand\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\chand\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chand\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [26]:
df['crimeaditionalinfo']= df['crimeaditionalinfo'].apply(word_tokenize)

**Custom stop word removal**

In [28]:

## the data also contains Hinglish,other indain languages
#source : https://github.com/TrigonaMinima/HinglishNLP/blob/master/data/assets/stop_hinglish


# Read the words from the file and convert them into the required format
file_path = 'stop_hinglish.txt'
with open(file_path, 'r', encoding='utf-8') as file:
        words = [line.strip() for line in file if line.strip()]
formatted_set = "set([\n    '" + "', '".join(words) + "'\n])"
print(formatted_set)

set([
    'a', 'aadi', 'aaj', 'aap', 'aapne', 'aata', 'aati', 'aaya', 'aaye', 'ab', 'abbe', 'abbey', 'abe', 'abhi', 'able', 'about', 'above', 'accha', 'according', 'accordingly', 'acha', 'achcha', 'across', 'actually', 'after', 'afterwards', 'again', 'against', 'agar', 'ain', 'aint', 'ain't', 'aisa', 'aise', 'aisi', 'alag', 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'andar', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'ap', 'apan', 'apart', 'apna', 'apnaa', 'apne', 'apni', 'appear', 'are', 'aren', 'arent', 'aren't', 'around', 'arre', 'as', 'aside', 'ask', 'asking', 'at', 'aur', 'avum', 'aya', 'aye', 'baad', 'baar', 'bad', 'bahut', 'bana', 'banae', 'banai', 'banao', 'banaya', 'banaye', 'banayi', 'banda', 'bande', 'bandi', 'bane', 'bani', 'bas', 'bata', 'batao', 'bc', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'bef

In [29]:
from nltk.corpus import stopwords
eng = set(stopwords.words('english'))
hin = set(stopwords.words('hinglish'))

combined = eng.union(hin).union(formatted_set)

In [30]:
df['crimeaditionalinfo']=df['crimeaditionalinfo'].apply(lambda x: [word for word in x if word not in combined])
     

**Stemming**

In [31]:

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
df['crimeaditionalinfo']=df['crimeaditionalinfo'].apply(lambda x: [stemmer.stem(word) for word in x])
df


Unnamed: 0.1,Unnamed: 0,category,sub_category,crimeaditionalinfo
0,0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,"[continu, receiv, random, call, abus, messag, ..."
1,1,Online Financial Fraud,Fraud CallVishing,"[fraudster, continu, messag, pay, money, send,..."
2,2,Online Gambling Betting,Online Gambling Betting,"[act, polic, demand, money, ad, section, text,..."
3,3,Online and Social Media Related Crime,Online Job Fraud,"[job, appli, job, interview, telecal, resourc,..."
4,4,Online Financial Fraud,Fraud CallVishing,"[receiv, call, ladi, state, send, phone, vivo,..."
...,...,...,...,...
124910,124910,Online and Social Media Related Crime,Online Matrimonial Fraud,"[ladi, name, rashmi, probabl, fake, call, day,..."
124911,124911,Online Financial Fraud,Internet Banking Related Fraud,"[mr, chokh, ram, per, mobil, number, found, go..."
124912,124912,Any Other Cyber Crime,Other,"[bibekbraj, pahl, complain, chuka, financi, fr..."
124913,124913,Online Financial Fraud,Internet Banking Related Fraud,"[receiv, url, link, updat, kyc, mobil, open, r..."


**Lemmatization**

In [32]:

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
df['crimeaditionalinfo']=df['crimeaditionalinfo'].apply(lambda x:[lemmatizer.lemmatize(i) for i in x])

**Tokens to string**

In [33]:

df['crimeaditionalinfo']=df['crimeaditionalinfo'].apply(lambda x: ' '.join(x))
df['crimeaditionalinfo']=df['crimeaditionalinfo'].apply(lambda x: re.sub(r'\s+', ' ', x))
df['crimeaditionalinfo']=df['crimeaditionalinfo'].apply(lambda x: x.strip())
df

Unnamed: 0.1,Unnamed: 0,category,sub_category,crimeaditionalinfo
0,0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,continu receiv random call abus messag whatsap...
1,1,Online Financial Fraud,Fraud CallVishing,fraudster continu messag pay money send fake c...
2,2,Online Gambling Betting,Online Gambling Betting,act polic demand money ad section text messag ...
3,3,Online and Social Media Related Crime,Online Job Fraud,job appli job interview telecal resourc manag ...
4,4,Online Financial Fraud,Fraud CallVishing,receiv call ladi state send phone vivo receiv ...
...,...,...,...,...
124910,124910,Online and Social Media Related Crime,Online Matrimonial Fraud,ladi name rashmi probabl fake call day ago tol...
124911,124911,Online Financial Fraud,Internet Banking Related Fraud,mr chokh ram per mobil number found gool icici...
124912,124912,Any Other Cyber Crime,Other,bibekbraj pahl complain chuka financi fraud gy...
124913,124913,Online Financial Fraud,Internet Banking Related Fraud,receiv url link updat kyc mobil open receiv ot...


**Sorting the "crimeaditionlinfo"**

In [34]:

group = df.groupby(['category','sub_category'])
sort = df.sort_values(by=['category','sub_category','crimeaditionalinfo']).reset_index(drop=True)
sort

Unnamed: 0.1,Unnamed: 0,category,sub_category,crimeaditionalinfo
0,803,Any Other Cyber Crime,Other,
1,856,Any Other Cyber Crime,Other,
2,1282,Any Other Cyber Crime,Other,
3,1300,Any Other Cyber Crime,Other,
4,1544,Any Other Cyber Crime,Other,
...,...,...,...,...
124910,83811,Sexually Obscene material,unknown,yesterday pm girl account komal roy call faceb...
124911,80842,Sexually Obscene material,unknown,yesterday rd march whatsapp receiv edit sexual...
124912,122691,Sexually Obscene material,unknown,yesterday whatsapp video call watch min person...
124913,109017,Sexually Obscene material,unknown,yhe call kr pereshan phele telegram kra asa pl...


In [35]:
sorted_df = sort.dropna()
sorted_df.info()
df=sorted_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124915 entries, 0 to 124914
Data columns (total 4 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Unnamed: 0          124915 non-null  int64 
 1   category            124915 non-null  object
 2   sub_category        124915 non-null  object
 3   crimeaditionalinfo  124915 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.8+ MB


**Trainning Models**

In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score ,f1_score,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import hstack
import pickle

**Vectorize text data**

In [52]:

vectorizer = TfidfVectorizer(max_features=5000)  
X_text = vectorizer.fit_transform(df['crimeaditionalinfo'])

**Encode category and subcategory**

In [53]:

label_encoder_category = LabelEncoder()
df['category_encoded'] = label_encoder_category.fit_transform(df['category'])

label_encoder_subcategory = LabelEncoder()
df['subcategory_encoded'] = label_encoder_subcategory.fit_transform(df['sub_category'])

**Combine features**

In [54]:

X_features = hstack([X_text, df[['category_encoded', 'subcategory_encoded']].values])

**Target variables**

In [55]:

y = df[['category_encoded', 'subcategory_encoded']]

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.3, random_state=42)


**Using RandomForest**

In [57]:
#model = MultiOutputClassifier(RandomForestClassifier(random_state=42, n_estimators=100))
#model.fit(X_train, y_train)


base_model = RandomForestClassifier(random_state=42, n_estimators=100)
model = ClassifierChain(base_model)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


**Calculate accuracy for each target**

In [60]:

category_accuracy = accuracy_score(y_test.iloc[:, 0], y_pred[:, 0])
subcategory_accuracy = accuracy_score(y_test.iloc[:, 1], y_pred[:, 1])
print(f"Category Accuracy: {category_accuracy}")
print(f"Sub-category Accuracy: {subcategory_accuracy}")

Category Accuracy: 0.9621881254169447
Sub-category Accuracy: 0.8631887925283522


In [64]:
from sklearn.metrics import accuracy_score, classification_report


# Extract predictions
y_pred_category = y_pred[:, 0]
y_pred_subcategory = y_pred[:, 1]

y_test_category = y_test.iloc[:, 0]
y_test_subcategory = y_test.iloc[:, 1]

# Decode labels to category names
y_test_category_names = label_encoder_category.inverse_transform(y_test_category)
y_pred_category_names = label_encoder_category.inverse_transform(y_pred_category.astype(int))

y_test_subcategory_names = label_encoder_subcategory.inverse_transform(y_test_subcategory)
y_pred_subcategory_names = label_encoder_subcategory.inverse_transform(y_pred_subcategory.astype(int))

# Classification reports with decoded labels
print("\nCategory Classification Report:")
print(classification_report(y_test_category_names, y_pred_category_names))

print("\nSub-category Classification Report:")
print(classification_report(y_test_subcategory_names, y_pred_subcategory_names))


Category Classification Report:
                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.99      1.00      0.99      4395
Child Pornography CPChild Sexual Abuse Material CSAM       0.99      0.52      0.68       140
                                Cryptocurrency Crime       1.00      0.80      0.89       192
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00      1457
                                     Cyber Terrorism       1.00      0.07      0.12        60
      Hacking  Damage to computercomputer system etc       0.97      0.61      0.74       671
                            Online Cyber Trafficking       1.00      0.04      0.08        76
                              Online Financial Fraud       0.96      1.00      0.98     23054
                            Online Gambling  Betting       1.00      0.13      0.24       195
               Online and 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                                                      precision    recall  f1-score   support

                             Business Email CompromiseEmail Takeover       0.80      0.04      0.07       106
                                           Cheating by Impersonation       0.85      0.64      0.73       795
                                                Cryptocurrency Fraud       0.98      0.88      0.93       192
                                   Cyber Bullying  Stalking  Sexting       0.76      0.94      0.84      1558
                                                     Cyber Terrorism       1.00      0.07      0.12        60
                             Damage to computer computer systems etc       1.00      0.26      0.41        43
                                                   Data Breach/Theft       0.20      0.21      0.21       200
                                DebitCredit Card FraudSim Swap Fraud       0.87      0.96      0.91      4345
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Save the model**

In [45]:

with open("MultiOutput_RF.pkl", "wb") as file:
    pickle.dump(model, file)
    

**Naive Bayes**

In [46]:

from sklearn.naive_bayes import MultinomialNB

# Define the base model as MultinomialNB
base_model = MultinomialNB(alpha=0.1)

# Wrap the base model with ClassifierChain
model = MultiOutputClassifier(base_model)

# Train the model on the training data
model.fit(X_train, y_train)

In [47]:
y_pred = model.predict(X_test)

# Calculate accuracy for each target
category_accuracy = accuracy_score(y_test.iloc[:, 0], y_pred[:, 0])
subcategory_accuracy = accuracy_score(y_test.iloc[:, 1], y_pred[:, 1])

In [48]:
print(f"Category Accuracy: {category_accuracy}")
print(f"Sub-category Accuracy: {subcategory_accuracy}")


Category Accuracy: 0.8804803202134757
Sub-category Accuracy: 0.7791594396264176


In [49]:
from sklearn.metrics import accuracy_score, classification_report

y_pred_category = y_pred[:, 0]
y_pred_subcategory = y_pred[:, 1]

y_test_category = y_test.iloc[:, 0]
y_test_subcategory = y_test.iloc[:, 1]

# Accuracy for category and sub-category
category_accuracy = accuracy_score(y_test_category, y_pred_category)
subcategory_accuracy = accuracy_score(y_test_subcategory, y_pred_subcategory)

print(f"Category Accuracy: {category_accuracy}")
print(f"Sub-category Accuracy: {subcategory_accuracy}")

# Classification reports for category and sub-category
print("\nCategory Classification Report:")
print(classification_report(y_test_category, y_pred_category))

print("\nSub-category Classification Report:")
print(classification_report(y_test_subcategory, y_pred_subcategory))


Category Accuracy: 0.8804803202134757
Sub-category Accuracy: 0.7791594396264176

Category Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4395
           1       0.99      0.98      0.98       140
           3       0.85      0.81      0.83       192
           4       1.00      1.00      1.00      1457
           5       0.67      0.07      0.12        60
           6       0.67      0.55      0.60       671
           7       0.00      0.00      0.00        76
           8       0.91      0.94      0.93     23054
           9       0.40      0.03      0.06       195
          10       0.69      0.68      0.69      4790
          11       0.00      0.00      0.00        25
          12       1.00      0.91      0.95      1081
          14       0.38      0.20      0.26       628
          15       0.48      0.62      0.54       711

    accuracy                           0.88     37475
   macro avg       0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Save the model**

In [50]:

with open("MultiOutput_NB.pkl", "wb") as file:
    pickle.dump(model, file)
    