In [43]:
import pandas as pd
import matplotlib.pyplot as plt
import re

In [44]:
df = pd.read_csv("combined.csv")

In [45]:
df['crimeaditionalinfo'].fillna('',inplace=True)
df['sub_category'].fillna('unknown', inplace=True)


In [46]:
## removing all special characters

df['crimeaditionalinfo'] = df['crimeaditionalinfo'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

In [47]:
##lower-casing all the sentences
df['crimeaditionalinfo']=df['crimeaditionalinfo'].str.lower()
df

Unnamed: 0.1,Unnamed: 0,category,sub_category,crimeaditionalinfo
0,0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,i had continue received random calls and abusi...
1,1,Online Financial Fraud,Fraud CallVishing,the above fraudster is continuously messaging ...
2,2,Online Gambling Betting,Online Gambling Betting,he is acting like a police and demanding for m...
3,3,Online and Social Media Related Crime,Online Job Fraud,in apna job i have applied for job interview f...
4,4,Online Financial Fraud,Fraud CallVishing,i received a call from lady stating that she w...
...,...,...,...,...
124910,124910,Online and Social Media Related Crime,Online Matrimonial Fraud,a lady named rashmi probably a fake name had c...
124911,124911,Online Financial Fraud,Internet Banking Related Fraud,i am mr chokhe ram two pers mobile number wer...
124912,124912,Any Other Cyber Crime,Other,mai bibekbraj maine pahle ki complain kar chuk...
124913,124913,Online Financial Fraud,Internet Banking Related Fraud,received url link for updating kyc from mobile...


In [48]:
#tokenization
from nltk.tokenize import word_tokenize

In [49]:
df['crimeaditionalinfo']= df['crimeaditionalinfo'].apply(word_tokenize)

In [50]:
##stop word removal
## the data also contains Hinglish,other indain languages
#source : https://github.com/TrigonaMinima/HinglishNLP/blob/master/data/assets/stop_hinglish


# Read the words from the file and convert them into the required format
file_path = 'stop_hinglish.txt'
with open(file_path, 'r', encoding='utf-8') as file:
        words = [line.strip() for line in file if line.strip()]
formatted_set = "set([\n    '" + "', '".join(words) + "'\n])"
print(formatted_set)

set([
    'a', 'aadi', 'aaj', 'aap', 'aapne', 'aata', 'aati', 'aaya', 'aaye', 'ab', 'abbe', 'abbey', 'abe', 'abhi', 'able', 'about', 'above', 'accha', 'according', 'accordingly', 'acha', 'achcha', 'across', 'actually', 'after', 'afterwards', 'again', 'against', 'agar', 'ain', 'aint', 'ain't', 'aisa', 'aise', 'aisi', 'alag', 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'andar', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'ap', 'apan', 'apart', 'apna', 'apnaa', 'apne', 'apni', 'appear', 'are', 'aren', 'arent', 'aren't', 'around', 'arre', 'as', 'aside', 'ask', 'asking', 'at', 'aur', 'avum', 'aya', 'aye', 'baad', 'baar', 'bad', 'bahut', 'bana', 'banae', 'banai', 'banao', 'banaya', 'banaye', 'banayi', 'banda', 'bande', 'bandi', 'bane', 'bani', 'bas', 'bata', 'batao', 'bc', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'bef

In [51]:
from nltk.corpus import stopwords
eng = set(stopwords.words('english'))
hin = set(stopwords.words('hinglish'))

combined = eng.union(hin).union(formatted_set)

In [52]:
df['crimeaditionalinfo']=df['crimeaditionalinfo'].apply(lambda x: [word for word in x if word not in combined])
     

In [53]:
##stem
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
df['crimeaditionalinfo']=df['crimeaditionalinfo'].apply(lambda x: [stemmer.stem(word) for word in x])
df


Unnamed: 0.1,Unnamed: 0,category,sub_category,crimeaditionalinfo
0,0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,"[continu, receiv, random, call, abus, messag, ..."
1,1,Online Financial Fraud,Fraud CallVishing,"[fraudster, continu, messag, pay, money, send,..."
2,2,Online Gambling Betting,Online Gambling Betting,"[act, polic, demand, money, ad, section, text,..."
3,3,Online and Social Media Related Crime,Online Job Fraud,"[job, appli, job, interview, telecal, resourc,..."
4,4,Online Financial Fraud,Fraud CallVishing,"[receiv, call, ladi, state, send, phone, vivo,..."
...,...,...,...,...
124910,124910,Online and Social Media Related Crime,Online Matrimonial Fraud,"[ladi, name, rashmi, probabl, fake, call, day,..."
124911,124911,Online Financial Fraud,Internet Banking Related Fraud,"[mr, chokh, ram, per, mobil, number, found, go..."
124912,124912,Any Other Cyber Crime,Other,"[bibekbraj, pahl, complain, chuka, financi, fr..."
124913,124913,Online Financial Fraud,Internet Banking Related Fraud,"[receiv, url, link, updat, kyc, mobil, open, r..."


In [54]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
df['crimeaditionalinfo']=df['crimeaditionalinfo'].apply(lambda x:[lemmatizer.lemmatize(i) for i in x])

In [55]:
#tokens to str
df['crimeaditionalinfo']=df['crimeaditionalinfo'].apply(lambda x: ' '.join(x))
df['crimeaditionalinfo']=df['crimeaditionalinfo'].apply(lambda x: re.sub(r'\s+', ' ', x))
df['crimeaditionalinfo']=df['crimeaditionalinfo'].apply(lambda x: x.strip())
df

Unnamed: 0.1,Unnamed: 0,category,sub_category,crimeaditionalinfo
0,0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,continu receiv random call abus messag whatsap...
1,1,Online Financial Fraud,Fraud CallVishing,fraudster continu messag pay money send fake c...
2,2,Online Gambling Betting,Online Gambling Betting,act polic demand money ad section text messag ...
3,3,Online and Social Media Related Crime,Online Job Fraud,job appli job interview telecal resourc manag ...
4,4,Online Financial Fraud,Fraud CallVishing,receiv call ladi state send phone vivo receiv ...
...,...,...,...,...
124910,124910,Online and Social Media Related Crime,Online Matrimonial Fraud,ladi name rashmi probabl fake call day ago tol...
124911,124911,Online Financial Fraud,Internet Banking Related Fraud,mr chokh ram per mobil number found gool icici...
124912,124912,Any Other Cyber Crime,Other,bibekbraj pahl complain chuka financi fraud gy...
124913,124913,Online Financial Fraud,Internet Banking Related Fraud,receiv url link updat kyc mobil open receiv ot...


In [56]:
#sorting the crimeaditionlinfo
group = df.groupby(['category','sub_category'])
sort = df.sort_values(by=['category','sub_category','crimeaditionalinfo']).reset_index(drop=True)
sort

Unnamed: 0.1,Unnamed: 0,category,sub_category,crimeaditionalinfo
0,803,Any Other Cyber Crime,Other,
1,856,Any Other Cyber Crime,Other,
2,1282,Any Other Cyber Crime,Other,
3,1300,Any Other Cyber Crime,Other,
4,1544,Any Other Cyber Crime,Other,
...,...,...,...,...
124910,83811,Sexually Obscene material,unknown,yesterday pm girl account komal roy call faceb...
124911,80842,Sexually Obscene material,unknown,yesterday rd march whatsapp receiv edit sexual...
124912,122691,Sexually Obscene material,unknown,yesterday whatsapp video call watch min person...
124913,109017,Sexually Obscene material,unknown,yhe call kr pereshan phele telegram kra asa pl...


In [57]:
sorted_df = sort.dropna()
sorted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124915 entries, 0 to 124914
Data columns (total 4 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Unnamed: 0          124915 non-null  int64 
 1   category            124915 non-null  object
 2   sub_category        124915 non-null  object
 3   crimeaditionalinfo  124915 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.8+ MB


##Trainning Models

In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score ,f1_score,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import hstack
import pickle

In [59]:
vectorizer = TfidfVectorizer(max_features=5000)  
X_text = vectorizer.fit_transform(df['crimeaditionalinfo'])

In [60]:
label_encoder_category = LabelEncoder()
df['category_encoded'] = label_encoder_category.fit_transform(df['category'])

label_encoder_subcategory = LabelEncoder()
df['subcategory_encoded'] = label_encoder_subcategory.fit_transform(df['sub_category'])

In [61]:
X_features = hstack([X_text, df[['category_encoded', 'subcategory_encoded']].values])

y = df[['category_encoded', 'subcategory_encoded']]

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.3, random_state=42)

In [63]:
#model = MultiOutputClassifier(RandomForestClassifier(random_state=42, n_estimators=100))
#model.fit(X_train, y_train)


from sklearn.multioutput import ClassifierChain
base_model = RandomForestClassifier(random_state=42, n_estimators=100)
model = ClassifierChain(base_model)
model.fit(X_train, y_train)


In [64]:
y_pred = model.predict(X_test)

# Calculate accuracy for each target
category_accuracy = accuracy_score(y_test.iloc[:, 0], y_pred[:, 0])
subcategory_accuracy = accuracy_score(y_test.iloc[:, 1], y_pred[:, 1])

In [65]:
print(f"Category Accuracy: {category_accuracy}")
print(f"Sub-category Accuracy: {subcategory_accuracy}")


Category Accuracy: 0.9559439626417612
Sub-category Accuracy: 0.8570246831220814


In [66]:
from sklearn.metrics import accuracy_score, classification_report

y_pred_category = y_pred[:, 0]
y_pred_subcategory = y_pred[:, 1]

y_test_category = y_test.iloc[:, 0]
y_test_subcategory = y_test.iloc[:, 1]

# Accuracy for category and sub-category
category_accuracy = accuracy_score(y_test_category, y_pred_category)
subcategory_accuracy = accuracy_score(y_test_subcategory, y_pred_subcategory)

print(f"Category Accuracy: {category_accuracy}")
print(f"Sub-category Accuracy: {subcategory_accuracy}")

# Classification reports for category and sub-category
print("\nCategory Classification Report:")
print(classification_report(y_test_category, y_pred_category))

print("\nSub-category Classification Report:")
print(classification_report(y_test_subcategory, y_pred_subcategory))


Category Accuracy: 0.9559439626417612
Sub-category Accuracy: 0.8570246831220814

Category Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4247
           1       0.98      0.54      0.70       148
           2       0.00      0.00      0.00         2
           3       1.00      0.78      0.88       191
           4       1.00      1.00      1.00      1493
           5       1.00      0.03      0.05        75
           6       0.97      0.58      0.73       714
           7       1.00      0.04      0.07        78
           8       0.95      1.00      0.98     22929
           9       1.00      0.12      0.21       162
          10       0.92      0.94      0.93      4913
          11       1.00      0.13      0.23        23
          12       1.00      0.92      0.96      1083
          13       0.00      0.00      0.00         1
          14       0.87      0.56      0.68       662
          15       0.8

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [67]:
# Save the model
with open("MultiOutput_RF.pkl", "wb") as file:
    pickle.dump(model, file)
    

In [68]:

from sklearn.naive_bayes import MultinomialNB

# Define the base model as MultinomialNB
base_model = MultinomialNB(alpha=0.1)

# Wrap the base model with ClassifierChain
model = MultiOutputClassifier(base_model)

# Train the model on the training data
model.fit(X_train, y_train)

In [69]:
y_pred = model.predict(X_test)

# Calculate accuracy for each target
category_accuracy = accuracy_score(y_test.iloc[:, 0], y_pred[:, 0])
subcategory_accuracy = accuracy_score(y_test.iloc[:, 1], y_pred[:, 1])

In [70]:
print(f"Category Accuracy: {category_accuracy}")
print(f"Sub-category Accuracy: {subcategory_accuracy}")


Category Accuracy: 0.8773582388258839
Sub-category Accuracy: 0.7737691794529686


In [71]:
from sklearn.metrics import accuracy_score, classification_report

y_pred_category = y_pred[:, 0]
y_pred_subcategory = y_pred[:, 1]

y_test_category = y_test.iloc[:, 0]
y_test_subcategory = y_test.iloc[:, 1]

# Accuracy for category and sub-category
category_accuracy = accuracy_score(y_test_category, y_pred_category)
subcategory_accuracy = accuracy_score(y_test_subcategory, y_pred_subcategory)

print(f"Category Accuracy: {category_accuracy}")
print(f"Sub-category Accuracy: {subcategory_accuracy}")

# Classification reports for category and sub-category
print("\nCategory Classification Report:")
print(classification_report(y_test_category, y_pred_category))

print("\nSub-category Classification Report:")
print(classification_report(y_test_subcategory, y_pred_subcategory))


Category Accuracy: 0.8773582388258839
Sub-category Accuracy: 0.7737691794529686

Category Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4247
           1       0.99      0.97      0.98       148
           2       0.00      0.00      0.00         2
           3       0.87      0.80      0.83       191
           4       1.00      1.00      1.00      1493
           5       1.00      0.04      0.08        75
           6       0.70      0.56      0.63       714
           7       0.00      0.00      0.00        78
           8       0.91      0.94      0.92     22929
           9       0.57      0.05      0.09       162
          10       0.69      0.68      0.68      4913
          11       0.00      0.00      0.00        23
          12       1.00      0.89      0.94      1083
          13       0.00      0.00      0.00         1
          14       0.41      0.20      0.27       662
          15       0.4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [72]:

with open("MultiOutput_NB.pkl", "wb") as file:
    pickle.dump(model, file)
    