In [1]:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.


In [2]:
# Step 1: Import necessary libraries
print("Step 1: Importing necessary libraries...")

# Standard library
import gc
import os
import pickle
import re
import warnings

# Third-party
import nltk
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# CUDA device selection for this process: order by PCI bus ID, expose device "1".
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})

# Suppress all warnings so the cell outputs stay compact.
warnings.filterwarnings('ignore')

# Fetch the NLTK resources requested by this notebook.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
print("Libraries imported successfully.\n")

Step 1: Importing necessary libraries...
Libraries imported successfully.



[nltk_data] Downloading package punkt to /home/202462003/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/202462003/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Step 2: Read the training and test CSV files into DataFrames
print("Step 2: Loading training and test datasets...")
train_data_path = 'train.csv'
test_data_path = 'test.csv'

# Both files are comma-separated and are parsed with the same settings.
train_df, test_df = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (train_data_path, test_data_path)
)
print("Datasets loaded successfully.\n")

Step 2: Loading training and test datasets...
Datasets loaded successfully.



In [4]:
# Preview the first rows of the training data to check the loaded columns.
train_df.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...


In [5]:
# Preview the first rows of the test data to check the loaded columns.
test_df.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,RapeGang Rape RGRSexually Abusive Content,,Sir namaskar mein Ranjit Kumar PatraPaise neh...
1,Online Financial Fraud,DebitCredit Card FraudSim Swap Fraud,KOTAK MAHINDRA BANK FRAUD\r\nFRAUD AMOUNT
2,Cyber Attack/ Dependent Crimes,SQL Injection,The issue actually started when I got this ema...
3,Online Financial Fraud,Fraud CallVishing,I am amit kumar from karwi chitrakoot I am tot...
4,Any Other Cyber Crime,Other,I have ordered saree and blouse from rinki s...


In [6]:
# Step 3: Replace missing complaint text with an empty string in both datasets
print("Step 3: Filling missing values in datasets...")
for frame in (train_df, test_df):
    # Only the free-text column is filled; other columns are left untouched.
    frame['crimeaditionalinfo'] = frame['crimeaditionalinfo'].fillna("")
print("Missing values filled.\n")


Step 3: Filling missing values in datasets...
Missing values filled.



In [28]:
# Step 4: Define the text preprocessing helper and apply it to both datasets
print("Step 4: Defining and applying text preprocessing function...")

def preprocess_text(text, stop_words=None):
    """Normalize one raw text value into a space-separated string of tokens.

    The value is lowercased, every non-word character is replaced by a space,
    digit runs are deleted, and tokens found in ``stop_words`` are dropped.

    Args:
        text: Raw value to clean. Non-string values are converted with
            ``str()``. ``None`` and float NaN are treated as missing.
        stop_words: Container supporting ``in`` that holds the tokens to
            drop. Defaults to scikit-learn's ``ENGLISH_STOP_WORDS``.

    Returns:
        The cleaned text, or ``""`` when the input is missing or nothing
        survives cleaning.
    """
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS

    # Missing values would otherwise be stringified into the tokens "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # \W leaves underscores in place, so "user_name" stays a single token.
    text = re.sub(r'\W', ' ', str(text).lower())
    # Digits are deleted rather than replaced by a space: "abc123def" -> "abcdef".
    text = re.sub(r'\d+', '', text)
    # split() with no argument already collapses runs of whitespace.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean the complaint text of both datasets into a new 'cleaned_text' column
for frame in (train_df, test_df):
    frame['cleaned_text'] = frame['crimeaditionalinfo'].apply(preprocess_text)
print("Text preprocessing completed.\n")

Step 4: Defining and applying text preprocessing function...
Text preprocessing completed.



In [29]:
# Preview the training data again, now including the 'cleaned_text' column.
train_df.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo,cleaned_text
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...,continue received random calls abusive message...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...,fraudster continuously messaging asking pay mo...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...,acting like police demanding money adding sect...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...,apna job applied job interview telecalling res...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...,received lady stating send new phone vivo rece...


In [30]:
# Preview the test data again, now including the 'cleaned_text' column.
test_df.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo,cleaned_text
0,RapeGang Rape RGRSexually Abusive Content,,Sir namaskar mein Ranjit Kumar PatraPaise neh...,sir namaskar mein ranjit kumar patrapaise nehi...
1,Online Financial Fraud,DebitCredit Card FraudSim Swap Fraud,KOTAK MAHINDRA BANK FRAUD\r\nFRAUD AMOUNT,kotak mahindra bank fraud fraud
2,Cyber Attack/ Dependent Crimes,SQL Injection,The issue actually started when I got this ema...,issue actually started got email glance like s...
3,Online Financial Fraud,Fraud CallVishing,I am amit kumar from karwi chitrakoot I am tot...,amit kumar karwi chitrakoot totally depressed ...
4,Any Other Cyber Crime,Other,I have ordered saree and blouse from rinki s...,ordered saree blouse rinki sur paid payment di...


In [31]:
# Step 6: Build TF-IDF features and collect the target labels
print("Step 6: Initializing and applying TF-IDF Vectorizer...")

# Targets: the 'category' column of each dataset.
y_train = train_df['category']
test_labels = test_df['category']

# The vocabulary is capped at 5000 features and learned from the training text only;
# the test text is transformed with that fitted vocabulary.
# NOTE(review): the vectorizer is fitted on the full training frame before the
# train/validation split, so validation documents contribute to the vocabulary
# and IDF weights.
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(train_df['cleaned_text'])
X_test = tfidf.transform(test_df['cleaned_text'])
print("TF-IDF Vectorizer applied successfully.\n")

Step 6: Initializing and applying TF-IDF Vectorizer...
TF-IDF Vectorizer applied successfully.



In [32]:
# Step 7: Hold out 20% of the training features as a validation set
print("Step 7: Splitting data into train test and validation )...")
split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is repeatable
)
X_train_split, X_val_split, y_train_split, y_val_split = split_parts
print("Data split completed.\n")

Step 7: Splitting data into train test and validation )...
Data split completed.



In [33]:
# Step 8: Train the SGDClassifier on sparse data
print("Step 8: Training the SGDClassifier on training data...")
# SGD shuffles the training data between epochs, so an unseeded run yields a
# different model each time. The seed matches the one used for the split.
# No `loss` is passed, so scikit-learn's default applies
# (documented as 'hinge', i.e. a linear SVM).
model = SGDClassifier(max_iter=10000, tol=1e-5, random_state=42)
model.fit(X_train_split, y_train_split)
print("Model training completed.\n")

Step 8: Training the SGDClassifier on training data...
Model training completed.



In [34]:
# Step 9: Score the trained model on the held-out validation split
print("Step 9: Validating the model on validation set...")
y_pred_val = model.predict(X_val_split)
val_accuracy = accuracy_score(y_val_split, y_pred_val)
print("Validation Accuracy: {}\n".format(val_accuracy))

Step 9: Validating the model on validation set...
Validation Accuracy: 0.7548297577116021



In [35]:
# Step 10: Predict on the test features and print per-class metrics
print("Step 10: Evaluating model on test data...")
y_test_pred = model.predict(X_test)
test_report = classification_report(test_labels, y_test_pred)
print("Classification Report on Test Data:\n", test_report)


Step 10: Evaluating model on test data...
Classification Report on Test Data:
                                                       precision    recall  f1-score   support

                               Any Other Cyber Crime       0.54      0.05      0.10      3670
Child Pornography CPChild Sexual Abuse Material CSAM       0.58      0.24      0.34       123
                      Crime Against Women & Children       0.00      0.00      0.00         4
                                Cryptocurrency Crime       0.61      0.31      0.41       166
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00      1261
                                     Cyber Terrorism       0.00      0.00      0.00        52
      Hacking  Damage to computercomputer system etc       0.44      0.10      0.16       592
                            Online Cyber Trafficking       0.00      0.00      0.00        61
                              Online Financial Fraud       0.77      0.98 

In [36]:
# Step 11: Save the model
# The file name is kept unchanged for anything that already loads it, even
# though the pickled estimator is an SGDClassifier.
print("Step 11: Saving the model to 'logistic_regression_model.pkl'...")
with open('logistic_regression_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# The classifier only accepts TF-IDF features, so the fitted vectorizer is
# saved alongside it; without it the model cannot score new raw text.
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)
print("TF-IDF vectorizer saved to 'tfidf_vectorizer.pkl'.")
print("Model saved successfully.\n")

Step 11: Saving the model to 'logistic_regression_model.pkl'...
Model saved successfully.



In [37]:
# Step 12: Save test predictions
# One path variable drives both the write and the log messages, so the
# reported file name always matches the file actually written.
predictions_path = 'test_predictions.csv'
print(f"Step 12: Saving test predictions to '{predictions_path}'...")
# Attach the predicted labels to the test frame and write every column.
test_df['predicted_category'] = y_test_pred
test_df.to_csv(predictions_path, index=False)
print(f"Test predictions saved successfully to '{predictions_path}'\n")

print("All steps completed successfully.")

Step 12: Saving test predictions to 'test_predictions.csv'...
Test predictions saved successfully to 'test_predictions1.csv'

All steps completed successfully.
