In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd

train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CyberguarAI-Hackathon/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CyberguarAI-Hackathon/test.csv')


In [5]:
# Fill missing sub_category values with a placeholder
train_df['sub_category'].fillna('Unknown', inplace=True)
train_df.dropna(subset=['crimeaditionalinfo'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['sub_category'].fillna('Unknown', inplace=True)


In [6]:
import re

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

train_df['cleaned_info'] = train_df['crimeaditionalinfo'].apply(clean_text)


In [7]:
#Text Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(train_df['cleaned_info'])

In [8]:
from sklearn.preprocessing import LabelEncoder
label_encoder_category = LabelEncoder()
label_encoder_sub_category = LabelEncoder()

train_df['category_encoded'] = label_encoder_category.fit_transform(train_df['category'])
train_df['sub_category_encoded'] = label_encoder_sub_category.fit_transform(train_df['sub_category'])

y_category = train_df['category_encoded']
y_sub_category = train_df['sub_category_encoded']

In [9]:
from sklearn.model_selection import train_test_split

# Split data for category prediction
X_train, X_val, y_train, y_val = train_test_split(X, y_category, test_size=0.2, random_state=42)


In [10]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

model = LinearSVC()
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred, target_names=label_encoder_category.classes_, zero_division=1))

                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.45      0.27      0.33      2142
Child Pornography CPChild Sexual Abuse Material CSAM       0.70      0.23      0.34        84
                                Cryptocurrency Crime       0.70      0.50      0.58        92
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00       762
                                     Cyber Terrorism       1.00      0.00      0.00        38
      Hacking  Damage to computercomputer system etc       0.45      0.24      0.31       337
                            Online Cyber Trafficking       1.00      0.00      0.00        33
                              Online Financial Fraud       0.82      0.94      0.88     11470
                            Online Gambling  Betting       0.73      0.12      0.21        91
               Online and Social Media Related Crime       