<a href="https://colab.research.google.com/github/Prashant-1008/Cyber_Believers/blob/main/initial_phase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Complaint Classification Using Machine Learning

This notebook demonstrates how to classify user complaints into categories and sub-categories using trained machine learning models. It includes steps to:

- Accept a complaint as input
- Transform the input using a TF-IDF vectorizer
- Predict the category and sub-category using pre-trained models


## Dependencies and Imports
Ensure you have the required libraries installed before running this notebook.

In [2]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import nltk

# Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Text preprocessing function
def preprocess_text(text):
    if isinstance(text, str):  # Check if text is a string
        # Lowercase
        text = text.lower()
        # Remove punctuation and digits
        text = re.sub(f'[{string.punctuation}0-9]', '', text)
        # Remove extra spaces
        text = re.sub(r'\s+', ' ', text).strip()
        # Remove stopwords
        text = ' '.join([word for word in text.split() if word not in stop_words])
    else:
        text = ''  # If text is not a string (e.g., NaN), return an empty string
    return text

# Apply preprocessing to the train dataset
train['processed_text'] = train['crimeaditionalinfo'].apply(preprocess_text)

# Vectorize text data
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(train['processed_text']).toarray()
y = train['category']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the validation set
y_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

# Preprocess the test data and predict
test['processed_text'] = test['crimeaditionalinfo'].apply(preprocess_text)
X_test = vectorizer.transform(test['processed_text']).toarray()
test_predictions = model.predict(X_test)

# Add predictions to the test dataframe and save to CSV
test['predicted_category'] = test_predictions
test[['predicted_category']].to_csv('test_predictions.csv', index=False)

print("Predictions saved to 'test_predictions.csv'")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Validation Accuracy: 0.7609136514035649


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.64      0.10      0.17      2091
Child Pornography CPChild Sexual Abuse Material CSAM       0.94      0.25      0.39        69
                                Cryptocurrency Crime       0.75      0.03      0.06        96
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00       765
                                     Cyber Terrorism       1.00      0.03      0.06        31
      Hacking  Damage to computercomputer system etc       0.64      0.05      0.10       341
                            Online Cyber Trafficking       0.00      0.00      0.00        34
                              Online Financial Fraud       0.77      0.98      0.87     11471
                            Online Gambling  Betting       0.00      0.00      0.00        97
               Online and Social Media Related Crime       