In [2]:
pip install pandas numpy nltk scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fetch the NLTK resources used by the text preprocessing step:
# the stop-word lists and the WordNet data behind the lemmatizer.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Read the training and test sets from CSV files in the working directory.
train_data, test_data = [pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')]

# Text preprocessing function
_STOP_WORDS = None   # frozenset of English stop words, built on first use
_LEMMATIZER = None   # WordNetLemmatizer instance shared by every call


def _get_text_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


def preprocess_text(text):
    """Normalise one raw text value into a space-joined string of lemmas.

    Steps: replace every non-word character with a space, collapse runs of
    whitespace, lower-case, drop English stop words, and lemmatize the
    remaining tokens with WordNet.

    Parameters
    ----------
    text : str or missing value
        The raw text. ``None`` and any ``float`` (pandas represents missing
        cells as ``float('nan')``) are treated as missing. Any other
        non-string value is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` for missing input.
    """
    if text is None or isinstance(text, float):
        return ""

    text = re.sub(r'\W', ' ', str(text))
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()

    # The stop-word set and lemmatizer are built once and reused; rebuilding
    # the set for every token made each call needlessly slow.
    stop_words, lemmatizer = _get_text_resources()
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]

    return ' '.join(tokens)

# Add a 'clean_text' column to both frames by running every
# 'crimeaditionalinfo' value through preprocess_text.
for frame in (train_data, test_data):
    frame['clean_text'] = frame['crimeaditionalinfo'].apply(preprocess_text)

# Feature extraction using TF-IDF
# The vocabulary (capped at 5000 terms) is learned from the training text only
# and then reused to transform the test text.
# The matrices are kept in the sparse form returned by the vectorizer:
# LogisticRegression accepts sparse input, and calling .toarray() would
# allocate a dense rows x 5000 float array for each split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_data['clean_text'])
X_test = vectorizer.transform(test_data['clean_text'])
y_train = train_data['category']

# Fit a logistic-regression classifier on the training features
# (fit returns the estimator itself, so construction and fitting are chained),
# then predict a category for every row of the test features.
model = LogisticRegression(max_iter=10000).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluating the model
# Predictions are compared with the 'category' column of the test set.
# Precision, recall and F1 are averaged over classes weighted by support.
# zero_division=0 scores a class whose metric is undefined (for example a
# class that is never predicted) as 0 explicitly, instead of relying on the
# default that emits an undefined-metric warning before using 0.
y_true = test_data['category']
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

# Persist the training frame, including its 'clean_text' column.
train_data.to_csv('preprocessed_train.csv', index=False)

# Attach the predicted labels to the test frame and write the full frame out.
test_data['predictions'] = y_pred
test_data.to_csv('predictions_test.csv', index=False)

# Write a compact view of the test set: raw text, cleaned text,
# true category and predicted category.
preprocessed_data = test_data.loc[
    :, ['crimeaditionalinfo', 'clean_text', 'category', 'predictions']
]
preprocessed_data.to_csv('final_preprocessed_data.csv', index=False)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Balajisrinath\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Balajisrinath\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.7557398571840277
Precision: 0.7154874223941637
Recall: 0.7557398571840277
F1-Score: 0.7226077395077709
