In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Fetch the NLTK resources used by preprocess_text: the stop-word lists and the
# Punkt tokenizer data behind word_tokenize. NLTK 3.9+ loads the 'punkt_tab'
# tables while older releases load the pickled 'punkt' models, so both are
# requested to keep the notebook runnable on either.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load dataset
data = pd.read_csv('/mnt/data/new.csv')

# Preprocess text data
_ENGLISH_STOPWORDS = None  # populated on first use by _english_stopwords()


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Lower-case, tokenize and filter a piece of text.

    Tokens are produced by ``word_tokenize``; only alphanumeric tokens that
    are not English stop words are kept.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        The surviving tokens joined by single spaces ("" when nothing is left).
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Set membership avoids rebuilding and scanning the stop-word list per token.
    stop_words = _english_stopwords()
    tokens = word_tokenize(str(text).lower())
    return " ".join(
        word for word in tokens
        if word.isalnum() and word not in stop_words
    )

# Run every entry of the text column through preprocess_text; the cleaned
# text replaces the original values in the same column.
cleaned_info = data['processed_info'].map(preprocess_text)
data['processed_info'] = cleaned_info

# Split data into training and testing sets (33% held out, fixed seed)
train_data, test_data = train_test_split(
    data,
    random_state=42,
    test_size=0.33,
)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

# Hyperparameters
max_vocab, embedding_dim, max_length = 10000, 64, 100

# Tokenization and padding
# The tokenizer is fitted on the training texts only.
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(train_data['processed_info'])


def _texts_to_padded(texts):
    """Turn texts into tokenizer index sequences passed through pad_sequences with maxlen=max_length."""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_length)


X_train = _texts_to_padded(train_data['processed_info'])
X_test = _texts_to_padded(test_data['processed_info'])

# Encode target labels
# factorize() returns (integer codes, unique values). The unique values are
# kept so that a predicted class index can be mapped back to its label name.
category_labels, category_classes = train_data['category'].factorize()
subcategory_labels, subcategory_classes = train_data['sub_category'].factorize()

# factorize() encodes missing values as -1, which is not a valid class index
# for sparse_categorical_crossentropy, so fail early with a clear message.
if (category_labels < 0).any() or (subcategory_labels < 0).any():
    raise ValueError(
        "train_data has missing 'category' or 'sub_category' values; "
        "drop or fill them before training."
    )

# LSTM Model for predicting both category and subcategory
input_layer = Input(shape=(max_length,))
embedding = Embedding(max_vocab, embedding_dim)(input_layer)
lstm = LSTM(128, return_sequences=True)(embedding)
lstm_output = LSTM(64)(lstm)

# Separate outputs: one softmax head per target, sized by its number of classes
category_output = Dense(len(category_classes), activation='softmax', name="category_output")(lstm_output)
subcategory_output = Dense(len(subcategory_classes), activation='softmax', name="subcategory_output")(lstm_output)

model = Model(inputs=input_layer, outputs=[category_output, subcategory_output])
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    # Metrics are keyed by output name: Keras 3 rejects a flat metrics list
    # whose length differs from the number of model outputs.
    metrics={
        'category_output': ['accuracy'],
        'subcategory_output': ['accuracy'],
    },
)
model.fit(X_train, [category_labels, subcategory_labels], epochs=5, validation_split=0.2)


In [None]:
import streamlit as st

# Label names in the same order as the integer codes the model was trained on:
# factorize() numbers the labels by first appearance in train_data, and its
# second return value lists them in that order. (category_labels and
# subcategory_labels are plain integer arrays and cannot decode predictions.)
category_names = train_data['category'].factorize()[1]
subcategory_names = train_data['sub_category'].factorize()[1]

st.title("Crime Category & Subcategory Classification")
user_input = st.text_input("Enter crime description:")
if user_input:
    # Apply the same preprocessing and tokenization used for the training data.
    preprocessed_input = preprocess_text(user_input)
    tokenized_input = pad_sequences(tokenizer.texts_to_sequences([preprocessed_input]), maxlen=max_length)
    category_pred, subcategory_pred = model.predict(tokenized_input)

    # A single text was submitted, so argmax over the whole array is the
    # index of the most probable class for that text.
    category = category_names[category_pred.argmax()]
    subcategory = subcategory_names[subcategory_pred.argmax()]

    st.write(f"Predicted Category: {category}")
    st.write(f"Predicted Subcategory: {subcategory}")


In [None]:
# Define function to evaluate model
def evaluate_model(model):
    """Evaluate a two-output model on the held-out test split and print its accuracies.

    The test labels are converted to the integer codes used for training
    (position of each label in ``train_data[...].factorize()[1]``). Test rows
    whose category or sub-category never occurs in the training split have no
    such code and are left out of the evaluation.

    Args:
        model: A compiled Keras model with two outputs (category first,
            sub-category second) that tracks an 'accuracy' metric per output.

    Returns:
        The dict returned by ``model.evaluate(..., return_dict=True)``.

    Raises:
        ValueError: If no test row has both labels present in the training data.
    """
    category_codes = train_data['category'].factorize()[1].get_indexer(test_data['category'])
    subcategory_codes = train_data['sub_category'].factorize()[1].get_indexer(test_data['sub_category'])

    # get_indexer() returns -1 for labels that are missing from the training uniques.
    known = (category_codes >= 0) & (subcategory_codes >= 0)
    if not known.any():
        raise ValueError("No test rows have labels that were seen during training.")
    skipped = len(known) - int(known.sum())
    if skipped:
        print(f"Skipping {skipped} test rows with labels not seen during training.")

    results = model.evaluate(
        X_test[known],
        [category_codes[known], subcategory_codes[known]],
        return_dict=True,
    )

    # Read the metrics by name: in the positional result list, index 1 is the
    # first output's loss, not its accuracy.
    category_key, subcategory_key = (f"{name}_accuracy" for name in model.output_names[:2])
    print(f"Category Accuracy: {results[category_key]}, Subcategory Accuracy: {results[subcategory_key]}")
    return results

# Example evaluation
evaluate_model(model)  # Use different model names to evaluate each one
