## Load the data



In [None]:
import pandas as pd

# Read the Delhi crime-news CSV into a DataFrame and preview the first rows.
df = pd.read_csv('delhi_crime_news_final_2015_2024.csv')
display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only appends a stray "None" to the output.
df.info()

Unnamed: 0,crime_type,article,crime_severity
0,Theft,"On the morning of 2019-07-14, a brazen theft u...",0
1,Assault,The late-night assault in Dwarka on 2017-11-03...,1
2,Robbery,"On 2020-02-19, a high-profile robbery occurred...",1
3,Burglary,A quiet residential burglary on 2018-08-22 in ...,1
4,Vandalism,"In 2016-05-12, a spree of vandalism hit Chandn...",0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   crime_type      1000 non-null   object
 1   article         1000 non-null   object
 2   crime_severity  1000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 23.6+ KB


None

## Select and load NER model


In [None]:
import spacy

# Name of the spaCy pipeline package that supplies the entity recogniser.
SPACY_MODEL_NAME = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL_NAME)

In [None]:
# Run the spaCy pipeline over every article and keep, per article, a list of
# (entity text, entity label) pairs in the order spaCy reports them.
# nlp.pipe streams the texts through the pipeline in batches instead of
# invoking nlp() once per article; the Docs come back in input order, so
# doc_entities[i] still belongs to row i of df.
doc_entities = [
    [(ent.text, ent.label_) for ent in doc.ents]
    for doc in nlp.pipe(df['article'])
]

## Extract location and date




In [None]:
# Keep only the entities labelled 'GPE' or 'DATE' for each article, preserving
# both the article order and the order of mentions inside an article.
wanted_labels = ['GPE', 'DATE']
filtered_entities = [
    [(text, label) for text, label in entities if label in wanted_labels]
    for entities in doc_entities
]

## Store and display results


In [None]:
def _join_unique(mentions):
    """Join entity mentions with ", ", dropping repeats but keeping first-seen order.

    An article with no matching mentions yields the empty string.
    """
    # dict.fromkeys keeps insertion order, so it de-duplicates without reordering.
    return ", ".join(dict.fromkeys(mentions))


# Collapse each article's 'GPE' mentions into one location string and its
# 'DATE' mentions into one date string. Repeated mentions of the same text
# (e.g. the same place named several times in one article) are listed once
# rather than being concatenated again for every occurrence.
locations = [
    _join_unique(text for text, label in article_entities if label == 'GPE')
    for article_entities in filtered_entities
]
dates = [
    _join_unique(text for text, label in article_entities if label == 'DATE')
    for article_entities in filtered_entities
]

# One entry per article, in the same order as the rows of df.
df['extracted_location'] = locations
df['extracted_date'] = dates

display(df.head())

Unnamed: 0,crime_type,article,crime_severity,extracted_location,extracted_date
0,Theft,"On the morning of 2019-07-14, a brazen theft u...",0,Delhi,"2019-07-14, daily, the following week"
1,Assault,The late-night assault in Dwarka on 2017-11-03...,1,"Dwarka, Delhi","2017-11-03, 29-year-old, Sector 12’s"
2,Robbery,"On 2020-02-19, a high-profile robbery occurred...",1,"Saket, Saket, South Delhi, Saket",2020-02-19
3,Burglary,A quiet residential burglary on 2018-08-22 in ...,1,Delhi,"2018-08-22, earlier in the week"
4,Vandalism,"In 2016-05-12, a spree of vandalism hit Chandn...",0,,2016-05-12


# Task
Analyze the `delhi_crime_news_details` dataset (loaded in this notebook from `delhi_crime_news_final_2015_2024.csv`) to predict `crime_severity` from the article text using an LSTM model.

## Preprocess the text data




In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build a word -> integer-index vocabulary from the article text, then encode
# every article as a list of those indices.
# NOTE(review): the vocabulary is fitted on every row of df, including the rows
# that are later held out as the test split.
article_texts = df['article']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(article_texts)
sequences = tokenizer.texts_to_sequences(article_texts)

In [None]:
# Pad every encoded article at the end (padding='post') so that all of them
# share the length of the longest article.
max_sequence_length = max(map(len, sequences))
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding='post',
)

In [None]:
# Number of distinct words known to the tokenizer, plus one; used as the
# Embedding layer's input_dim when the model is built.
# NOTE(review): the +1 presumably reserves index 0 for padding — confirm.
vocabulary_size = len(tokenizer.word_index) + 1

## Split the data


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the articles for testing. random_state is fixed so the same
# rows land in each split every time this cell runs.
severity_labels = df['crime_severity']
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    severity_labels,
    test_size=0.25,
    random_state=42,
)

## Build the LSTM model



In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

# Seed the Python, NumPy and TensorFlow random generators before any weights
# are created, so repeated runs start from the same initial state.
tf.keras.utils.set_random_seed(42)

# One output unit per distinct severity label seen in the training split.
num_classes = len(y_train.unique())

# Bidirectional LSTM text classifier over the padded word-index sequences.
# The input shape is declared with an explicit Input layer rather than the
# Embedding `input_length` argument (deprecated in Keras 3); declaring it up
# front also means the model is built by the time summary() is called, so the
# summary can report output shapes and parameter counts.
model = Sequential([
    tf.keras.Input(shape=(max_sequence_length,), dtype='int32'),
    Embedding(input_dim=vocabulary_size, output_dim=128),
    SpatialDropout1D(0.3),
    Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3)),
    Dense(num_classes, activation='softmax'),
])

# Labels are passed as integer class ids (not one-hot vectors), hence the
# sparse variant of categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()



## Train the model


In [None]:
# Stop once validation loss has failed to improve for 5 consecutive epochs and
# roll the model back to the weights from its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Train for at most 10 epochs, using 20% of the training split for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 3s/step - accuracy: 0.9526 - loss: 0.1668 - val_accuracy: 0.8000 - val_loss: 0.5327
Epoch 2/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 3s/step - accuracy: 0.9667 - loss: 0.1354 - val_accuracy: 0.8133 - val_loss: 0.6401
Epoch 3/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 3s/step - accuracy: 0.9758 - loss: 0.0920 - val_accuracy: 0.8067 - val_loss: 0.6207
Epoch 4/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 3s/step - accuracy: 0.9848 - loss: 0.0565 - val_accuracy: 0.8000 - val_loss: 0.6536
Epoch 5/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 3s/step - accuracy: 0.9804 - loss: 0.0674 - val_accuracy: 0.7600 - val_loss: 0.6557
Epoch 6/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 3s/step - accuracy: 0.9793 - loss: 0.0766 - val_accuracy: 0.7800 - val_loss: 0.9531


In [None]:
import pandas as pd

# Re-reads the same CSV that the first cell of the notebook loads. This rebinds
# df, so the extracted_location / extracted_date columns added to the earlier
# df are not present on the frame from this point on.
df = pd.read_csv('delhi_crime_news_final_2015_2024.csv')
display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only appends a stray "None" to the output.
df.info()

Unnamed: 0,crime_type,article,crime_severity
0,Theft,"On the morning of 2019-07-14, a brazen theft u...",0
1,Assault,The late-night assault in Dwarka on 2017-11-03...,1
2,Robbery,"On 2020-02-19, a high-profile robbery occurred...",1
3,Burglary,A quiet residential burglary on 2018-08-22 in ...,1
4,Vandalism,"In 2016-05-12, a spree of vandalism hit Chandn...",0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   crime_type      1000 non-null   object
 1   article         1000 non-null   object
 2   crime_severity  1000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 23.6+ KB


None

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Repeats the tokenisation step from the "Preprocess the text data" section:
# fit a fresh word -> index vocabulary on all articles and re-encode them.
article_texts = df['article']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(article_texts)
sequences = tokenizer.texts_to_sequences(article_texts)

In [None]:
# Repeats the padding step: pad at the end (padding='post') up to the length of
# the longest encoded article.
max_sequence_length = max(map(len, sequences))
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding='post',
)

In [None]:
# Recomputed from the tokenizer currently in scope: number of distinct words in
# its word_index, plus one.
# NOTE(review): the +1 presumably reserves index 0 for padding — confirm.
vocabulary_size = len(tokenizer.word_index) + 1

In [None]:
from sklearn.model_selection import train_test_split

# Repeats the 75/25 split with the same random_state as the "Split the data"
# section, rebinding X_train, X_test, y_train and y_test.
severity_labels = df['crime_severity']
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    severity_labels,
    test_size=0.25,
    random_state=42,
)

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Another copy of the tokenisation step; it needs df to have been loaded by an
# earlier cell in the same kernel session.
article_texts = df['article']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(article_texts)
sequences = tokenizer.texts_to_sequences(article_texts)

NameError: name 'df' is not defined

In [None]:
# Another copy of the padding step; it needs `sequences` from the tokenisation
# cell to exist in the kernel.
max_sequence_length = max(map(len, sequences))
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding='post',
)

NameError: name 'sequences' is not defined

In [None]:
# Number of distinct words in the current tokenizer's word_index, plus one.
# NOTE(review): the +1 presumably reserves index 0 for padding — confirm.
vocabulary_size = len(tokenizer.word_index) + 1

In [None]:
from sklearn.model_selection import train_test_split

# Another copy of the 75/25 split (same random_state); it needs
# padded_sequences and df to exist in the kernel.
severity_labels = df['crime_severity']
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    severity_labels,
    test_size=0.25,
    random_state=42,
)

NameError: name 'padded_sequences' is not defined

## Evaluate the model



In [None]:
# Score the trained model on the held-out test split. evaluate() returns the
# loss followed by the metric passed to compile() (accuracy).
loss, accuracy = model.evaluate(X_test, y_test)
print(
    f'Test Loss: {loss:.4f}',
    f'Test Accuracy: {accuracy:.4f}',
    sep='\n',
)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 461ms/step - accuracy: 0.8317 - loss: 0.4865
Test Loss: 0.4409
Test Accuracy: 0.8480


## Make predictions



In [None]:
# Per-class scores from the softmax output layer for every test article.
predictions = model.predict(X_test)
# The predicted class of each article is the index of its highest score.
best_class_index = tf.argmax(predictions, axis=1)
predicted_classes = best_class_index.numpy()

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 653ms/step
