In [41]:
import numpy as np
import pandas as pd

In [97]:
path = "/kaggle/input/binary-classification-dataset-for-crime-headline/CrimeVsNoCrimeArticles.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,title,is_crime_report
0,What's New and Cool in the Fitness Sphere?,0
1,"'Today I Die,' Says Gang Leader Who Killed Sel...",1
2,Zero Jail Time For Cop Who Assaulted Disabled ...,1
3,Three Easy Gratitude Lessons,0
4,Can Change At UVA Make Campuses A Safer Place?,0


In [105]:
df.columns = {'HeadLine' , 'result'}
df.head()

Unnamed: 0,HeadLine,result
0,What's New and Cool in the Fitness Sphere?,0
1,"'Today I Die,' Says Gang Leader Who Killed Sel...",1
2,Zero Jail Time For Cop Who Assaulted Disabled ...,1
3,Three Easy Gratitude Lessons,0
4,Can Change At UVA Make Campuses A Safer Place?,0


In [106]:
df['result'].value_counts()

result
0    3562
1    3562
Name: count, dtype: int64

In [107]:
df.shape

(7124, 2)

In [109]:
df.isnull().sum()

HeadLine    1
result      0
dtype: int64

In [110]:
df.dropna(inplace=True)

In [111]:
df.isnull().sum()


HeadLine    0
result      0
dtype: int64

**Word2Vec Embeddings with Logistic Regression**

In [113]:
def get_tokens(text):
    text = text.lower()
    
    tokens = text.split()
    return tokens

df['tokens'] = df['HeadLine'].apply(get_tokens)

In [114]:
df.head()

Unnamed: 0,HeadLine,result,tokens
0,What's New and Cool in the Fitness Sphere?,0,"[what's, new, and, cool, in, the, fitness, sph..."
1,"'Today I Die,' Says Gang Leader Who Killed Sel...",1,"['today, i, die,', says, gang, leader, who, ki..."
2,Zero Jail Time For Cop Who Assaulted Disabled ...,1,"[zero, jail, time, for, cop, who, assaulted, d..."
3,Three Easy Gratitude Lessons,0,"[three, easy, gratitude, lessons]"
4,Can Change At UVA Make Campuses A Safer Place?,0,"[can, change, at, uva, make, campuses, a, safe..."


In [115]:
from gensim.models import Word2Vec
model = Word2Vec(df['tokens'],vector_size=100,window=5,min_count=1,workers=10)

In [117]:
model.wv.similarity("die","jail")

0.9882521

In [118]:
len(model.wv.index_to_key)

15817

In [119]:
def get_vec(text):
    count = 0;
    vec = np.zeros(100)
    for word in text:
        if word in model.wv.index_to_key:
            vec = vec + model.wv[word]
            count+=1
    if count > 0:
        vec = vec / count
    return vec

X = np.array([get_vec(text) for text in df["tokens"]])
y = df['result']

In [120]:
X.shape

(7123, 100)

In [121]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.67      0.70      0.69       725
           1       0.68      0.64      0.66       700

    accuracy                           0.67      1425
   macro avg       0.67      0.67      0.67      1425
weighted avg       0.67      0.67      0.67      1425

0.6729824561403509


**Word2Vec Embeddings with RandomForestClassifier**

In [123]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train,y_train)

# Evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.74      0.83      0.78       725
           1       0.80      0.70      0.75       700

    accuracy                           0.77      1425
   macro avg       0.77      0.77      0.77      1425
weighted avg       0.77      0.77      0.77      1425

0.7670175438596492


**LSTM**

In [125]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
stop_words = set(stopwords.words('english'))

In [126]:
def preprocess_text(text):
    text = text.lower()
    words = word_tokenize(text)
    words = [word for word in words if word.isalnum()]
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

df['cleaned'] = df['HeadLine'].apply(preprocess_text)

In [127]:
df.head()

Unnamed: 0,HeadLine,result,tokens,cleaned
0,What's New and Cool in the Fitness Sphere?,0,"[what's, new, and, cool, in, the, fitness, sph...",new cool fitness sphere
1,"'Today I Die,' Says Gang Leader Who Killed Sel...",1,"['today, i, die,', says, gang, leader, who, ki...",die says gang leader killed self shooting fire...
2,Zero Jail Time For Cop Who Assaulted Disabled ...,1,"[zero, jail, time, for, cop, who, assaulted, d...",zero jail time cop assaulted disabled man hosp...
3,Three Easy Gratitude Lessons,0,"[three, easy, gratitude, lessons]",three easy gratitude lessons
4,Can Change At UVA Make Campuses A Safer Place?,0,"[can, change, at, uva, make, campuses, a, safe...",change uva make campuses safer place


In [128]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

In [129]:
##tokenization
max_words=6000 ##vocabulary size
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['cleaned'])
sequences = tokenizer.texts_to_sequences(df['cleaned'])

In [130]:
maxLen = 0
for sequence in sequences:
    maxLen = max(maxLen,len(sequence))
maxLen

13

In [131]:
padded_sequences = pad_sequences(sequences, maxlen=maxLen)

In [132]:
X_train,X_test,y_train,y_test=train_test_split(padded_sequences,df['result'],test_size=0.2,random_state=42)


In [133]:
X_train.shape

(5698, 13)

In [134]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout

model=Sequential([
    Embedding(input_dim=max_words,output_dim=200,input_length=maxLen),
    Bidirectional(LSTM(128)),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])




In [135]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    batch_size=64,
    verbose=2
)

Epoch 1/15
90/90 - 14s - 156ms/step - accuracy: 0.7306 - loss: 0.5116 - val_accuracy: 0.8926 - val_loss: 0.2873
Epoch 2/15
90/90 - 4s - 49ms/step - accuracy: 0.9366 - loss: 0.1882 - val_accuracy: 0.8800 - val_loss: 0.3795
Epoch 3/15
90/90 - 5s - 50ms/step - accuracy: 0.9730 - loss: 0.0890 - val_accuracy: 0.8807 - val_loss: 0.3978
Epoch 4/15
90/90 - 5s - 51ms/step - accuracy: 0.9840 - loss: 0.0591 - val_accuracy: 0.8723 - val_loss: 0.6218
Epoch 5/15
90/90 - 4s - 49ms/step - accuracy: 0.9895 - loss: 0.0313 - val_accuracy: 0.8660 - val_loss: 0.6831
Epoch 6/15
90/90 - 4s - 49ms/step - accuracy: 0.9947 - loss: 0.0174 - val_accuracy: 0.8611 - val_loss: 0.7644
Epoch 7/15
90/90 - 4s - 50ms/step - accuracy: 0.9954 - loss: 0.0145 - val_accuracy: 0.8554 - val_loss: 0.8805
Epoch 8/15
90/90 - 4s - 49ms/step - accuracy: 0.9974 - loss: 0.0096 - val_accuracy: 0.8568 - val_loss: 0.9635
Epoch 9/15
90/90 - 4s - 49ms/step - accuracy: 0.9963 - loss: 0.0100 - val_accuracy: 0.8561 - val_loss: 0.9651
Epoch 10

In [136]:
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {acc:.4f}")


Test Accuracy: 0.8561
