In [2]:

import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [3]:
# 2_feature_engineering.ipynb
# Simplified feature engineering pipeline for NeSy Protect project


# === Setup Paths ===
# BASE_DIR is the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from a sub-folder of the
# project root (e.g. notebooks/) - confirm for your layout.
BASE_DIR = os.path.dirname(os.getcwd())
# Join only the file name. The previous version passed a machine-specific
# absolute "C:/Users/..." path as the last component: on Windows os.path.join
# then discards BASE_DIR entirely, and on other platforms it produces a
# non-existent nested path.
DATA_PATH = os.path.join(BASE_DIR, 'data', 'interim', 'cleaned_india_terrorism.csv')
PROCESSED_DIR = os.path.join(BASE_DIR, 'data', 'processed')
# Create the output folder up front; exist_ok makes re-running the cell safe.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# === Load Data ===
df = pd.read_csv(DATA_PATH)

# === Generate Threat Level Labels (Binary) ===
# Incidents with at least this many casualties are labelled 'High'.
HIGH_THREAT_MIN_CASUALTIES = 5
# Vectorized threshold test. A missing (NaN) casualty count compares False and
# is therefore labelled 'Low', the same outcome as the earlier row-wise check.
df['threat_level'] = np.where(
    df['casualties'] >= HIGH_THREAT_MIN_CASUALTIES, 'High', 'Low'
)

# === Encode Labels ===
# LabelEncoder assigns integer codes in sorted class order, so when both
# classes are present the mapping is High=0, Low=1 (NOT High=1, Low=0).
# Decode with label_encoder.inverse_transform / label_encoder.classes_ rather
# than assuming that 1 means 'High'.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['threat_level'])

# === Train-Test Split ===
# Split the raw summaries BEFORE fitting TF-IDF, so that the vocabulary and
# IDF weights are learned from training rows only. Fitting on the full corpus
# first would leak test-set statistics into the training features.
# The split parameters (test_size, random_state) are unchanged.
y = df['label']
# fillna('') first: astype(str) on its own turns a missing summary into the
# literal text "nan", which would then be tokenized as a real word.
summaries = df['summary'].fillna('').astype(str)
text_train, text_test, y_train, y_test = train_test_split(
    summaries, y, test_size=0.2, random_state=42
)

# === Text Vectorization ===
# Fit on the training text only; the test text is transformed with the
# already-fitted vocabulary. Results are converted to dense arrays.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train = tfidf.fit_transform(text_train).toarray()
X_test = tfidf.transform(text_test).toarray()

# === Save TF-IDF model ===
with open(os.path.join(PROCESSED_DIR, 'tfidf_vectorizer.pkl'), 'wb') as f:
    pickle.dump(tfidf, f)

# === Save Label Encoder ===
with open(os.path.join(PROCESSED_DIR, 'label_encoder.pkl'), 'wb') as f:
    pickle.dump(label_encoder, f)

# === Save Features ===
# Write each split to PROCESSED_DIR as <name>.npy, in the order listed here.
split_arrays = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for split_name, split_values in split_arrays.items():
    target_file = os.path.join(PROCESSED_DIR, split_name + '.npy')
    np.save(target_file, split_values)

print("✅ Feature engineering complete. Files saved in /data/processed")


✅ Feature engineering complete. Files saved in /data/processed
