In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# --- File locations (relative paths, resolved against the working directory) ---
CSV_PATH = "updated_data.csv"  # input: the new dataset
TRAIN_PATH = "train.csv"       # output: training partition
TEST_PATH = "test.csv"         # output: test partition

# --- Reproducibility ---
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)  # seed NumPy's legacy global random state


In [None]:
# Preprocessing: build the combined, lower-cased text column used for NLP

In [None]:
def preprocess(df):
    """Return a cleaned copy of ``df`` with a combined ``text_for_nlp`` column.

    The input frame is left untouched. The ``"Unnamed: 9"`` column is dropped
    when present, and the descriptive columns are joined into one lower-cased
    string per row, shaped like::

        <scheme_name>. <details>. <benefits>. <eligibility>. <application>. <documents>. Tags: <tags>

    Missing values are treated as empty strings so that they do not inject the
    literal token ``"nan"`` into the text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``scheme_name``, ``details``, ``benefits``,
        ``eligibility``, ``application``, ``documents`` and ``tags``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``text_for_nlp`` column.

    Raises
    ------
    KeyError
        If one or more of the required columns is absent.
    """
    body_columns = [
        "scheme_name",
        "details",
        "benefits",
        "eligibility",
        "application",
        "documents",
    ]
    tags_column = "tags"

    # Report every missing column at once instead of failing on the first lookup.
    missing = [c for c in body_columns + [tags_column] if c not in df.columns]
    if missing:
        raise KeyError(f"preprocess() is missing required column(s): {missing}")

    df = df.copy()
    if "Unnamed: 9" in df.columns:
        df = df.drop(columns=["Unnamed: 9"])

    def _as_text(column):
        # Fill before casting: astype(str) alone would render NaN as "nan".
        return df[column].fillna("").astype(str)

    text = _as_text(body_columns[0])
    for column in body_columns[1:]:
        text = text + ". " + _as_text(column)
    text = text + ". " + "Tags: " + _as_text(tags_column)

    df["text_for_nlp"] = text.str.lower()
    return df


In [None]:
# Load, preprocess, split, and save

In [None]:
# Fail fast with an actionable message when the input dataset is missing.
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(
        f"{CSV_PATH} not found in project folder. Please place updated_data.csv there."
    )

# Read the raw CSV and run it through preprocess() in one step.
df = preprocess(pd.read_csv(CSV_PATH))

# Shuffled 80/20 split; the fixed random_state keeps the partition repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_SEED,
)

# Write both partitions to disk without the pandas index column.
train_df.to_csv(TRAIN_PATH, index=False)
test_df.to_csv(TEST_PATH, index=False)

print(f"✅ Data prepared: {len(train_df)} train, {len(test_df)} test.")
