# Building the Model

## Imports and Data Loading

In [29]:
import os
import sys

import pandas as pd
from sklearn.model_selection import train_test_split

# Fixing routing issue: the parent directory must be on sys.path *before*
# anything is imported from `scripts`, otherwise a fresh kernel cannot resolve
# the package (presumably `scripts/` lives one level above this notebook --
# confirm against the repo layout).
project_root = os.path.abspath('..')
if project_root not in sys.path:
    # Guarded so re-running this cell does not pile up duplicate entries.
    sys.path.append(project_root)

# Project-local import; intentionally placed after the sys.path adjustment.
from scripts.evaluate_model import evaluate_model_fixed_split  # noqa: E402

# Loads the labeled dataset used by every cell below.
df = pd.read_csv("../data/processed/golden_intent_labeled.csv")

## Split Data

In [30]:
# Model input is the cleaned text column; the target is the intent label.
X = df['cleaned_text']
Y = df['intent']

# Hold out 20% of the rows, stratified on the label, with a fixed seed so the
# split is reproducible across runs.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    Y,
    stratify=Y,
    random_state=42,
    test_size=0.2,
)

# Replace missing text with empty strings in both partitions.
X_train, X_test = (part.fillna("") for part in (X_train, X_test))


## Model Creation and Evaluation

In [31]:
# Reload the data and rebuild the fixed, stratified 80/20 split so the
# evaluation below does not depend on the state left by the earlier split cell.
df = pd.read_csv("../data/processed/golden_intent_labeled.csv")
x = df['cleaned_text'].fillna('')
y = df['intent']
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

# Each entry is (human-readable label, extra feature column names passed to
# the evaluator alongside the text features).
feature_config = [
    ('TF-IDF', []),
    ('TF-IDF + tweet_length', ['tweet_length']),
    ('TF-IDF + is_question', ['is_question']),
    ('TF-IDF + sentiment', ['sentiment_score']),
]

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before the loop (no-op when it is already there).
output_dir = "../data/features"
os.makedirs(output_dir, exist_ok=True)

# Run eval per feature set and export separately
for label, features in feature_config:
    eval_df = evaluate_model_fixed_split(
        df, X_train, X_test, y_train, y_test, extra_feature_names=features
    )
    # Tag every result row with the feature set that produced it.
    eval_df['feature_set'] = label
    # Turn the label into a filesystem-friendly slug,
    # e.g. 'TF-IDF + sentiment' -> 'tf_idf_plus_sentiment'.
    safe_label = label.lower().replace(' ', '_').replace('+', 'plus').replace('-', '_')
    eval_df.to_csv(f"{output_dir}/eval_{safe_label}.csv", index=False)
    print(f"Saved: eval_{safe_label}.csv")

Saved: eval_tf_idf.csv
Saved: eval_tf_idf_plus_tweet_length.csv
Saved: eval_tf_idf_plus_is_question.csv
Saved: eval_tf_idf_plus_sentiment.csv
