# Building the Model

## Imports

In [1]:
import pandas as pd
import sys
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


# Fixing routing issue
project_root = os.path.abspath('..')
sys.path.append(project_root)

## Load and Prep Data

In [None]:
df = pd.read_csv("../data/processed/golden_intent_labeled.csv")

# Quick peep
df[['cleaned_text', 'intent']].head()

Unnamed: 0,cleaned_text,intent
0,whats that egg website people talk about,Other
1,why ios,Other
2,we can assist you we recommend updating to io...,Other
3,thats better than having an unstable connecti...,Other
4,is probably one of the best airlines ive ever...,Other


## Split Data

In [3]:
X = df['cleaned_text']
Y = df['intent']

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

X_train = X_train.fillna("")
X_test = X_test.fillna("")


## Model Creation and Evaluation

In [6]:
from scripts.evaluate_model import evaluate_model

# Baseline
evaluate_model(df)

# With tweet length
evaluate_model(df, extra_feature_names=['tweet_length'])

# With question flag
evaluate_model(df, extra_feature_names=['is_question'])

# With sentiment
evaluate_model(df, extra_feature_names=['sentiment_score'])



=== Evaluation with features ['TF-IDF'] ===

                    precision    recall  f1-score   support

Account Management       0.93      0.69      0.79        94
           Billing       1.00      0.30      0.46        73
         Complaint       1.00      0.18      0.31        33
             Other       0.88      1.00      0.93      1369
  Praise/Thank You       0.95      0.81      0.87       258
 Technical Support       1.00      0.67      0.80       155

          accuracy                           0.90      1982
         macro avg       0.96      0.61      0.70      1982
      weighted avg       0.91      0.90      0.88      1982


=== Evaluation with features ['TF-IDF', 'tweet_length'] ===

                    precision    recall  f1-score   support

Account Management       0.93      0.72      0.81        94
           Billing       1.00      0.27      0.43        73
         Complaint       1.00      0.18      0.31        33
             Other       0.88      1.00      0.9

(4282      Praise/Thank You
 7615                 Other
 9703                 Other
 2955      Praise/Thank You
 7372      Praise/Thank You
                ...        
 3559     Technical Support
 4822      Praise/Thank You
 1732     Technical Support
 4079    Account Management
 4636                 Other
 Name: intent, Length: 1982, dtype: object,
 4484     Technical Support
 9102                 Other
 8115    Account Management
 3465                 Other
 1353                 Other
                ...        
 1849                 Other
 3366      Praise/Thank You
 6089      Praise/Thank You
 3940                 Other
 9806                 Other
 Name: intent, Length: 7928, dtype: object)