In [14]:
import numpy as np 
import pandas as pd

In [17]:
df = pd.read_csv("Dataset/intent_dataset_ml_diverse.csv")
df.describe()

Unnamed: 0,sentence,intent
count,4800,4800
unique,2021,8
top,ensure dataset has no duplicate rows,REMOVE_DUPLICATES_ROWS
freq,54,600


In [4]:
df.intent.value_counts()

intent
REMOVE_DUPLICATES_ROWS    600
FILL_NA                   600
LABEL_ENCODE_COLUMN       600
STANDARDIZE_COLUMN        600
TYPE_CAST                 600
DROP_COLUMN               600
NORMALIZE_COLUMN          600
REMOVE_OUTLIERS_COLUMN    600
Name: count, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

X = df["sentence"]
y = df["intent"]

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(label_encoder.classes_)


['DROP_COLUMN' 'FILL_NA' 'LABEL_ENCODE_COLUMN' 'NORMALIZE_COLUMN'
 'REMOVE_DUPLICATES_ROWS' 'REMOVE_OUTLIERS_COLUMN' 'STANDARDIZE_COLUMN'
 'TYPE_CAST']


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# =========================
# 2️⃣ Load / Inspect Data
# =========================
# Example:
# df = pd.read_csv("your_data.csv")
# Columns: 'sentence', 'intent'


df = df.dropna(subset=["sentence", "intent"])
df = df[df["sentence"].str.strip() != ""]
df = df.reset_index(drop=True)


print(df['intent'].value_counts())

X = df["sentence"]
y = df["intent"]


label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print("Classes:", label_encoder.classes_)


X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42
)


X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)


vectorizer = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1,2),
    min_df=3,
    max_df=0.9,
    sublinear_tf=True
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("TF-IDF train shape:", X_train_tfidf.shape)
print("TF-IDF test shape:", X_test_tfidf.shape)


clf = LogisticRegression(
    C=1.0,
    max_iter=1000,
    class_weight="balanced",
    solver="liblinear"
)

clf.fit(X_train_tfidf, y_train)


y_pred = clf.predict(X_test_tfidf)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Optional confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", cm)


def inspect_sample(i, X, X_tfidf, y, vectorizer):
    """
    Inspect a single sample in train/test:
    - Original sentence
    - Encoded label
    - TF-IDF tokens with weights
    """
    print("Sentence:\n", X.iloc[i])
    print("Label (encoded):", y[i])
    print("Label (decoded):", label_encoder.inverse_transform([y[i]])[0])

    row = X_tfidf[i]
    feature_names = vectorizer.get_feature_names_out()
    indices = row.indices
    values = row.data

    print("\nTF-IDF tokens and weights:")
    for token, value in zip([feature_names[idx] for idx in indices], values):
        print(token, ":", round(value, 4))

# Example: inspect row 1
inspect_sample(1, X_train, X_train_tfidf, y_train, vectorizer)


intent
REMOVE_DUPLICATES_ROWS    600
FILL_NA                   600
LABEL_ENCODE_COLUMN       600
STANDARDIZE_COLUMN        600
TYPE_CAST                 600
DROP_COLUMN               600
NORMALIZE_COLUMN          600
REMOVE_OUTLIERS_COLUMN    600
Name: count, dtype: int64
Classes: ['DROP_COLUMN' 'FILL_NA' 'LABEL_ENCODE_COLUMN' 'NORMALIZE_COLUMN'
 'REMOVE_DUPLICATES_ROWS' 'REMOVE_OUTLIERS_COLUMN' 'STANDARDIZE_COLUMN'
 'TYPE_CAST']
TF-IDF train shape: (3840, 807)
TF-IDF test shape: (960, 807)

Classification Report:

                        precision    recall  f1-score   support

           DROP_COLUMN       1.00      1.00      1.00       120
               FILL_NA       1.00      1.00      1.00       120
   LABEL_ENCODE_COLUMN       1.00      1.00      1.00       120
      NORMALIZE_COLUMN       1.00      1.00      1.00       120
REMOVE_DUPLICATES_ROWS       1.00      1.00      1.00       120
REMOVE_OUTLIERS_COLUMN       1.00      1.00      1.00       120
    STANDARDIZE_COLUMN       1

In [9]:
import joblib

# Save vectorizer
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

# Save trained classifier
joblib.dump(clf, "logistic_classifier.pkl")

# Save label encoder
joblib.dump(label_encoder, "label_encoder.pkl")


['label_encoder.pkl']

In [10]:
# Load saved objects
import joblib

vectorizer = joblib.load("tfidf_vectorizer.pkl")
clf = joblib.load("logistic_classifier.pkl")
label_encoder = joblib.load("label_encoder.pkl")


In [11]:
new_sentences = ["fillna with mean"]

# Transform new sentences
X_new_tfidf = vectorizer.transform(new_sentences)

# Predict encoded labels
y_pred_encoded = clf.predict(X_new_tfidf)
y_pred_labels = label_encoder.inverse_transform(y_pred_encoded)

# Predict probabilities
y_pred_proba = clf.predict_proba(X_new_tfidf)

for sentence, label, proba, pred_encoded in zip(new_sentences, y_pred_labels, y_pred_proba, y_pred_encoded):
    print(f"Sentence: {sentence}")
    print(f"Predicted Intent: {label}")
    
    # Probability of the guessed intent only
    predicted_prob = round(proba[pred_encoded], 3)
    print(f"Probability of predicted intent: {predicted_prob}")
    print("------")




Sentence: fillna with mean
Predicted Intent: FILL_NA
Probability of predicted intent: 0.764
------
