# My Finance - Category Classification Model
## TF-IDF + SVM for Vietnamese Transaction Text

**Cách upload file `training_data.json`:**
1. Upload lên Google Drive
2. Chạy cell mount Drive bên dưới
3. Hoặc dùng URL nếu file đã public

In [None]:
# Install dependencies
# Notebook shell escape: installs the third-party packages imported by the
# cells below (scikit-learn, pandas, matplotlib, seaborn).
!pip install scikit-learn pandas matplotlib seaborn

In [None]:
# === OPTION 1: Mount Google Drive (Recommended) ===
from google.colab import drive
drive.mount('/content/drive')

# Copy the file from Drive into the current Colab working directory.
# Adjust the path to match where the file is stored on your Drive.
!cp "/content/drive/My Drive/training_data.json" .

# === OPTION 2: Direct upload (may fail with large files) ===
# from google.colab import files
# uploaded = files.upload()

In [None]:
import json
import re
import unicodedata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Text preprocessing for Vietnamese
# Chat shorthand ("teencode") -> canonical word. Applied to whole
# whitespace-separated tokens only, so "50k" is left untouched.
TEENCODE_MAP = {
    "k": "không", "ko": "không", "k0": "không",
    "dc": "được", "đc": "được",
    "vs": "với", "j": "gì", "z": "vậy", "r": "rồi",
    "cf": "cafe", "coffe": "coffee", "cofee": "coffee",
}

# Common misspellings -> canonical spelling. Keys may span several words.
TYPO_MAP = {
    "grap": "grab", "grabs": "grab",
    "shoppee": "shopee", "lazda": "lazada",
    "hoá đơn": "hóa đơn", "cà fê": "cà phê",
    "ca phe": "cà phê", "tra sua": "trà sữa",
}


def preprocess_text(text) -> str:
    """Normalize a raw transaction description for TF-IDF vectorization.

    Steps, in order:
      1. Unicode NFC normalization and lowercasing.
      2. Replace every character that is not a word character or whitespace
         with a space, then collapse whitespace.
      3. Replace whole-word matches of the TYPO_MAP keys.
      4. Replace whole tokens found in TEENCODE_MAP.

    Args:
        text: Raw description. Falsy values (None, "", 0) yield "".
            Other non-string values are converted with ``str()``.

    Returns:
        The cleaned, single-space-separated, lowercase text.
    """
    if not text:
        return ""
    if not isinstance(text, str):
        # JSON records may carry numbers; normalize() only accepts str.
        text = str(text)

    # Normalize unicode so composed/decomposed Vietnamese letters compare equal
    text = unicodedata.normalize("NFC", text).lower()

    # Remove special chars but keep Vietnamese.
    # This runs BEFORE the dictionary lookups so that tokens glued to
    # punctuation ("cf,", "ko.", "grap!") are still recognised.
    text = re.sub(r"[^\w\sàáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"\s+", " ", text).strip()

    # Fix typos. The lookarounds restrict matches to whole words, so "grap"
    # is corrected but "grapefruit" is not turned into "grabefruit".
    for typo, fix in TYPO_MAP.items():
        typo = unicodedata.normalize("NFC", typo)
        pattern = r"(?<!\w)" + re.escape(typo) + r"(?!\w)"
        text = re.sub(pattern, fix, text)

    # Fix teencode, token by token
    words = [TEENCODE_MAP.get(w, w) for w in text.split()]
    return " ".join(words)


print(preprocess_text("Ăn phở 50k"))
print(preprocess_text("đi grap 30k"))
print(preprocess_text("cf vs bạn"))

In [None]:
# Load training data
with open('training_data.json', 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# The cells below iterate raw_data as a list of record dicts and index
# raw_data[0]; fail here with a readable message instead of a bare
# IndexError/KeyError when the file has a different shape or is empty.
if not isinstance(raw_data, list) or not raw_data:
    raise ValueError(
        "training_data.json must contain a non-empty JSON array of records"
    )

print(f"Total records: {len(raw_data)}")
print(f"Sample: {raw_data[0]}")

In [None]:
# Parse data
# Only records whose label is one of these categories are kept.
CATEGORIES = [
    "income", "food", "transportation", "entertainment", "shopping",
    "health", "education", "utilities", "home", "personal",
    "travel", "investment", "family", "houseware", "donation", "charity", "other"
]

texts = []
labels = []

for record in raw_data:
    text = record.get('text', '')
    # Prefer the corrected label, but fall back to the original one when the
    # corrected field is missing, null or empty. dict.get's default alone only
    # covers the "missing key" case, so records with a null correctedCategory
    # would otherwise be dropped.
    category = record.get('correctedCategory') or record.get('category', '')

    if not text or category not in CATEGORIES:
        continue

    processed = preprocess_text(text)
    if not processed:
        # Nothing left after cleaning (e.g. punctuation only): the sample
        # would contribute a label with no features.
        continue

    texts.append(processed)
    labels.append(category)

print(f"Valid samples: {len(texts)}")

# Category distribution
category_counts = Counter(labels)
print("\nCategory distribution:")
for cat, count in category_counts.most_common():
    print(f"  {cat}: {count}")

In [None]:
# === DATASET SUMMARY TABLE ===

# One row per sample: cleaned text, its label, and its whitespace token count.
df = pd.DataFrame({'text': texts, 'category': labels})
df['word_count'] = [len(sample.split()) for sample in df['text']]

# Per-category sample count and word-count statistics, rounded to 1 decimal.
per_category = df.groupby('category').agg(
    samples=('text', 'count'),
    avg_words=('word_count', 'mean'),
    min_words=('word_count', 'min'),
    max_words=('word_count', 'max'),
).round(1)

# Share of the whole dataset held by each category, in percent.
per_category['percent'] = (per_category['samples'] / len(df) * 100).round(1)

# Final presentation table: fixed column order, display names, largest first.
summary = (
    per_category[['samples', 'percent', 'avg_words', 'min_words', 'max_words']]
    .rename(columns={
        'samples': 'Samples',
        'percent': '%',
        'avg_words': 'Avg Words',
        'min_words': 'Min',
        'max_words': 'Max',
    })
    .sort_values('Samples', ascending=False)
)

print(f"Total: {len(df)} samples, {len(category_counts)} categories\n")
print(summary.to_string())

In [None]:
# Data analysis - check for imbalanced data
print("=== DATA QUALITY CHECK ===\n")

# 1. Imbalance ratio: size of the largest category over the smallest one.
#    On ties the first category in counter order wins.
largest_cat, max_count = max(category_counts.items(), key=lambda kv: kv[1])
smallest_cat, min_count = min(category_counts.items(), key=lambda kv: kv[1])
print(f"Imbalance ratio: {max_count/min_count:.1f}x")
print(f"Max category: {largest_cat} ({max_count})")
print(f"Min category: {smallest_cat} ({min_count})")

# 2. Distribution of text length, measured in whitespace-separated tokens.
text_lengths = [len(sample.split()) for sample in texts]
mean_len = np.mean(text_lengths)
shortest, longest = min(text_lengths), max(text_lengths)
print(f"\nText length (words): mean={mean_len:.1f}, min={shortest}, max={longest}")

# 3. Categories that have fewer than 500 samples and how many are missing.
print("\nCategories cần thêm data (< 500 samples):")
for cat, count in category_counts.items():
    if count >= 500:
        continue
    missing = 500 - count
    print(f"  {cat}: {count} (cần thêm ~{missing})")

In [None]:
# TF-IDF Vectorization
# Word 1- to 3-grams, capped at 5000 features; terms seen in fewer than 2
# documents or in more than 95% of them are dropped; term frequency is
# log-scaled (sublinear_tf).
# NOTE(review): the vectorizer is fitted on every sample in `texts`, and the
# train/test split is applied to the resulting matrix afterwards, so the
# vocabulary/IDF weights appear to include held-out texts - confirm whether
# fitting inside a Pipeline on the training part only is wanted.
TFIDF_PARAMS = dict(
    max_features=5000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
)

vectorizer = TfidfVectorizer(**TFIDF_PARAMS)
X = vectorizer.fit_transform(texts)

learned_features = vectorizer.get_feature_names_out()
print(f"Feature matrix shape: {X.shape}")
print(f"Number of features: {len(learned_features)}")

In [None]:
# Encode labels
# Learn the category -> integer id mapping, then encode every label with it.
# `label_encoder.classes_` holds the category names in id order.
label_encoder = LabelEncoder().fit(labels)
y = label_encoder.transform(labels)

print(f"Classes: {label_encoder.classes_}")

In [None]:
# Train/Test split
# Hold out 20% of the samples for testing, stratified on the label so both
# parts keep the class proportions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Train: {n_train}, Test: {n_test}")

In [None]:
# Compare several models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers, all scored the same way below.
# - The SVCs are built without probability=True: the F1 scorer only calls
#   predict(), and enabling probability estimates adds an internal 5-fold
#   calibration to every fit, which only slows the comparison down.
# - RandomForest gets a fixed seed (same value as the train/test split) so
#   the comparison is reproducible between runs.
models = {
    'SVM Linear': SVC(kernel='linear', class_weight='balanced'),
    'SVM RBF': SVC(kernel='rbf', class_weight='balanced'),
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(
        n_estimators=100, class_weight='balanced', n_jobs=-1, random_state=42
    ),
}

print("Comparing models with 5-fold CV:\n")
results = []

for name, clf in models.items():
    # cross_val_score clones the estimator, so the objects in `models`
    # themselves stay unfitted.
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1_weighted')
    results.append({
        'Model': name,
        'Mean F1': scores.mean(),
        'Std': scores.std()
    })
    print(f"{name:20} F1: {scores.mean():.2%} (+/- {scores.std():.2%})")

# Visualize: horizontal bars sorted so the best model ends up on top,
# with the fold-to-fold standard deviation as error bars.
results_df = pd.DataFrame(results).sort_values('Mean F1', ascending=True)
plt.figure(figsize=(10, 5))
plt.barh(results_df['Model'], results_df['Mean F1'], xerr=results_df['Std'], color='steelblue')
plt.xlabel('F1 Score (weighted)')
plt.title('Model Comparison')
plt.xlim(0, 1)
plt.tight_layout()
plt.show()

In [None]:
# Hyperparameter tuning with GridSearchCV
from sklearn.model_selection import GridSearchCV

# Parameters to search.
# Two separate grids: `gamma` only affects the rbf kernel, so putting it in
# one shared grid would fit every linear candidate once per gamma value with
# identical results.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]

# Grid search with 5-fold cross-validation.
# probability=True is kept because the prediction cell calls predict_proba
# on the selected model; random_state makes that probability calibration
# repeatable between runs.
grid_search = GridSearchCV(
    SVC(probability=True, class_weight='balanced', random_state=42),
    param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=1
)

print("Searching best parameters...")
grid_search.fit(X_train, y_train)

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.2%}")

# Use the best model (refitted on the whole training split by GridSearchCV)
model = grid_search.best_estimator_

In [None]:
# Evaluate
# Predict the held-out split and report overall accuracy plus per-class metrics.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Pin the report to every encoded class id. Without `labels`, the report only
# covers classes that occur in y_test/y_pred, and passing the full
# `target_names` list then fails when a rare class is absent from the split.
all_class_ids = np.arange(len(label_encoder.classes_))

print(f"Accuracy: {accuracy:.2%}")
print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=all_class_ids,
    target_names=label_encoder.classes_,
))

In [None]:
# Cross-validation
# 5-fold CV of the selected model on X and y. No `scoring` is passed, so the
# estimator's own default scorer is used.
# NOTE(review): X/y here are the unsplit arrays, so the folds include the
# rows that also form the test split - confirm this is intended.
cv_scores = cross_val_score(model, X, y, cv=5)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f"Cross-validation: {cv_mean:.2%} (+/- {cv_std:.2%})")

In [None]:
# Confusion Matrix
# Heatmap of actual (rows) vs. predicted (columns) classes on the test split.
plt.figure(figsize=(14, 12))
# Request one row/column per encoded class id. Without `labels`, the matrix
# only covers classes present in y_test/y_pred and would no longer line up
# with the tick labels below when a class is missing from the split.
class_ids = np.arange(len(label_encoder.classes_))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Test predictions
def predict(text, top_k=3):
    """Print the most probable categories for one raw transaction text.

    The text is cleaned with ``preprocess_text``, vectorized with the fitted
    ``vectorizer`` and scored with ``model.predict_proba``.

    Args:
        text: Raw transaction description.
        top_k: How many of the highest-probability categories to print.

    Returns:
        None. Output is printed (keeps notebook cells free of an echoed value).
    """
    processed = preprocess_text(text)
    X_new = vectorizer.transform([processed])
    proba = model.predict_proba(X_new)[0]
    top_idx = np.argsort(proba)[::-1][:top_k]

    # Columns of predict_proba are ordered like model.classes_ (the encoded
    # ids the model was trained on), so map column index -> encoded id first
    # instead of assuming the two are the same number.
    top_ids = model.classes_[top_idx]
    top_cats = label_encoder.inverse_transform(top_ids)

    print(f"Input: {text}")
    print(f"Preprocessed: {processed}")
    for cat, conf in zip(top_cats, proba[top_idx]):
        print(f"  {cat}: {conf:.2%}")
    print()

# Test
predict("ăn phở sáng 50k")
predict("đi grab về nhà")
predict("mua quần áo shopee")
predict("tiền điện tháng 12")
predict("lương tháng 1")
predict("cà phê với bạn")

In [None]:
# === EXPORT FOR THESIS/REPORT ===
# Prints a plain-text summary of dataset, preprocessing, features, model and
# results. Configuration values are read from the fitted objects rather than
# repeated as literals, so the report cannot drift from what was actually run.

print("=" * 60)
print("THÔNG TIN CHO ĐỒ ÁN")
print("=" * 60)

# 1. Dataset Overview
print("\n1. TỔNG QUAN DATASET")
print("-" * 40)
print(f"   Tổng số mẫu:     {len(df):,}")
print(f"   Số categories:   {len(category_counts)}")
print(f"   Ngôn ngữ:        Tiếng Việt")

# 2. Category Distribution Table
print("\n2. PHÂN BỐ DỮ LIỆU THEO CATEGORY")
print("-" * 40)
print(summary.to_string())

# 3. Preprocessing
print("\n3. TIỀN XỬ LÝ VĂN BẢN")
print("-" * 40)
print("   - Unicode normalization (NFC)")
print("   - Chuyển lowercase")
print("   - Chuẩn hóa teencode (k→không, dc→được, cf→cafe, ...)")
print("   - Sửa lỗi chính tả (grap→grab, shoppee→shopee, ...)")
print("   - Loại bỏ ký tự đặc biệt, giữ tiếng Việt")

# 4. Feature Extraction
# max_features may be None and max_df may be an int count instead of a
# fraction, so format each defensively.
max_features = vectorizer.max_features
max_features_str = f"{max_features:,}" if max_features is not None else "None"
max_df = vectorizer.max_df
max_df_str = f"{max_df:.0%}" if isinstance(max_df, float) else str(max_df)

print("\n4. TRÍCH XUẤT ĐẶC TRƯNG (TF-IDF)")
print("-" * 40)
print(f"   N-gram range:    {vectorizer.ngram_range}")
print(f"   Max features:    {max_features_str}")
print(f"   Min document freq: {vectorizer.min_df}")
print(f"   Max document freq: {max_df_str}")
print(f"   Sublinear TF:    {vectorizer.sublinear_tf}")
print(f"   Số features thực tế: {X.shape[1]:,}")

# 5. Model Info
print("\n5. MÔ HÌNH PHÂN LOẠI")
print("-" * 40)
print(f"   Algorithm:       {type(model).__name__}")
if hasattr(model, 'kernel'):
    print(f"   Kernel:          {model.kernel}")
if hasattr(model, 'C'):
    print(f"   C:               {model.C}")
print(f"   Class weight:    {getattr(model, 'class_weight', None)}")

# 6. Results
# Split ratio is derived from the actual split sizes.
n_train, n_test = X_train.shape[0], X_test.shape[0]
train_pct = round(100 * n_train / (n_train + n_test))

print("\n6. KẾT QUẢ")
print("-" * 40)
print(f"   Train/Test split: {train_pct}/{100 - train_pct}")
print(f"   Accuracy:        {accuracy:.2%}")
print(f"   Cross-validation: {cv_scores.mean():.2%} (+/- {cv_scores.std():.2%})")

# 7. Classification Report
# `labels` pins the report to every encoded class so it still matches
# `target_names` when a class is absent from the test split.
print("\n7. CLASSIFICATION REPORT")
print("-" * 40)
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))

print("=" * 60)

In [None]:
# Top features per category
# For each class, average the TF-IDF rows of its samples and list the ten
# features with the highest mean weight.
feature_names = vectorizer.get_feature_names_out()

print("Top 10 features per category:")
for i, category in enumerate(label_encoder.classes_):
    # Boolean mask of the samples labelled with this class id
    cat_mask = (y == i)
    if not cat_mask.any():
        continue

    # Mean TF-IDF for this category, flattened to 1-D.
    # np.asarray(...).ravel() handles both return types of a sparse mean
    # (np.matrix or plain ndarray); `.A1` exists only on np.matrix.
    cat_tfidf = np.asarray(X[cat_mask].mean(axis=0)).ravel()
    top_indices = cat_tfidf.argsort()[::-1][:10]

    print(f"\n{category}:")
    for idx in top_indices:
        print(f"  {feature_names[idx]}: {cat_tfidf[idx]:.4f}")

In [None]:
# Save model (optional - download to local)
import joblib
# `files` must be imported here: the only other import of it in this notebook
# is commented out in the upload cell, so files.download would raise NameError.
from google.colab import files

# Output file name -> fitted object. All three are needed together to
# reproduce predictions (vectorize -> classify -> decode label).
artifacts = {
    'tfidf_vectorizer.joblib': vectorizer,
    'svm_classifier.joblib': model,
    'label_encoder.joblib': label_encoder,
}

for filename, obj in artifacts.items():
    joblib.dump(obj, filename)

# Download
for filename in artifacts:
    files.download(filename)