# Model Analisis dan prediksi Pengeluaran dengan LSTM

In [111]:
!pip install tensorflow scikit-learn pandas matplotlib seaborn

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from google.colab import userdata
import os
import joblib



## Load Data

In [15]:
# Ambil kredensial dari Google Colab Secrets
kaggle_username = userdata.get('kaggle_username')  # Nama pengguna
kaggle_key = userdata.get('kaggle_key')     # Kunci API

# Set variabel lingkungan untuk Kaggle
os.environ['KAGGLE_USERNAME'] = kaggle_username
os.environ['KAGGLE_KEY'] = kaggle_key

In [16]:
!kaggle datasets download -d prasad22/daily-transactions-dataset
!unzip daily-transactions-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/prasad22/daily-transactions-dataset
License(s): other
daily-transactions-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  daily-transactions-dataset.zip
replace Daily Household Transactions.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: Daily Household Transactions.csv  


In [17]:
df = pd.read_csv("Daily Household Transactions.csv")
df.head()

Unnamed: 0,Date,Mode,Category,Subcategory,Note,Amount,Income/Expense,Currency
0,20/09/2018 12:04:08,Cash,Transportation,Train,2 Place 5 to Place 0,30.0,Expense,INR
1,20/09/2018 12:03:15,Cash,Food,snacks,Idli medu Vada mix 2 plates,60.0,Expense,INR
2,19/09/2018,Saving Bank account 1,subscription,Netflix,1 month subscription,199.0,Expense,INR
3,17/09/2018 23:41:17,Saving Bank account 1,subscription,Mobile Service Provider,Data booster pack,19.0,Expense,INR
4,16/09/2018 17:15:08,Cash,Festivals,Ganesh Pujan,Ganesh idol,251.0,Expense,INR


## Preprocessing

In [68]:
# Load and clean data
def load_and_clean(filepath):
    df = pd.read_csv(filepath)
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
    df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)
    df = df.dropna(subset=['date', 'amount', 'income/expense'])
    df['amount'] = pd.to_numeric(df['amount'], errors='coerce')
    df = df[df['income/expense'].str.lower() == 'expense']
    if df['currency'].nunique() == 1:
        df = df.drop(columns='currency')
    df = df.sort_values('date').reset_index(drop=True)
    return df


In [71]:
# Agregate daily expense

def aggregate_daily_expense(df):
    df_daily = df.groupby(df['date'].dt.date)['amount'].sum().reset_index()
    df_daily.columns = ['date', 'total_expense']
    df_daily['date'] = pd.to_datetime(df_daily['date'])
    return df_daily

In [72]:
# Feature engineering

def normalize_series(series):
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(series.values.reshape(-1, 1))
    return scaled.flatten(), scaler

def create_lstm_sequences(data, window_size=7):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i:i+window_size])
        y.append(data[i+window_size])
    return np.array(X), np.array(y)

In [73]:
# Feature Engineering Tambahan
def add_additional_features(df_daily):
    df = df_daily.copy()
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date').reset_index(drop=True)
    df['rolling_7d_mean'] = df['total_expense'].rolling(window=7, min_periods=1).mean()
    df['weekly_expense'] = df['total_expense'].rolling(window=7, min_periods=1).sum().shift(7)
    df['weekly_expense_change_pct'] = (
        (df['total_expense'].rolling(window=7, min_periods=1).sum() - df['weekly_expense'])
        / (df['weekly_expense'] + 1e-6)
    )
    df['weekday'] = df['date'].dt.weekday
    df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)
    df = df.dropna().reset_index(drop=True)
    return df

In [74]:
# Normalisasi & Buat Sequences untuk LSTM
def normalize_series(series):
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(series.values.reshape(-1, 1))
    return scaled.flatten(), scaler

def create_lstm_sequences(data, window_size=7):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i:i+window_size])
        y.append(data[i+window_size])
    return np.array(X), np.array(y)

## Modelling LSTM

In [75]:
# Modelling Prediksi

def build_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(64, input_shape=input_shape))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    return model

def train_lstm_model(X_train, y_train, X_val, y_val, epochs=50):
    model = build_lstm_model(input_shape=(X_train.shape[1], X_train.shape[2]))
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(X_train, y_train, validation_data=(X_val, y_val),
              epochs=epochs, callbacks=[early_stop], verbose=1)
    return model

In [107]:
# Generate Dataset Klasifikasi Boros/Hemat/Normal dengan Fitur Tambahan
def generate_classification_data(df_daily, y_pred_rescaled, y_actual_rescaled, threshold=0.35):
    """
    Menghasilkan DataFrame untuk training classifier.
    Masukkan fitur tambahan dari df_daily dan label berdasarkan threshold perbedaan prediksi dan aktual.
    """
    data = []
    for i in range(len(y_pred_rescaled)):
        pred = y_pred_rescaled[i]
        actual = y_actual_rescaled[i]
        diff = actual - pred
        pct = diff / pred if pred != 0 else 0

        if pct > threshold:
            label = 'Boros'
        elif pct < -threshold:
            label = 'Hemat'
        else:
            label = 'Normal'

        # Ambil fitur tambahan sesuai index offset window_size
        idx = i + 7  # offset window size, sesuaikan jika beda

        row = {
            'predicted_expense': pred,
            'actual_expense': actual,
            'diff': diff,
            'pct_diff': pct,
            'rolling_7d_mean': df_daily.loc[idx, 'rolling_7d_mean'],
            'weekly_expense': df_daily.loc[idx, 'weekly_expense'],
            'weekly_expense_change_pct': df_daily.loc[idx, 'weekly_expense_change_pct'],
            'weekday': df_daily.loc[idx, 'weekday'],
            'is_weekend': df_daily.loc[idx, 'is_weekend'],
            'label': label
        }
        data.append(row)

    df_cls = pd.DataFrame(data)
    return df_cls

In [108]:
# Modelling klasifikasi

def train_classifier(df_cls):
    feature_cols = ['predicted_expense', 'actual_expense', 'diff', 'pct_diff',
                    'rolling_7d_mean', 'weekly_expense', 'weekly_expense_change_pct',
                    'weekday', 'is_weekend']
    X = df_cls[feature_cols]
    y = df_cls['label']

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_val)

    print("Classification Report:")
    print(classification_report(y_val, y_pred))

    return clf, X_val, y_val, y_pred

In [109]:
# Evaluasi Model Klasifikasi dengan Confusion Matrix dan Report
def evaluate_classification_model(y_true, y_pred, labels=None):
    print("Classification Model Evaluation:")
    print(classification_report(y_true, y_pred, labels=labels))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(6,5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()
    return cm

In [110]:
# Prediksi Minggu Depan & Bandingkan Tren Mingguan

from datetime import timedelta

def predict_next_week_expense(daily_df, model, scaler, window_size=7):
    recent_expense = daily_df['total_expense'].values[-window_size:]
    normalized_input = scaler.transform(recent_expense.reshape(-1, 1)).flatten()

    predictions = []
    current_input = normalized_input.copy()

    for _ in range(7):  # prediksi 7 hari
        input_seq = np.array(current_input[-window_size:]).reshape((1, window_size, 1))
        next_pred = model.predict(input_seq, verbose=0)[0][0]
        predictions.append(next_pred)
        current_input = np.append(current_input, next_pred)

    predictions_rescaled = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
    last_date = daily_df['date'].max()
    pred_dates = [last_date + timedelta(days=i+1) for i in range(7)]

    return pd.DataFrame({
        'date': pred_dates,
        'predicted_expense': predictions_rescaled
    })


def compare_weekly_trend(daily_df, pred_df, threshold=0.35):
    last_week_actual = daily_df.tail(7).reset_index(drop=True)
    this_week_pred = pred_df.reset_index(drop=True)

    actual_total = last_week_actual['total_expense'].sum()
    predicted_total = this_week_pred['predicted_expense'].sum()

    diff = predicted_total - actual_total
    pct_change = diff / actual_total if actual_total else 0

    if pct_change > threshold:
        summary = f"⚠️ Minggu ini diperkirakan boros ({pct_change:.1%} lebih tinggi dari minggu lalu)."
        label = 'Boros'
    elif pct_change < -threshold:
        summary = f"✅ Minggu ini diperkirakan lebih hemat ({abs(pct_change):.1%} lebih rendah dari minggu lalu)."
        label = 'Hemat'
    else:
        summary = f"🔄 Minggu ini diperkirakan normal (selisih {pct_change:.1%} dari minggu lalu)."
        label = 'Normal'

    breakdown = []
    for i in range(7):
        actual = last_week_actual['total_expense'].iloc[i] if i < len(last_week_actual) else 0
        pred = this_week_pred['predicted_expense'].iloc[i]
        change = (pred - actual) / actual if actual != 0 else 0
        hari = this_week_pred['date'].iloc[i].strftime("%A")
        sign = "lebih banyak" if change > 0 else "lebih hemat" if change < 0 else "sama"
        breakdown.append(f"{hari}: {abs(change):.1%} {sign} dari minggu lalu.")

    return summary, breakdown, {
        "last_week": last_week_actual,
        "predicted": this_week_pred,
        "label": label,
        "pct_change": pct_change
    }

# Eksekusi prediksi dan evaluasi tren
pred_df = predict_next_week_expense(daily_df, lstm_model, scaler)
summary, breakdown, trend_info = compare_weekly_trend(daily_df, pred_df)

# Tampilkan hasil
print(summary)
for line in breakdown:
    print("-", line)


⚠️ Minggu ini diperkirakan boros (282.9% lebih tinggi dari minggu lalu).
- Friday: 16.7% lebih hemat dari minggu lalu.
- Saturday: 732.3% lebih banyak dari minggu lalu.
- Sunday: 2657.9% lebih banyak dari minggu lalu.
- Monday: 571.3% lebih banyak dari minggu lalu.
- Tuesday: 421.7% lebih banyak dari minggu lalu.
- Wednesday: 6639.1% lebih banyak dari minggu lalu.
- Thursday: 1291.6% lebih banyak dari minggu lalu.


## Training

In [97]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from datetime import timedelta

In [98]:
# Load & preprocessing
df = load_and_clean("Daily Household Transactions.csv")
daily_df = aggregate_daily_expense(df)
daily_df = add_additional_features(daily_df)

# Normalisasi dan buat sequence untuk LSTM (gunakan 'total_expense')
normalized, scaler = normalize_series(daily_df['total_expense'])
daily_df['normalized_expense'] = normalized  # <- ini penting untuk input ke LSTM


In [99]:
# Train-Test Split untuk LSTM
X_lstm, y_lstm = create_lstm_sequences(daily_df['normalized_expense'].values, window_size=7)
X_lstm = X_lstm.reshape((X_lstm.shape[0], X_lstm.shape[1], 1))
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_lstm, y_lstm, test_size=0.2, random_state=42)


In [100]:
# Train LSTM
lstm_model = Sequential([
    LSTM(64, return_sequences=False, input_shape=(X_lstm.shape[1], 1)),
    Dropout(0.2),
    Dense(1)
])
lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
lstm_model.fit(X_train_lstm, y_train_lstm, epochs=30, batch_size=16, validation_data=(X_test_lstm, y_test_lstm), verbose=0)

# LSTM Evaluation
y_pred_lstm = lstm_model.predict(X_test_lstm).flatten()
y_test_inverse = scaler.inverse_transform(y_test_lstm.reshape(-1, 1)).flatten()
y_pred_inverse = scaler.inverse_transform(y_pred_lstm.reshape(-1, 1)).flatten()

  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step


In [104]:
# Evaluasi
print("📈 Evaluation: LSTM Regression Model")
print("MAE :", mean_absolute_error(y_test_inverse, y_pred_inverse))
print("MSE :", mean_squared_error(y_test_inverse, y_pred_inverse))
print("RMSE:", np.sqrt(mean_squared_error(y_test_inverse, y_pred_inverse)))  # Ganti baris ini
print("R2  :", r2_score(y_test_inverse, y_pred_inverse))

📈 Evaluation: LSTM Regression Model
MAE : 1526.7351275651351
MSE : 16634165.958368512
RMSE: 4078.500454624041
R2  : 0.004216586152048896


In [105]:
# Prediksi & Evaluasi Tren Minggu Depan
pred_week_df = predict_next_week_expense(daily_df, lstm_model, scaler)
summary_text, breakdown_list, result_dict = compare_weekly_trend(daily_df, pred_week_df)

print("\n📊 Prediksi Minggu Depan:")
print(summary_text)
for line in breakdown_list:
    print("•", line)



📊 Prediksi Minggu Depan:
⚠️ Minggu ini diperkirakan boros (282.9% lebih tinggi dari minggu lalu).
• Friday: 16.7% lebih hemat dari minggu lalu.
• Saturday: 732.3% lebih banyak dari minggu lalu.
• Sunday: 2657.9% lebih banyak dari minggu lalu.
• Monday: 571.3% lebih banyak dari minggu lalu.
• Tuesday: 421.7% lebih banyak dari minggu lalu.
• Wednesday: 6639.1% lebih banyak dari minggu lalu.
• Thursday: 1291.6% lebih banyak dari minggu lalu.


# Save Model

In [118]:
model_dict = {
    'regression': lstm_model,
    'classification': clf,
    'scaler': scaler
}

joblib.dump(model_dict, 'model_day_LSTM.pkl')
print("Model disimpan ke file model_day.pkl")

Model disimpan ke file model_day.pkl


In [119]:
# Load file model_day.pkl
model_dict = joblib.load('model_day_LSTM.pkl')

# Cek keys
print("Model keys di dalam model_day_LSTM.pkl:", model_dict.keys())

# Cek tipe tiap model
for key, model in model_dict.items():
    print(f"Model '{key}' bertipe: {type(model)}")

Model keys di dalam model_day_LSTM.pkl: dict_keys(['regression', 'classification', 'scaler'])
Model 'regression' bertipe: <class 'keras.src.models.sequential.Sequential'>
Model 'classification' bertipe: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
Model 'scaler' bertipe: <class 'sklearn.preprocessing._data.MinMaxScaler'>


In [120]:
from google.colab import files

files.download('model_day_LSTM.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>