In [2]:
# ==========================================
# BAGIAN 1: IMPORT LIBRARY & LOAD DATA
# ==========================================
import json
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# ==========================================
# BAGIAN 2: DATA PREPARATION
# ==========================================

# 1. Load data
# The sample export is semicolon-delimited; the path is relative to the
# notebook's working directory.
file_path = 'Data Sampel 22 Januari.csv'
df = pd.read_csv(file_path, delimiter=';')

print(f"Total Rows Raw Data: {len(df)}")

# 2. Filter data
# Keep rows where CloseLost is 'NO' and statusLayanan is anything other than
# 'TIDAK AKTIF'.
is_valid_row = (df['CloseLost'] == 'NO') & (df['statusLayanan'] != 'TIDAK AKTIF')

# Rows without a customer id or a product name cannot be placed in a purchase
# sequence. A missing product name is a float NaN, which would presumably
# break the string-based tokenization applied to these names later on, so
# such rows are removed here, once, for every downstream consumer.
df_filtered = (
    df[is_valid_row]
    .dropna(subset=['idPerusahaan', 'namaProduk'])
    .copy()
)

print(f"Rows after filtering: {len(df_filtered)}")

# 3. Convert the request-creation date to datetime so it sorts chronologically.
# NOTE(review): no explicit format is given — confirm pandas infers the
# day/month order of this export correctly.
df_filtered['tanggalBuatPermohonan'] = pd.to_datetime(df_filtered['tanggalBuatPermohonan'])

# 4. Sort by customer, then by date.
# The per-customer row order produced here is what later defines each
# customer's purchase sequence.
df_sorted = df_filtered.sort_values(by=['idPerusahaan', 'tanggalBuatPermohonan'])

# Keep only the columns needed to build the sequences.
data = df_sorted[['idPerusahaan', 'tanggalBuatPermohonan', 'namaProduk']]

display(data.head())
print(f"Number of unique customers: {data['idPerusahaan'].nunique()}")
print(f"Number of unique products: {data['namaProduk'].nunique()}")

In [None]:
# ==========================================
# BAGIAN 3: SEQUENCE GENERATION
# ==========================================

# 1. Collect each customer's products into a list.
# groupby keeps the within-group row order, so the lists are chronological
# as long as `data` is already sorted by customer and date.
customer_history = data.groupby('idPerusahaan')['namaProduk'].apply(list).reset_index()

# A customer with a single purchase yields no (input, label) pair, so drop them.
customer_history = customer_history[customer_history['namaProduk'].map(len) > 1]
print(f"Customers with > 1 purchase: {len(customer_history)}")

# 2. Tokenization
# Each history is passed as a list, so every product name is one token.
# lower=False keeps the names exactly as they appear in `namaProduk`; the
# Tokenizer default lowercases every token, which would make decoded
# recommendations come back lowercased and would merge names that differ
# only by letter case.
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(customer_history['namaProduk'])
total_products = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding

print(f"Total Unique Products (Vocab Size): {total_products}")

# Convert product names to sequences of integer ids.
sequences = tokenizer.texts_to_sequences(customer_history['namaProduk'])

# 3. Create n-gram (prefix) sequences.
# For [A, B, C] this yields [A, B] and [A, B, C]; the last element of each
# prefix is later split off as the label.
input_sequences = [
    seq[:end]
    for seq in sequences
    for end in range(2, len(seq) + 1)
]

print(f"Total Input Sequences generated: {len(input_sequences)}")
print("Example sequences (tokens):", input_sequences[:5])

In [None]:
# ==========================================
# BAGIAN 4: PADDING & SPLITTING
# ==========================================

# 1. Left-pad every prefix with zeros up to the longest prefix length.
max_sequence_len = max(map(len, input_sequences))
print(f"Max Sequence Length: {max_sequence_len}")

padded_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# 2. The final column is the label (next product); everything before it is the input.
X, y = padded_sequences[:, :-1], padded_sequences[:, -1]

print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

# 3. Hold out 20% of the rows for validation, with a fixed seed for repeatability.
# NOTE(review): the split is row-wise, so prefixes of one customer's history
# can land on both sides — confirm whether a per-customer split is wanted.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")

In [None]:
# ==========================================
# BAGIAN 5: MODEL ARCHITECTURE
# ==========================================

# Next-product classifier: token ids -> embedding -> LSTM -> softmax over products.
model = Sequential([
    # Explicit input of max_sequence_len - 1 token ids (the last token of each
    # padded sequence was split off as the label). This replaces the
    # Embedding `input_length` argument, which recent Keras releases deprecate,
    # and lets model.summary() report concrete shapes.
    Input(shape=(max_sequence_len - 1,), dtype='int32'),

    # input_dim must be total_products (vocabulary size including index 0).
    # mask_zero=True marks index 0 as padding so the LSTM skips the left-pad
    # steps instead of treating them as real purchases.
    Embedding(input_dim=total_products,
              output_dim=64,
              mask_zero=True),

    # Sequence encoder; dropout is applied to the layer inputs.
    LSTM(100, dropout=0.2),

    # One probability per token index.
    Dense(total_products, activation='softmax'),
])

# Labels are integer token ids, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

In [None]:
# ==========================================
# BAGIAN 6: TRAINING
# ==========================================

# Stop once validation loss has not improved for 5 consecutive epochs and
# roll the model back to its best-scoring weights, so the model that gets
# saved and used for inference is not simply the last (possibly overfit) epoch.
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=5,
                               restore_best_weights=True)

# epochs=50 is now an upper bound; `history` holds the per-epoch metrics
# for however many epochs actually ran.
history = model.fit(X_train, y_train,
                    epochs=50,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping],
                    verbose=1)

print("Training Completed.")

In [None]:
# ==========================================
# BAGIAN 7: VISUALIZATION
# ==========================================

def plot_history(history):
    """Draw training vs. validation accuracy and loss in two side-by-side panels.

    Args:
        history: object whose ``history`` mapping provides the per-epoch lists
            'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
    """
    metrics = history.history
    train_acc, valid_acc, train_loss, valid_loss = (
        metrics['accuracy'],
        metrics['val_accuracy'],
        metrics['loss'],
        metrics['val_loss'],
    )
    # Epochs are numbered from 1 on the x-axis.
    epochs = range(1, len(train_acc) + 1)

    _, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: accuracy
    acc_ax.plot(epochs, train_acc, 'b', label='Training acc')
    acc_ax.plot(epochs, valid_acc, 'r', label='Validation acc')
    acc_ax.set_title('Training and validation accuracy')
    acc_ax.legend()

    # Right panel: loss
    loss_ax.plot(epochs, train_loss, 'b', label='Training loss')
    loss_ax.plot(epochs, valid_loss, 'r', label='Validation loss')
    loss_ax.set_title('Training and validation loss')
    loss_ax.legend()

    plt.show()

# Render the accuracy/loss curves for the `history` object returned by model.fit.
plot_history(history)

In [None]:
# ==========================================
# BAGIAN 8: INFERENCE FUNCTION & SAVING
# ==========================================
import json

# Persist the fitted tokenizer so inference can reuse the same product <-> id mapping.
# NOTE(review): to_json() is documented to return a JSON string already, so the
# file holds that string JSON-encoded a second time; a loader would need to
# json.load() the file first and then parse the resulting string — confirm
# against whatever reads tokenizer.json.
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w', encoding='utf-8') as f:
    json.dump(tokenizer_json, f, ensure_ascii=False)
print("Tokenizer saved as tokenizer.json")

# Persist the trained network next to the tokenizer (HDF5 file).
model.save("lstm_product_recommendation.h5")
print("Model saved as lstm_product_recommendation.h5")

def recommend_next_product(customer_id, top_k=3):
    """
    Predict the `top_k` most likely next products for a given customer.

    Reads the notebook-level `df_filtered`, `tokenizer`, `model` and
    `max_sequence_len`.

    Args:
        customer_id: value matched against `df_filtered['idPerusahaan']`.
        top_k: maximum number of recommendations to return; values <= 0
            yield an empty recommendation list.

    Returns:
        A message string when the customer has no rows in `df_filtered`.
        Otherwise a dict with the keys:
            "customer_id": the id that was passed in,
            "history_length": number of products in the customer's history,
            "last_purchases": up to the five most recent product names,
            "recommendations": list of (product_name, probability) tuples,
                most probable first, with probabilities as plain floats.
    """
    # 1. Get the customer's history in chronological order.
    # A stable sort keeps same-date purchases in their original row order;
    # presumably this matches the order the training sequences were built in.
    customer_data = (
        df_filtered[df_filtered['idPerusahaan'] == customer_id]
        .sort_values('tanggalBuatPermohonan', kind='mergesort')
    )

    if len(customer_data) == 0:
        return "Customer ID not found or has no valid purchase history."

    # Missing product names carry no information and are not valid tokens.
    product_history = customer_data['namaProduk'].dropna().tolist()

    # 2. Encode and left-pad/truncate to the length the model was trained on.
    token_list = tokenizer.texts_to_sequences([product_history])[0]
    model_input = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

    # 3. Predict a probability for every token index.
    predicted_probs = model.predict(model_input, verbose=0)[0]

    # Rank indices from most to least probable, keeping only indices that map
    # to a real product. Index 0 is padding and has no entry in index_word;
    # filtering before slicing guarantees it never takes one of the top_k
    # slots, so the caller gets top_k results whenever enough products exist.
    ranked_indices = [
        int(idx)
        for idx in predicted_probs.argsort()[::-1]
        if int(idx) in tokenizer.index_word
    ]
    # max(..., 0) avoids the `[-0:]` pitfall where top_k=0 selects everything.
    top_indices = ranked_indices[:max(top_k, 0)]

    # Decode indices back to product names; cast to float so the result is
    # made of plain Python types.
    recommendations = [
        (tokenizer.index_word[idx], float(predicted_probs[idx]))
        for idx in top_indices
    ]

    return {
        "customer_id": customer_id,
        "history_length": len(product_history),
        "last_purchases": product_history[-5:],  # Show last 5
        "recommendations": recommendations
    }

print("Inference function ready.")

print("Inference function ready.")

In [None]:
# ==========================================
# BAGIAN 9: TEST PREDICTION
# ==========================================

# Pick a random customer among those with more than one purchase.
sample_customer_id = customer_history['idPerusahaan'].sample(1).values[0]

print(f"Testing recommendation for Customer ID: {sample_customer_id}")
result = recommend_next_product(sample_customer_id)

print("\n--- Recommendation Result ---")
if isinstance(result, str):
    # recommend_next_product returns a plain message instead of a dict when
    # the customer has no usable history; show it rather than indexing into it.
    print(result)
else:
    print(f"Customer ID: {result['customer_id']}")
    print(f"History (Last 5): {result['last_purchases']}")
    print("\nPredicted Next Best Products:")
    for rank, (prod, prob) in enumerate(result['recommendations'], start=1):
        print(f"{rank}. {prod} (Confidence: {prob:.2%})")