In [44]:
# Import Packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'matplotlib'

In [45]:
# Import Data
data = pd.read_csv("data/online_retail_II.csv")

In [39]:
# Data cleaning
# Trash Types to clean 
trash_codes = [
    'POST', 'D', 'M', 'C2', 'BANK CHARGES', 
    'TEST001', 'TEST002', 'PADS', 'ADJUST', 
    'ADJUST2', 'SP1002', 'DOT', 'CRUK'
]
# fixing data"
data = data[~data['StockCode'].isin(trash_codes)]
# Change datatype to dataframe for Data
data["InvoiceDate"] = pd.to_datetime(data["InvoiceDate"])
# Drop null Customers ID
data = data.dropna(subset=['Customer ID'])
# Returns
data['IsReturn'] = data['Invoice'].astype(str).str.startswith('C')
# Separate value for Purchase and Return Amount
data['PurchaseAmount'] = np.where(data['IsReturn'] == False, data['Quantity'] * data['Price'], 0)
data['ReturnAmount']   = np.where(data['IsReturn'] == True,  (data['Quantity'] * data['Price']).abs(), 0)
# Net Revenue
data['NetRevenue'] = data['PurchaseAmount'] - data['ReturnAmount']
# snapshot date
snapshot_date = data['InvoiceDate'].max() + pd.Timedelta(days=1)
# create values
rfm = data.groupby('Customer ID').agg({
    'InvoiceDate': lambda x: (snapshot_date - x.max()).days, # Recency
    'Invoice': 'nunique',                                   # Frequency
    'NetRevenue': 'sum'                                     # Monetary
})
rfm.columns = ['Recency', 'Frequency', 'Monetary']
# Last 90 days and last 90 days before previous 90 days
cutoff_180 = snapshot_date - pd.Timedelta(days=180)
cutoff_360 = cutoff_180 - pd.Timedelta(days=180)
data_recent = data[data['InvoiceDate'] >= cutoff_180]
data_prev = data[(data['InvoiceDate'] < cutoff_180) & (data['InvoiceDate'] >= cutoff_360)]
# Calulating metrics for 2 dates
recent_metrics = data_recent.groupby('Customer ID').agg({'Invoice': 'nunique', 'NetRevenue': 'sum'})
prev_metrics = data_prev.groupby('Customer ID').agg({'Invoice': 'nunique', 'NetRevenue': 'sum'})
recent_metrics.columns = ['Recent_Frequency', 'Recent_Monetary']
prev_metrics.columns = ['Prev_Frequency', 'Prev_Monetary']
rfm = rfm.join(recent_metrics).join(prev_metrics).fillna(0)
# Creating Delta Values
rfm["Frequency_delta"] = (rfm["Recent_Frequency"] + 1) / (rfm["Prev_Frequency"] + 1)
rfm["Monetary_delta"] = (rfm["Recent_Monetary"] + 1) / (rfm["Prev_Monetary"] + 1)
# Creating Churn Metric
rfm["Churn"] = rfm["Recency"].apply(lambda x: 1 if x > 90 else 0)
# Check Groups
print("(0 = Left, 1 = Stayed):")
print(rfm["Churn"].value_counts())

(0 = Left, 1 = Stayed):
Churn
1    2962
0    2914
Name: count, dtype: int64


In [40]:
# 1. Define X (features) and y (target)
# Remove "Recency" to avoid Data Leakage 
# Same with Churn
X = rfm.drop(['Churn', 'Recency'], axis=1)
y = rfm['Churn']

# 2. 80% Train 20% Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("--- SUMMARY ---")
print(f"All Clients: {len(rfm)}")
print(f"Train: {len(X_train)}")
print(f"Test: {len(X_test)}")
print("\nFeatures (X):")
print(list(X.columns))

--- SUMMARY ---
All Clients: 5876
Train: 4700
Test: 1176

Features (X):
['Frequency', 'Monetary', 'Recent_Frequency', 'Recent_Monetary', 'Prev_Frequency', 'Prev_Monetary', 'Frequency_delta', 'Monetary_delta']


In [42]:
# --- STEP 1: Initialize the Model ---
# n_estimators=100 -> We create a forest of 100 decision trees (voters)
# max_depth=6      -> Limit tree depth to prevent overfitting (memorizing data)
# random_state=42  -> Ensures reproducible results
rf_model = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)

# --- STEP 2: Train the Model (Fit) ---
# The model learns patterns from X_train to predict y_train
print("Training the Random Forest model... Please wait.")
rf_model.fit(X_train, y_train)

# --- STEP 3: Make Predictions ---
# We ask the model to predict Churn for the Test Set (unseen data)
y_pred = rf_model.predict(X_test)

# --- STEP 4: Evaluate Performance ---
# Calculate overall accuracy
accuracy = accuracy_score(y_test, y_pred)

print("-" * 30)
print(f"Model Accuracy: {accuracy:.2%}") # Display as percentage
print("(0 = Left, 1 = Stayed)")
print("-" * 30)
print("Detailed Classification Report:")
print(classification_report(y_test, y_pred))

Training the Random Forest model... Please wait.
------------------------------
Model Accuracy: 89.03%
(0 = Left, 1 = Stayed)
------------------------------
Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.99      0.90       580
           1       0.99      0.79      0.88       596

    accuracy                           0.89      1176
   macro avg       0.91      0.89      0.89      1176
weighted avg       0.91      0.89      0.89      1176



In [43]:
# Importances of features
importances = rf_model.feature_importances_
feature_names = X.columns

# Create Table
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Create Plot
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'], color='skyblue')
plt.xlabel('Importance (0-1)')
plt.ylabel('Feature Name')
plt.title('Co decyduje o odej≈õciu klienta? (Feature Importance)')
plt.gca().invert_yaxis()
plt.show()

print(feature_importance_df)

NameError: name 'plt' is not defined

In [46]:
install matplotlib

SyntaxError: invalid syntax (3255529784.py, line 1)