In [33]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix 


In [5]:
data_df = pd.read_csv('Dataset_CSV/online_sales_dataset.csv')

In [6]:
data_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Discount,PaymentMethod,ShippingCost,Category,SalesChannel,ReturnStatus,ShipmentProvider,WarehouseLocation,OrderPriority
0,221958,SKU_1964,White Mug,38,2020-01-01 00:00,1.71,37039.0,Australia,0.47,Bank Transfer,10.79,Apparel,In-store,Not Returned,UPS,London,Medium
1,771155,SKU_1241,White Mug,18,2020-01-01 01:00,41.25,19144.0,Spain,0.19,paypall,9.51,Electronics,Online,Not Returned,UPS,Rome,Medium
2,231932,SKU_1501,Headphones,49,2020-01-01 02:00,29.11,50472.0,Germany,0.35,Bank Transfer,23.03,Electronics,Online,Returned,UPS,Berlin,High
3,465838,SKU_1760,Desk Lamp,14,2020-01-01 03:00,76.68,96586.0,Netherlands,0.14,paypall,11.08,Accessories,Online,Not Returned,Royal Mail,Rome,Low
4,359178,SKU_1386,USB Cable,-30,2020-01-01 04:00,-68.11,,United Kingdom,1.501433,Bank Transfer,,Electronics,In-store,Not Returned,FedEx,,Medium


In [7]:
data_df['InvoiceDate'] = pd.to_datetime(data_df['InvoiceDate'])

In [12]:
data_df['TotalSales'] = data_df['Quantity'] * data_df['UnitPrice'] * (1 - data_df['Discount'])

In [13]:
last_purchase_date = data_df.groupby('CustomerID')['InvoiceDate'].max().reset_index()
last_purchase_date.columns = ['CustomerID', 'LastPurchaseDate']

In [14]:
last_purchase_date

Unnamed: 0,CustomerID,LastPurchaseDate
0,10001.0,2023-08-26 03:00:00
1,10003.0,2020-08-20 18:00:00
2,10005.0,2024-12-29 01:00:00
3,10008.0,2025-04-13 09:00:00
4,10009.0,2020-09-06 03:00:00
...,...,...
35384,99986.0,2023-09-11 06:00:00
35385,99989.0,2025-08-07 01:00:00
35386,99993.0,2024-01-19 04:00:00
35387,99997.0,2021-11-01 04:00:00


In [15]:
data_df = pd.merge(data_df, last_purchase_date, on='CustomerID', how='left')

In [16]:
data_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Discount,PaymentMethod,ShippingCost,Category,SalesChannel,ReturnStatus,ShipmentProvider,WarehouseLocation,OrderPriority,TotalSales,LastPurchaseDate
0,221958,SKU_1964,White Mug,38,2020-01-01 00:00:00,1.71,37039.0,Australia,0.47,Bank Transfer,10.79,Apparel,In-store,Not Returned,UPS,London,Medium,34.4394,2020-01-01 00:00:00
1,771155,SKU_1241,White Mug,18,2020-01-01 01:00:00,41.25,19144.0,Spain,0.19,paypall,9.51,Electronics,Online,Not Returned,UPS,Rome,Medium,601.425,2020-01-01 01:00:00
2,231932,SKU_1501,Headphones,49,2020-01-01 02:00:00,29.11,50472.0,Germany,0.35,Bank Transfer,23.03,Electronics,Online,Returned,UPS,Berlin,High,927.1535,2020-01-01 02:00:00
3,465838,SKU_1760,Desk Lamp,14,2020-01-01 03:00:00,76.68,96586.0,Netherlands,0.14,paypall,11.08,Accessories,Online,Not Returned,Royal Mail,Rome,Low,923.2272,2020-01-01 03:00:00
4,359178,SKU_1386,USB Cable,-30,2020-01-01 04:00:00,-68.11,,United Kingdom,1.501433,Bank Transfer,,Electronics,In-store,Not Returned,FedEx,,Medium,-1024.578137,NaT


In [19]:
latest_purchase_date = data_df['InvoiceDate'].max()
threshold_date = latest_purchase_date - pd.Timedelta(days=90)
data_df['Churn'] = (data_df['LastPurchaseDate'] < threshold_date).astype(int)

In [20]:
customer_features = data_df.groupby('CustomerID').agg(
    total_spent=('TotalSales', 'sum'),
    purchase_count=('InvoiceNo', 'nunique'),
    avg_discount=('Discount', 'mean')
).reset_index()

In [21]:
customer_features = pd.merge(customer_features, data_df[['CustomerID', 'Churn']].drop_duplicates(), on='CustomerID', how='left')

In [22]:
data_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Discount,PaymentMethod,ShippingCost,Category,SalesChannel,ReturnStatus,ShipmentProvider,WarehouseLocation,OrderPriority,TotalSales,LastPurchaseDate,Churn
0,221958,SKU_1964,White Mug,38,2020-01-01 00:00:00,1.71,37039.0,Australia,0.47,Bank Transfer,10.79,Apparel,In-store,Not Returned,UPS,London,Medium,34.4394,2020-01-01 00:00:00,1
1,771155,SKU_1241,White Mug,18,2020-01-01 01:00:00,41.25,19144.0,Spain,0.19,paypall,9.51,Electronics,Online,Not Returned,UPS,Rome,Medium,601.425,2020-01-01 01:00:00,1
2,231932,SKU_1501,Headphones,49,2020-01-01 02:00:00,29.11,50472.0,Germany,0.35,Bank Transfer,23.03,Electronics,Online,Returned,UPS,Berlin,High,927.1535,2020-01-01 02:00:00,1
3,465838,SKU_1760,Desk Lamp,14,2020-01-01 03:00:00,76.68,96586.0,Netherlands,0.14,paypall,11.08,Accessories,Online,Not Returned,Royal Mail,Rome,Low,923.2272,2020-01-01 03:00:00,1
4,359178,SKU_1386,USB Cable,-30,2020-01-01 04:00:00,-68.11,,United Kingdom,1.501433,Bank Transfer,,Electronics,In-store,Not Returned,FedEx,,Medium,-1024.578137,NaT,0


In [23]:
X = customer_features[['total_spent', 'purchase_count', 'avg_discount']]
y = customer_features['Churn']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
model = LogisticRegression(class_weight='balanced')
model.fit(X_train, y_train)

In [35]:
y_pred = model.predict(X_test)

In [36]:
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.10      0.43      0.17       391
           1       0.96      0.79      0.86      6687

    accuracy                           0.77      7078
   macro avg       0.53      0.61      0.52      7078
weighted avg       0.91      0.77      0.83      7078

Confusion Matrix:
 [[ 168  223]
 [1434 5253]]
