In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# ---------------------------------------------------------------------------
# High-value classification: label rows by SALES percentile, fit a shallow
# decision tree on the selected columns, report hold-out metrics, then list
# the rows the model flags as high-value.
# ---------------------------------------------------------------------------

# Load the cleaned sales table (path is relative to the working directory).
df = pd.read_csv("../data/cleaned_sales_data.csv")

# Binary target: 1 when SALES is at or above the 75th percentile of the
# SALES column, 0 otherwise.
threshold = np.percentile(df['SALES'], 75)
df['HIGH_VALUE'] = df['SALES'].ge(threshold).astype(int)

# Columns handed to the tree, in this exact order. They are passed to the
# estimator as-is, with no encoding step in this cell.
# NOTE(review): HIGH_VALUE is derived from SALES. If any of these columns
# (e.g. DEALSIZE, PRICEEACH, QUANTITYORDERED) are themselves derived from
# SALES upstream, the reported scores overstate real predictive power --
# TODO confirm against the cleaning step.
features = [
    'QUANTITYORDERED',
    'PRICEEACH',
    'ORDERLINENUMBER',
    'QTR_ID',
    'MONTH_ID',
    'YEAR_ID',
    'PRODUCTLINE',
    'CUSTOMERNAME',
    'COUNTRY',
    'DEALSIZE',
]
X, y = df[features], df['HIGH_VALUE']

# 80/20 hold-out split with a fixed seed so the partition is repeatable.
# NOTE(review): the split is not stratified on y; consider whether the
# class balance of the test fold matters for the comparison being made.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Depth-limited tree; `fit` returns the fitted estimator, so the constructor
# and training call are chained.
clf = DecisionTreeClassifier(random_state=42, max_depth=5).fit(X_train, y_train)

# Hold-out evaluation. `sep="\n"` reproduces the heading / blank line / body
# layout of two consecutive print calls.
y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred), sep="\n")

# Score every row of the table. X contains the training rows as well, so
# these predictions are not purely out-of-sample.
df['PREDICTED_HIGH_VALUE'] = clf.predict(X)

# Keep the rows predicted as high-value, ordered by SALES from largest to
# smallest. This is one entry per row of `df`, not one per customer, so the
# same CUSTOMERNAME value can appear more than once.
top_customers = (
    df.loc[df['PREDICTED_HIGH_VALUE'].eq(1), ['CUSTOMERNAME', 'SALES']]
    .sort_values(by='SALES', ascending=False)
)

print("\nTop Predicted High-Value Customers (sample):", top_customers.head(10), sep="\n")

# Optional export, intentionally left disabled:
# top_customers.to_csv("predicted_high_value_customers.csv", index=False)


Classification Report:

              precision    recall  f1-score   support

           0       0.93      0.98      0.95       414
           1       0.93      0.80      0.86       151

    accuracy                           0.93       565
   macro avg       0.93      0.89      0.91       565
weighted avg       0.93      0.93      0.93       565

Confusion Matrix:

[[405   9]
 [ 30 121]]

Top Predicted High-Value Customers (sample):
      CUSTOMERNAME     SALES
598             81  1.000000
744             60  0.886307
53              33  0.846934
1062            33  0.838611
104             86  0.838523
1995            52  0.827722
44              55  0.819193
1133            56  0.798091
188             58  0.793863
30              31  0.772857
