In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import pickle
import os

# Step 1: Load cleaned data (already encoded)
df = pd.read_csv("../data/cleaned_sales_data.csv")

# Step 2: Recreate the label encoders from the original data
# We need to recreate the encoders to decode the values later
print("Recreating label encoders from original data...")

# Load original data to recreate encoders
original_df = pd.read_csv("../data/sales_data_sample.csv", encoding='Windows-1252')

# Recreate the label encoders
label_encoders = {}
cat_cols = ['COUNTRY', 'CUSTOMERNAME', 'PRODUCTLINE', 'DEALSIZE']

for col in cat_cols:
    le = LabelEncoder()
    le.fit(original_df[col])
    label_encoders[col] = le

# Step 3: Define High-Value Customers (top 25% of SALES)
threshold = np.percentile(df['SALES'], 75)
df['HIGH_VALUE'] = (df['SALES'] >= threshold).astype(int)

# Step 4: Select Features and Target (data is already encoded)
features = [
    'QUANTITYORDERED', 'PRICEEACH', 'ORDERLINENUMBER',
    'QTR_ID', 'MONTH_ID', 'YEAR_ID',
    'PRODUCTLINE', 'COUNTRY', 'DEALSIZE'  # These are already encoded
]

X = df[features]
y = df['HIGH_VALUE']

# Step 5: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 6: Train Decision Tree Classifier
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
clf.fit(X_train, y_train)

# Step 7: Evaluate Model
y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Step 8: Predict All Rows
df['PREDICTED_HIGH_VALUE'] = clf.predict(X)

# Filter high-value predicted customers
high_value_customers = df[df['PREDICTED_HIGH_VALUE'] == 1].copy()
print(f"\n Total High-Value Customers Identified: {len(high_value_customers)}")

# Step 9: Decode the encoded values back to original strings
print("Decoding categorical variables...")

# Decode the encoded values back to original strings
high_value_customers['PRODUCTLINE_DECODED'] = label_encoders['PRODUCTLINE'].inverse_transform(high_value_customers['PRODUCTLINE'].astype(int))
high_value_customers['COUNTRY_DECODED'] = label_encoders['COUNTRY'].inverse_transform(high_value_customers['COUNTRY'].astype(int))
high_value_customers['DEALSIZE_DECODED'] = label_encoders['DEALSIZE'].inverse_transform(high_value_customers['DEALSIZE'].astype(int))
high_value_customers['CUSTOMERNAME_DECODED'] = label_encoders['CUSTOMERNAME'].inverse_transform(high_value_customers['CUSTOMERNAME'].astype(int))

# Top Product Lines (using decoded values)
top_products = (
    high_value_customers['PRODUCTLINE_DECODED']
    .value_counts()
    .head(3)
    .index
    .tolist()
)

# Top Countries (using decoded values)
top_countries = (
    high_value_customers['COUNTRY_DECODED']
    .value_counts()
    .head(3)
    .index
    .tolist()
)

# Deal Size Distribution (using decoded values)
deal_distribution = high_value_customers['DEALSIZE_DECODED'].value_counts()

# Top Customers by Total Sales (using decoded values)
top_customers = (
    high_value_customers
    .groupby('CUSTOMERNAME_DECODED')['SALES']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

# Save high-value customers with decoded values for further use
high_value_customers.to_csv("../data/high_value_customers.csv", index=False)

# Step 10: Final Recommendation Output

print("\n Final Project Insight & Recommendation")
print("------------------------------------------------")
print(f"Total unique customers analyzed     : {df['CUSTOMERNAME'].nunique()}")
print(f"High-value customers identified     : {len(high_value_customers)}")
print(f"Percentage of high-value customers  : {100 * len(high_value_customers) / len(df):.1f}%")

print("\n Interpretation:")
print("We used a Decision Tree classifier to predict the top 25% of customers most likely to generate high revenue.")
print("These high-value customers showed specific preferences for product types and are concentrated in certain countries.")

# Final Strategic Suggestion
print("\n Final Business Recommendation for MVP:")
print("Launch targeted ad campaigns based on the following insights:\n")

# Clear product and country recommendations with proper decoding
print("Top Product Lines for High-Value Customers:")
for i, product in enumerate(top_products, 1):
    print(f"   {i}. {product}")

print("\n Top Countries for High-Value Customers:")
for i, country in enumerate(top_countries, 1):
    print(f"   {i}. {country}")

print("\n Top High-Value Customers by Total Sales:")
for i, (customer, sales) in enumerate(top_customers.head(5).items(), 1):
    print(f"   {i}. {customer}: ${sales:,.2f}")

print("\n Deal Size Distribution:")
for deal_size, count in deal_distribution.items():
    percentage = (count / len(high_value_customers)) * 100
    print(f"   • {deal_size}: {count} customers ({percentage:.1f}%)")

print("\n Suggested Marketing Strategy:")
print("   • Focus advertising budget on the top product lines listed above")
print("   • Prioritize marketing campaigns in the top countries listed above")
print("   • Create region-specific campaigns highlighting popular products in each country")

# Create specific product-country combinations for targeted campaigns
print("\n Specific Campaign Recommendations:")
print("   Based on high-value customer patterns, consider these targeted campaigns:")

# Get product-country combinations from high-value customers using decoded values
product_country_combinations = (
    high_value_customers.groupby(['PRODUCTLINE_DECODED', 'COUNTRY_DECODED'])
    .size()
    .sort_values(ascending=False)
    .head(5)
)

for (product, country), count in product_country_combinations.items():
    print(f"   • Promote '{product}' in {country} ({count} high-value transactions)")

print("\n Ad Strategy Suggestions:")
print("   • Emphasize 'Large' and 'Medium' deal-size benefits in messaging")
print("   • Use personalized promotions based on previous purchases")
print("   • Localize campaigns by region and product interest")
print("   • Focus on customers who prefer specific product-country combinations")

print("\nBy applying these recommendations, MVP can focus its marketing spend where it will deliver the highest ROI.")
print("------------------------------------------------\n")

Recreating label encoders from original data...
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.98      0.96       414
           1       0.93      0.83      0.87       151

    accuracy                           0.94       565
   macro avg       0.93      0.90      0.92       565
weighted avg       0.94      0.94      0.94       565

Confusion Matrix:
 [[404  10]
 [ 26 125]]

 Total High-Value Customers Identified: 652
Decoding categorical variables...

 Final Project Insight & Recommendation
------------------------------------------------
Total unique customers analyzed     : 92
High-value customers identified     : 652
Percentage of high-value customers  : 23.1%

 Interpretation:
We used a Decision Tree classifier to predict the top 25% of customers most likely to generate high revenue.
These high-value customers showed specific preferences for product types and are concentrated in certain countries.

 Final Business Rec