# ==============================================
# 📊 The AI Roadmap to Profitability:
# Predictive Break-Even Analysis for Smarter Financial Planning
# ==============================================

## 1) Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Install XGBoost
!pip install xgboost
from xgboost import XGBClassifier

import joblib



# Plot styling: apply seaborn's "whitegrid" theme with the "pastel" palette,
# then set the default matplotlib figure size used by the charts below.
sns.set(palette="pastel", style="whitegrid")
plt.rcParams.update({"figure.figsize": (8, 5)})

## 2) Load Dataset

In [None]:
import io

from google.colab import files

# File name this notebook originally expected for the dataset.
DEFAULT_CSV_NAME = "BreakEvenDB.csv.csv"

# files.upload() opens the Colab upload widget and returns a dict that maps
# each uploaded file name to its raw bytes.
uploaded = files.upload()

# Pick the file to load from what was actually uploaded instead of assuming a
# fixed name: the upload may arrive under a different name (for example when
# the same file is uploaded a second time), in which case a hard-coded path
# would either fail or silently read an older copy.
if DEFAULT_CSV_NAME in uploaded:
    csv_name = DEFAULT_CSV_NAME
elif uploaded:
    csv_name = next(iter(uploaded))
else:
    csv_name = None

if csv_name is None:
    # Nothing was uploaded in this run; fall back to a file already present
    # in the working directory, exactly as the original cell did.
    df = pd.read_csv(DEFAULT_CSV_NAME)
else:
    df = pd.read_csv(io.BytesIO(uploaded[csv_name]))

print("Total Rows & Columns of the Dataset:", df.shape)
# Last expression of the cell: the notebook renders the first rows as a table.
df.head()

## 3) Tableau-Style Analysis (Overview)

In [None]:
# Structural overview of the loaded DataFrame: schema, missing values,
# summary statistics and per-column cardinality, followed by a correlation
# heatmap of the numeric columns.
print("\n--- Dataset Info ---")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()
print("\n--- Missing Values ---")
print(df.isnull().sum())
print("\n--- Descriptive Statistics ---")
print(df.describe())
print("\n--- Unique Values in Each Column ---")
print(df.nunique())


# Correlation heatmap (numeric_only=True skips non-numeric columns such as
# the "status" label).
plt.figure(figsize=(10,6))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

## 4) Exploratory Data Analysis (10 Charts)

In [None]:
# ---- Univariate (5 charts) ----
# Four histograms (30 bins, KDE overlay) driven by a small spec table of
# (column, chart title, extra histplot keyword arguments), followed by a
# count plot of the "status" label.
histogram_specs = [
    ("fixed_costs", "Distribution of Fixed Costs", {}),
    ("variable_cost_per_unit", "Distribution of Variable Cost per Unit", {"color": "orange"}),
    ("price_per_unit", "Distribution of Price per Unit", {"color": "green"}),
    ("units_sold", "Distribution of Units Sold", {"color": "purple"}),
]

for column, heading, extra_kwargs in histogram_specs:
    sns.histplot(df[column], bins=30, kde=True, **extra_kwargs)
    plt.title(heading)
    plt.show()

# Number of rows per status value.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` - confirm against the installed version before changing the call.
sns.countplot(data=df, x="status", palette="Set2")
plt.title("Count of Profit/Loss/Break-even")
plt.show()

In [None]:
# ---- Bivariate (3 charts) ----
# Each entry is (seaborn plotting function, its keyword arguments, title).
# The charts are drawn in list order, each on its own figure.
bivariate_specs = [
    (
        sns.scatterplot,
        {"x": "units_sold", "y": "total_revenue", "hue": "status"},
        "Units Sold vs Total Revenue",
    ),
    (
        sns.boxplot,
        {"x": "status", "y": "profit_or_loss", "palette": "Set3"},
        "Profit/Loss Distribution by Status",
    ),
    (
        sns.scatterplot,
        {"x": "marketing_spend", "y": "profit_or_loss", "hue": "status"},
        "Marketing Spend vs Profit/Loss",
    ),
]

for draw_chart, chart_kwargs, heading in bivariate_specs:
    draw_chart(data=df, **chart_kwargs)
    plt.title(heading)
    plt.show()

In [None]:
# ---- Multivariate (2 charts) ----
# Chart 1: pairwise relationships between the key columns, coloured by status,
# with KDE curves on the diagonal.
pairplot_columns = [
    "fixed_costs",
    "variable_cost_per_unit",
    "price_per_unit",
    "units_sold",
    "profit_or_loss",
    "status",
]
sns.pairplot(df[pairplot_columns], hue="status", diag_kind="kde")
# y=1.02 lifts the figure-level title slightly above the grid of subplots.
plt.suptitle("Pairplot of Key Features", y=1.02)
plt.show()

# Chart 2: mean of each cost/price/volume column per status value.
metric_columns = ["fixed_costs", "variable_cost_per_unit", "price_per_unit", "units_sold"]
mean_by_status = df.groupby("status")[metric_columns].mean()
sns.heatmap(mean_by_status, annot=True, cmap="YlGnBu")
plt.title("Average Metrics by Business Status")
plt.show()

## 5) Data Preparation & Preprocessing

In [None]:
# Target is the "status" label; features are every remaining column except
# "profit_or_loss".
# NOTE(review): "profit_or_loss" is presumably dropped because the label is
# derived from it - confirm, and check no other column leaks the label.
y = df["status"]
X = df.drop(columns=["profit_or_loss", "status"])

# Encode target labels as integers; `le` is kept to map predictions back.
le = LabelEncoder()
le.fit(y)
y_enc = le.transform(y)

# Train-Test Split: 80/20, stratified on the encoded label, fixed seed.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y_enc}
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, **split_options)

# Feature Scaling: statistics are learned from the training split only and
# then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6) ML Model Implementation & Evaluation

In [None]:
# ---- Random Forest (Baseline) ----
# Fit a 200-tree forest on the scaled training features and predict the
# encoded status for the held-out rows.
rf_model = RandomForestClassifier(random_state=42, n_estimators=200)
rf_model.fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)

# Decode labels back for evaluation so the reports show the original names.
y_pred_rf_labels = le.inverse_transform(y_pred_rf)
y_test_labels = le.inverse_transform(y_test)

rf_accuracy = accuracy_score(y_test_labels, y_pred_rf_labels)
rf_report = classification_report(y_test_labels, y_pred_rf_labels)
print("\n--- Random Forest Model Evaluation ---")
print("Accuracy:", rf_accuracy)
print("\nClassification Report:\n", rf_report)

# Confusion matrix with rows/columns ordered like the encoder's classes.
rf_class_names = le.classes_
cm_rf = confusion_matrix(y_test_labels, y_pred_rf_labels, labels=rf_class_names)
plt.figure(figsize=(6, 5))
rf_axes = sns.heatmap(
    cm_rf,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=rf_class_names,
    yticklabels=rf_class_names,
)
rf_axes.set_xlabel("Predicted")
rf_axes.set_ylabel("Actual")
rf_axes.set_title("Confusion Matrix - Random Forest")
plt.show()

In [None]:
# ---- XGBoost (Advanced) ----
# Gradient-boosted classifier trained on the same scaled split as the
# Random Forest baseline.
xgb_model = XGBClassifier(random_state=42, objective='multi:softprob')
xgb_model.fit(X_train_scaled, y_train)
y_pred_xgb = xgb_model.predict(X_test_scaled)

# Decode labels back for evaluation.
# This cell reuses `y_test_labels` created in the Random Forest cell.
y_pred_xgb_labels = le.inverse_transform(y_pred_xgb)

xgb_accuracy = accuracy_score(y_test_labels, y_pred_xgb_labels)
xgb_report = classification_report(y_test_labels, y_pred_xgb_labels)
print("\n--- XGBoost Model Evaluation ---")
print("Accuracy:", xgb_accuracy)
print("\nClassification Report:\n", xgb_report)

# Confusion matrix with rows/columns ordered like the encoder's classes.
xgb_class_names = le.classes_
cm_xgb = confusion_matrix(y_test_labels, y_pred_xgb_labels, labels=xgb_class_names)
plt.figure(figsize=(6, 5))
xgb_axes = sns.heatmap(
    cm_xgb,
    annot=True,
    fmt="d",
    cmap="Greens",
    xticklabels=xgb_class_names,
    yticklabels=xgb_class_names,
)
xgb_axes.set_xlabel("Predicted")
xgb_axes.set_ylabel("Actual")
xgb_axes.set_title("Confusion Matrix - XGBoost")
plt.show()

## 7) Save Models, Scaler & Label Encoder as .pkl for Streamlit

In [None]:
# Fitted objects to persist, keyed by output file name (insertion order is
# the order in which they are written and downloaded).
artifacts = {
    "random_forest_model.pkl": rf_model,
    "xgboost_model.pkl": xgb_model,
    "scaler.pkl": scaler,
    "label_encoder.pkl": le,
}

# Write every artifact to the working directory first ...
for pkl_name, fitted_object in artifacts.items():
    joblib.dump(fitted_object, pkl_name)

# Download the files
# ... then trigger a browser download for each one via Colab's `files` helper.
for pkl_name in artifacts:
    files.download(pkl_name)