# Diamonds Predictor Application (Capstone)
This notebook demonstrates the full pipeline:
- EDA & preprocessing
- Classification (clarity)
- Regression (price)
- Clustering (k-means segmentation of the diamonds by their numeric features)


In [None]:
# Standard library.
import os, sys
# Extend the import search path BEFORE the project imports below are executed.
sys.path.append("..")  # if running from notebooks/ and package in parent
# Data handling.
import pandas as pd
import numpy as np
# Project-local pipeline components from the `diamonds` package.
from diamonds.Analyzer import Analyzer
from diamonds.Classifier import Classifier
from diamonds.Regressor import Regressor
from diamonds.Clustering import Clustering
# scikit-learn helpers used by the modelling cells.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Relative path, so it is resolved against the current working directory.
# NOTE(review): the sys.path tweak above assumes the notebook may be run from
# notebooks/; in that case this probably needs to be "../data/diamonds.csv" — confirm.
DATA_PATH = "data/diamonds.csv"  # adjust if different


In [None]:
# Build the Analyzer from the CSV at DATA_PATH; later cells reuse `an`.
an = Analyzer(csv_path=DATA_PATH)

# Report the dimensions of the loaded frame.
dataset_shape = an.df.shape
print("Shape:", dataset_shape)

# Trailing expression: the notebook renders the first rows as the cell output.
an.df.head()


In [None]:
# Drop the "Unnamed: 0" column only when it is actually present. The guard
# mirrors the one used for encoding below and keeps the cell safe to re-run
# (after the first run the column is already gone) or to use with a CSV that
# never had it.
# NOTE(review): "Unnamed: 0" is presumably the index column pandas names when a
# CSV was saved with its index — confirm against the data file.
to_drop = [c for c in ["Unnamed: 0"] if c in an.df.columns]
if to_drop:
    an.drop_columns(to_drop)

# Fixed seed so the row order is the same on every run.
an.shuffle(seed=42)

# encode categorical features (only those present in the frame)
to_encode = [c for c in ["cut", "color", "clarity"] if c in an.df.columns]
an.encode_features(to_encode)
print("Encoded:", to_encode)


In [None]:
# All EDA figures go to one directory; create it up front (no error if it exists).
plot_dir = "outputs/plots"
os.makedirs(plot_dir, exist_ok=True)

# Correlation matrix with annotations enabled; the call's return value is printed.
p1 = an.plot_correlationMatrix(out_dir=plot_dir, annot=True)
print("Saved correlation matrix:", p1)

# Histograms of the categorical columns; the call's return value is printed.
p2 = an.plot_histograms_categorical(out_dir=plot_dir)
print("Saved histograms:", p2)


## Classification: predict `clarity`
We train a RandomForest classifier and evaluate its accuracy and confusion matrix.


In [None]:
# Target is the `clarity` column; every other column is used as a feature.
y = an.df["clarity"]
X = an.df.drop(columns=["clarity"])

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train_s = sc.transform(X_train)
X_test_s = sc.transform(X_test)

# Train the project's Classifier with the "random_forest" model and score it.
clf = Classifier()
clf.fit(X_train_s, y_train, model_name="random_forest")
acc = clf.score(X_test_s, y_test, metric="accuracy")
print("RandomForest accuracy:", acc)

# Predict on the held-out rows and write the confusion matrix figure.
y_pred = clf.predict(X_test_s)
confusion_path = "outputs/plots/confusion_clarity.png"
clf.plot_confusionMatrix(y_test, y_pred, out_path=confusion_path)
print("Confusion matrix saved.")


## Regression: predict `price`
We train a RandomForest regressor and evaluate it with R² and RMSE on a held-out test set.


In [None]:
# Target is `price` cast to float; every other column is used as a feature.
yr = an.df["price"].astype(float)
Xr = an.df.drop(columns=["price"])

# 80/20 hold-out split with a fixed seed for reproducibility.
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    Xr,
    yr,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
sc_r = StandardScaler()
sc_r.fit(Xr_train)
Xr_train_s = sc_r.transform(Xr_train)
Xr_test_s = sc_r.transform(Xr_test)

# Train the project's Regressor with the "random_forest" model.
reg = Regressor()
reg.fit(Xr_train_s, yr_train, model_name="random_forest")

# NOTE(review): the two metric names differ in casing ("r2" vs "RMSE") —
# confirm both spellings are what Regressor.score expects.
r2 = reg.score(Xr_test_s, yr_test, metric="r2")
rmse = reg.score(Xr_test_s, yr_test, metric="RMSE")
print(f"RandomForest R2: {r2:.4f}, RMSE: {rmse:.2f}")


## Clustering
We run k-means (4 clusters) on the standardized numeric features and inspect the cluster sizes.


In [None]:
# Cell-local imports, kept so the cell can run on its own.
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Keep only the numeric columns and standardize them before clustering.
numeric = an.df.select_dtypes(include=["number"])
ns = StandardScaler()
X_num = ns.fit_transform(numeric)

# Fit the project's Clustering wrapper with the "kmeans" model and 4 clusters.
cl = Clustering()
cl.fit(X_num, model_name="kmeans", n_clusters=4)

# Read the per-row labels off the fitted model and count rows per cluster.
labels = cl.model.labels_
cluster_sizes = pd.Series(labels).value_counts()
print("Cluster sizes:\n", cluster_sizes)


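## Save models & final notes
- Saved plots are in `outputs/plots/`.
- No cell above explicitly writes the fitted models to disk; add a persistence step if saved models are required for the submission.
- For the full submission, push the repository to GitHub and include this executed notebook (with outputs).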
