In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
df = pd.read_csv("diamonds.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  53940 non-null  int64  
 1   carat       53940 non-null  float64
 2   cut         53940 non-null  object 
 3   color       53940 non-null  object 
 4   clarity     53940 non-null  object 
 5   depth       53940 non-null  float64
 6   table       53940 non-null  float64
 7   price       53940 non-null  int64  
 8   x           53940 non-null  float64
 9   y           53940 non-null  float64
 10  z           53940 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 4.5+ MB


In [4]:
df.columns

Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',
       'price', 'x', 'y', 'z'],
      dtype='object')

In [5]:
df.describe()

Unnamed: 0.1,Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,26970.5,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,15571.281097,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,1.0,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,13485.75,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,26970.5,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,40455.25,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,53940.0,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


# **Data Preprocessing Step**

In [6]:
df = df.drop(columns=["Unnamed: 0"])

In [7]:
num_cols = ["carat", "depth", "table", "price", "x", "y", "z"]
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

### Handling missing value

In [8]:
bad_summary = {
    "x == 0": (df["x"] == 0).sum(),
    "y == 0": (df["y"] == 0).sum(),
    "z == 0": (df["z"] == 0).sum(),
    "carat <= 0": (df["carat"] <= 0).sum(),
    "price <= 0": (df["price"] <= 0).sum(),
    "depth <= 0": (df["depth"] <= 0).sum(),
    "table <= 0": (df["table"] <= 0).sum(),
}

pd.DataFrame.from_dict(bad_summary, orient="index", columns=["count"])


Unnamed: 0,count
x == 0,8
y == 0,7
z == 0,20
carat <= 0,0
price <= 0,0
depth <= 0,0
table <= 0,0


In [9]:
bad_mask = (
    (df["x"] == 0) |
    (df["y"] == 0) |
    (df["z"] == 0) |
    (df["carat"] <= 0) |
    (df["price"] <= 0)
)

print("Total bad rows:", bad_mask.sum())

Total bad rows: 20


In [10]:
df.loc[bad_mask].head(20)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
2207,1.0,Premium,G,SI2,59.1,59.0,3142,6.55,6.48,0.0
2314,1.01,Premium,H,I1,58.1,59.0,3167,6.66,6.6,0.0
4791,1.1,Premium,G,SI2,63.0,59.0,3696,6.5,6.47,0.0
5471,1.01,Premium,F,SI2,59.2,58.0,3837,6.5,6.47,0.0
10167,1.5,Good,G,I1,64.0,61.0,4731,7.15,7.04,0.0
11182,1.07,Ideal,F,SI2,61.6,56.0,4954,0.0,6.62,0.0
11963,1.0,Very Good,H,VS2,63.3,53.0,5139,0.0,0.0,0.0
13601,1.15,Ideal,G,VS2,59.2,56.0,5564,6.88,6.83,0.0
15951,1.14,Fair,G,VS1,57.5,67.0,6381,0.0,0.0,0.0
24394,2.18,Premium,H,SI2,59.4,61.0,12631,8.49,8.45,0.0


--> **Since the x, y and z is the measure of diamond, so 0 will not never exist on earth, we reegard it as missing value. So we need to handle it.** 

In [11]:
df.drop_duplicates(inplace=True)

In [12]:
bad_xyz = (df["x"] == 0) | (df["y"] == 0) | (df["z"] == 0)

df_before = df.copy()
df_after = df.loc[~bad_xyz].copy()

print("Before shape:", df_before.shape)
print("After shape :", df_after.shape)
print("Dropped rows:", bad_xyz.sum())
print("Dropped %   :", bad_xyz.mean() * 100)


Before shape: (53794, 10)
After shape : (53775, 10)
Dropped rows: 19
Dropped %   : 0.03531992415511024


--> **The missing value is very small, we try to drop it out first.**

In [13]:
df = df_after.copy()

## Outlier

In [14]:
df["log_price"] = np.log1p(df["price"])


In [15]:
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,log_price
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,5.789960
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,5.789960
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,5.793014
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,5.814131
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,5.817111
...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,7.922261
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,7.922261
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,7.922261
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,7.922261


In [16]:
import plotly.express as px

fig1 = px.histogram(df, x="price", title="Distribution of Price")
fig2 = px.histogram(df, x="log_price", title="Distribution of Log(Price)")

fig1.show()
fig2.show()


## 9) Correspondence Analysis (CA)

This section applies **Correspondence Analysis** to pairs of categorical variables to visualize associations.
We will run CA on these contingency tables: 
- cut Ã— color
- cut Ã— clarity
- color Ã— clarity

**Assumption:** your cleaned dataframe is named `df` and contains the categorical columns `cut`, `color`, `clarity`.


In [17]:
def plot_ca(
    ct: pd.DataFrame,
    title: str,
    dim_x: int = 1,
    dim_y: int = 2,
    row_label: str = "cut",       # âœ… legend label for rows
    col_label: str = "color",     # âœ… legend label for columns
    row_point_size: int = 14,
    col_point_size: int = 14,
    size_by_frequency: bool = False,
    min_size: int = 8,
    max_size: int = 28,
    row_symbol: str = "circle",
    col_symbol: str = "star",
    row_color: str = "royalblue",
    col_color: str = "crimson",
    opacity: float = 0.9,
    text_font_size: int = 12
):
    F, G, eig = correspondence_analysis(ct)
    dx, dy = dim_x - 1, dim_y - 1

    inertia = eig / eig.sum()
    xlab = f"Dim {dim_x} ({inertia[dx]*100:.1f}% inertia)"
    ylab = f"Dim {dim_y} ({inertia[dy]*100:.1f}% inertia)"

    # Optional: size by frequency
    if size_by_frequency:
        row_freq = ct.sum(axis=1).to_numpy(dtype=float)
        col_freq = ct.sum(axis=0).to_numpy(dtype=float)
        row_sizes = min_size + (row_freq / row_freq.max()) * (max_size - min_size)
        col_sizes = min_size + (col_freq / col_freq.max()) * (max_size - min_size)
    else:
        row_sizes = row_point_size
        col_sizes = col_point_size

    fig = go.Figure()

    # Row categories (e.g., cut)
    fig.add_trace(go.Scatter(
        x=F[:, dx], y=F[:, dy],
        mode="markers+text",
        text=ct.index.astype(str),
        textposition="top center",
        name=row_label,  # âœ… changed from "Rows"
        marker=dict(
            size=row_sizes, symbol=row_symbol, color=row_color,
            opacity=opacity, line=dict(width=1, color="black")
        ),
        textfont=dict(size=text_font_size)
    ))

    # Column categories (e.g., color)
    fig.add_trace(go.Scatter(
        x=G[:, dx], y=G[:, dy],
        mode="markers+text",
        text=ct.columns.astype(str),
        textposition="top center",
        name=col_label,  # âœ… changed from "Columns"
        marker=dict(
            size=col_sizes, symbol=col_symbol, color=col_color,
            opacity=opacity, line=dict(width=1, color="black")
        ),
        textfont=dict(size=text_font_size)
    ))

    fig.add_hline(y=0)
    fig.add_vline(x=0)

    fig.update_layout(
        title=title,
        xaxis_title=xlab,
        yaxis_title=ylab,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        height=600
    )

    return fig


In [18]:
ct = pd.crosstab(df["cut"], df["color"])
fig = plot_ca(ct, "CA: cut Ã— color", row_label="cut", col_label="color")
fig.show()


NameError: name 'correspondence_analysis' is not defined

In [None]:
ct = pd.crosstab(df["color"], df["clarity"])
fig = plot_ca(ct, "CA: color Ã— clarity", row_label="color", col_label="clarity")
fig.show()


In [None]:
ct = pd.crosstab(df["cut"], df["clarity"])
fig = plot_ca(ct, "CA: cut Ã— clarity", row_label="cut", col_label="clarity")
fig.show()


### How to interpret the CA biplot

- **Points close together** are *more associated* (they co-occur more than expected under independence).
- **Points far from the origin (0,0)** contribute more to the association structure.
- **Row points and column points near each other** indicate stronger association between those categories.
- If most points are near the origin, the two variables are closer to independence.

Tip: In your write-up, describe 2â€“4 strongest associations you see (closest rowâ€“column pairs) and any clear gradient along Dimension 1 or 2.
