In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import AgglomerativeClustering

# Step 1: Read the raw table from disk
data = pd.read_csv("hitters.csv")

# Step 2: Preprocessing
# Keep only the rows that carry a CRuns value; CRuns is the column picked
# as the clustering feature in Step 3.
data = data.dropna(subset=['CRuns'])

# When a 'Name' column exists and holds Python objects (typically strings),
# overwrite it with the integer codes produced by LabelEncoder.
if 'Name' in data.columns:
    if data['Name'].dtype == 'object':
        le = LabelEncoder()
        data['Name'] = le.fit_transform(data['Name'])

# Step 3: Pick the feature column(s)
# A two-column selection such as data[['CRuns', 'CHits']] can be used
# instead when that column is available.
X = data[['CRuns']]

# Step 4: Standardize the selected feature(s) before clustering
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Step 5: Draw the dendrogram
# Create the figure and its single axes up front, compute the Ward linkage
# over the scaled features, then let scipy plot it on the current axes.
fig, ax = plt.subplots(figsize=(10, 5))
ward_links = sch.linkage(X_scaled, method='ward')
dendrogram = sch.dendrogram(ward_links)
ax.set_title('Dendrogram for Hitters')
ax.set_xlabel('Players')
ax.set_ylabel('Euclidean Distances')
plt.show()

# Step 6: Apply Agglomerative Clustering
# Ward linkage only works with Euclidean distance, which is also the
# estimator's default, so the distance keyword is omitted: the clustering is
# unchanged and the call no longer fails on scikit-learn releases that
# predate the `metric` parameter (it was called `affinity` before 1.2).
hc = AgglomerativeClustering(n_clusters=3, linkage='ward')
y_hc = hc.fit_predict(X_scaled)

# Step 7: Add cluster info back to dataset
# fit_predict returns one label per row of X_scaled, which was built from
# the rows of `data`, so the labels line up with the DataFrame rows.
data['Cluster'] = y_hc

# Step 8: Display results
# 'Name' is treated as optional during preprocessing, so only request the
# columns that actually exist instead of raising KeyError when it is absent.
display_cols = [
    col for col in ('Name', 'CRuns', 'Cluster') if col in data.columns
]
print(data[display_cols].head())

# Optional visualization: drawn only when exactly two features were selected
n_features = X.shape[1]
if n_features == 2:
    # First scaled column goes on the x-axis, second on the y-axis;
    # points are coloured by their cluster label.
    first_feature = X_scaled[:, 0]
    second_feature = X_scaled[:, 1]
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(first_feature, second_feature, c=y_hc, cmap='viridis')
    ax.set_xlabel('Career Runs (scaled)')
    ax.set_ylabel('Career Hits (scaled)')
    ax.set_title('Hierarchical Clustering of Players')
    plt.show()
