In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# 1. Initialize Spark Session
spark = SparkSession.builder.appName("CustomerKMeansClustering").getOrCreate()

# 2. Load dataset
file_path = "Customer_Data.csv"  # adjust as needed
df = spark.read.csv(file_path, header=True, inferSchema=True)

# 3. Choose two numeric features to cluster on (e.g. BALANCE and PURCHASES)
features = ["BALANCE", "PURCHASES"]

# 4. Assemble features into a single vector column
#    Nulls must be removed BEFORE assembling: VectorAssembler defaults to
#    handleInvalid="error" and raises as soon as it meets a null input, so a
#    na.drop() placed after transform() can never rescue those rows.
#    The drop is limited to the clustering features so that rows are not lost
#    because of nulls in columns the model never looks at.
assembler = VectorAssembler(inputCols=features, outputCol="rawFeatures")
assembled = assembler.transform(df.na.drop(subset=features))

# 5. Standardize the feature vectors
#    withMean=False keeps the scaling a pure division by the per-feature std,
#    so centroids can later be mapped back by multiplying with scaler_model.std.
scaler = StandardScaler(
    inputCol="rawFeatures",
    outputCol="features",
    withStd=True,
    withMean=False
)
scaler_model = scaler.fit(assembled)
scaled = scaler_model.transform(assembled)

# 6. Elbow Method: compute WSSSE for k = 2…10
#    One KMeans model is fitted per candidate k on the scaled features; the
#    training cost (within-set sum of squared errors) is printed and collected.
wssse_list = []
print("=== Elbow Method (WSSSE) ===")
for k in range(2, 11):
    km = KMeans(featuresCol="features", k=k, seed=42)
    model = km.fit(scaled)
    cost = model.summary.trainingCost
    wssse_list.append((k, cost))
    print(f"k={k:2d}, WSSSE={cost:.3f}")
print("============================\n")

# Draw the elbow curve from the collected (k, WSSSE) pairs so the bend can be
# read off a chart instead of being guessed from the printed numbers.
k_values, wssse_values = zip(*wssse_list)
plt.figure(figsize=(8, 5))
plt.plot(k_values, wssse_values, marker="o")
plt.xticks(k_values)
plt.title("Elbow Method for K-Means")
plt.xlabel("Number of clusters (k)")
plt.ylabel("WSSSE")
plt.tight_layout()
plt.show()

# 7. Cluster count for the final model — chosen by hand from the elbow results
optimal_k = 3  # ← update this after you inspect the printed WSSSE

# 8. Train the final KMeans model and label every row with its cluster
final_params = {"featuresCol": "features", "k": optimal_k, "seed": 42}
km_final = KMeans(**final_params)
model_final = km_final.fit(scaled)
clusters = model_final.transform(scaled)  # adds the "prediction" column

# 9. Preview the two clustering features next to the assigned cluster id
preview_columns = ["BALANCE", "PURCHASES", "prediction"]
print("=== Sample cluster assignments ===")
clusters.select(*preview_columns).show(n=10, truncate=False)

# 10. Pull only the columns needed for plotting into a pandas DataFrame
plot_columns = ["BALANCE", "PURCHASES", "prediction"]
clusters_pd = clusters.select(*plot_columns).toPandas()

# 11. Express the centroids in the original feature units
#     The scaler was configured with withMean=False, i.e. it only divided by
#     the per-feature std, so multiplying by that std undoes the scaling.
feature_std = scaler_model.std.toArray()
centers_scaled = np.array(model_final.clusterCenters())
orig_centers = centers_scaled * feature_std

# 12. Scatter the customers coloured by cluster and overlay the centroids
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=clusters_pd,
    x="BALANCE",
    y="PURCHASES",
    hue="prediction",
    palette="tab10",
    s=50,
    ax=ax,
)
ax.scatter(
    orig_centers[:, 0],
    orig_centers[:, 1],
    marker="X",
    s=200,
    color="red",
    label="Centroids",
)
ax.set_title(f"K-Means Clustering on Customer Data (k={optimal_k})")
ax.set_xlabel("Balance")
ax.set_ylabel("Purchases")
ax.legend(title="Cluster")
fig.tight_layout()
plt.show()

# 13. Release the Spark session now that the analysis is finished
spark.stop()

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

# Start (or reuse) the Spark session for this analysis
spark = SparkSession.builder.appName("PimaRandomForestDecisionTree").getOrCreate()

# Load the CSV: the first row provides the column names, types are inferred
df = spark.read.csv("/content/pima.csv", header=True, inferSchema=True)
df.show(5)
df.printSchema()

# The last column is treated as the target; every column before it is a feature
columns = df.columns
feature_columns, target_column = columns[:-1], columns[-1]
print("Target column:", target_column)

# Spark ML estimators below are configured with labelCol="label", so the
# target column is renamed accordingly
df = df.withColumnRenamed(target_column, "label")

# Pack all feature columns into one vector column and keep only what the
# models need
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(df).select("features", "label")

# 70/30 train/test split; the fixed seed keeps the split repeatable
split_weights = [0.7, 0.3]
train, test = data.randomSplit(split_weights, seed=42)

##############################
# Random Forest Classifier
##############################

# Fit a 100-tree random forest on the training split
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=100,
    seed=42,
)
rf_model = rf.fit(train)

# Score the held-out test split
rf_predictions = rf_model.transform(test)

# Area under the ROC curve; this evaluator is reused for the decision tree
evaluator_auc = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
rf_auc = evaluator_auc.evaluate(rf_predictions)
print("Random Forest AUC:", rf_auc)

# MulticlassMetrics works on an RDD of (prediction, label) float pairs,
# so the two selected columns are converted row by row
rf_pred_rdd = (
    rf_predictions
    .select("prediction", "label")
    .rdd
    .map(lambda pair: (float(pair[0]), float(pair[1])))
)
metrics_rf = MulticlassMetrics(rf_pred_rdd)
positive_label = 1.0  # assumes the positive class is encoded as 1.0
rf_precision = metrics_rf.precision(positive_label)
rf_recall = metrics_rf.recall(positive_label)
rf_accuracy = metrics_rf.accuracy
print("Random Forest Precision:", rf_precision)
print("Random Forest Recall:", rf_recall)
print("Random Forest Accuracy:", rf_accuracy)

# List each feature next to its importance in the fitted forest
print("Random Forest Feature Importances:")
for feature_name, importance in zip(feature_columns, rf_model.featureImportances):
    print(f"{feature_name}: {importance}")



In [None]:

##############################
# Decision Tree Classifier
##############################

# Fit a single decision tree on the same training split
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    seed=42,
)
dt_model = dt.fit(train)

# Score the held-out test split
dt_predictions = dt_model.transform(test)

# Area under the ROC curve, using the evaluator created for the random forest
dt_auc = evaluator_auc.evaluate(dt_predictions)
print("Decision Tree AUC:", dt_auc)

# Build the (prediction, label) float pairs that MulticlassMetrics consumes
dt_pred_rdd = (
    dt_predictions
    .select("prediction", "label")
    .rdd
    .map(lambda pair: (float(pair[0]), float(pair[1])))
)
metrics_dt = MulticlassMetrics(dt_pred_rdd)
positive_label = 1.0  # assumes the positive class is encoded as 1.0
dt_precision = metrics_dt.precision(positive_label)
dt_recall = metrics_dt.recall(positive_label)
dt_accuracy = metrics_dt.accuracy
print("Decision Tree Precision:", dt_precision)
print("Decision Tree Recall:", dt_recall)
print("Decision Tree Accuracy:", dt_accuracy)

# Dump the learned tree as text for debugging / interpretability
print("Decision Tree Model Structure:")
print(dt_model.toDebugString)

# Release the Spark session now that both models have been evaluated
spark.stop()

Random Forest AUC: 0.8238573021181713
Random Forest Precision: 0.6666666666666666
Random Forest Recall: 0.5797101449275363
Random Forest Accuracy: 0.7537688442211056
Random Forest Feature Importances:
preg: 0.07423774049802578
plas: 0.33818285120085156
pres: 0.048528078434346214
skin: 0.03609356770004231
test: 0.051652411396612724
mass: 0.16982738365242017
pedi: 0.0810134543900141
age: 0.2004645127276872
