## ML.NET Clustering

Divides shopping mall customers into 5 clusters based on their annual income and spending score.

### NuGet package installation

In [3]:
#r "nuget:Microsoft.ML, 1.4.0"
#r "nuget:XPlot.Plotly, 3.0.1"

### Namespaces

In [4]:
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using XPlot.Plotly;

### Input class definition

In [5]:
/// <summary>
/// Input row for the clustering model: the two customer features read from the CSV file.
/// Field names match the column names declared in the text loader options.
/// </summary>
public class ClusteringData
{
    /// <summary>Annual income of the customer (loaded from CSV column index 3).</summary>
    public float AnnualIncome;

    /// <summary>Spending score of the customer (loaded from CSV column index 4).</summary>
    public float SpendingScore;
}

### Output class definition

In [6]:
/// <summary>
/// Output row read back from the transformed data: the model's prediction columns
/// plus the two input features, so predicted rows can be plotted directly.
/// </summary>
public class ClusteringPrediction
{
    /// <summary>Cluster assigned to the row, bound to the "PredictedLabel" output column.</summary>
    [ColumnName("PredictedLabel")]
    public uint PredictedCluster;

    /// <summary>
    /// Values of the "Score" output column, one entry per cluster.
    /// NOTE(review): presumably the distance from the row to each cluster centroid
    /// (in the sample output the smallest entry corresponds to the predicted cluster) - confirm against the KMeans trainer documentation.
    /// </summary>
    [ColumnName("Score")]
    public float[] Distances;

    /// <summary>Annual income of the customer, carried over from the input row.</summary>
    public float AnnualIncome;

    /// <summary>Spending score of the customer, carried over from the input row.</summary>
    public float SpendingScore;
}

### Read the raw data

In [7]:
// Entry point for all ML.NET operations. No fixed seed is supplied (seed: null).
var mlContext = new MLContext(seed: null);

// Describe the CSV layout: comma separated, first line is a header,
// and only columns 3 and 4 are read (both as single-precision floats).
var readerOptions = new TextLoader.Options();
readerOptions.Separators = new[] { ',' };
readerOptions.HasHeader = true;
readerOptions.Columns = new[]
{
    new TextLoader.Column("AnnualIncome", DataKind.Single, 3),
    new TextLoader.Column("SpendingScore", DataKind.Single, 4),
};

// Load the file into a data view using the layout described above.
var dataView = mlContext.Data.LoadFromTextFile("./Mall_Customers.csv", readerOptions);

### Visualize the raw data

In [8]:
// Table
display(h4("Some data"));

// CreateEnumerable returns a lazy sequence; it is consumed three times below
// (table preview, x values, y values). Materialize it once so the data view
// is read a single time and all three consumers see the same row objects.
// reuseRowObject must stay false here because the rows are kept in a list.
var rawData = mlContext.Data
    .CreateEnumerable<ClusteringData>(dataView, reuseRowObject: false)
    .ToList();
display(rawData.Take(5).ToList());

// Plot: spending score on the x axis, annual income on the y axis.
var rawChart = Chart.Plot(new Graph.Scatter()
    {
        x = rawData.Select(r => r.SpendingScore),
        y = rawData.Select(r => r.AnnualIncome),
        mode = "markers"
    });
var layout = new Layout.Layout() { title = "Raw data"};
rawChart.WithLayout(layout);
rawChart.Width = 500;
rawChart.Height = 500;
rawChart.WithYTitle("Income");
rawChart.WithXTitle("Spending Score");
display(rawChart);

index,AnnualIncome,SpendingScore
0,15,39
1,15,81
2,16,6
3,16,77
4,17,40


### Prepare the data

In [9]:
// Step 1: combine the two numeric input columns into a single "Features" column.
var featureConcatenation = mlContext.Transforms.Concatenate(
    "Features",
    new[] { "AnnualIncome", "SpendingScore" });

// Step 2: KMeans trainer that reads "Features" and produces 5 clusters.
var kMeansTrainer = mlContext.Clustering.Trainers.KMeans(
    featureColumnName: "Features",
    numberOfClusters: 5);

// The pipeline is the concatenation step followed by the trainer.
var pipeline = featureConcatenation.Append(kMeansTrainer);

### Train the model

In [10]:
// Fit the pipeline (feature concatenation followed by KMeans) on the loaded data to obtain the trained model.
var model = pipeline.Fit(dataView);

### Calculate clusters for the training data

In [11]:
// Run the trained model over the same data it was trained on; the result is read back as ClusteringPrediction rows for plotting.
var clusters = model.Transform(dataView);

### Visualize the clusters

In [12]:
// CreateEnumerable returns a lazy sequence. It feeds three separate projections
// below (x, y and marker colour), and clusterScatter is reused by the later
// prediction chart, so without materializing it the transformed data view would
// be re-read for every projection on every render. Materialize it once instead.
// reuseRowObject must stay false here because the rows are kept in a list.
var clusterData = mlContext.Data
    .CreateEnumerable<ClusteringPrediction>(clusters, reuseRowObject: false)
    .ToList();

// One marker per customer, coloured by the cluster the model assigned to it.
var clusterScatter = new Graph.Scattergl()
    {
        x = clusterData.Select(r => r.SpendingScore),
        y = clusterData.Select(r => r.AnnualIncome),
        mode = "markers",
        marker = new Graph.Marker()
        {
            color = clusterData.Select(r => r.PredictedCluster),
            colorscale = "Jet",
            // Colour range spans cluster ids 1..5, matching numberOfClusters: 5 in the pipeline.
            cmin = 1,
            cmax = 5
        }
    };
var clusterChart = Chart.Plot(clusterScatter);
clusterChart.WithLayout(new Layout.Layout() { title = "Clusters"});
clusterChart.Width = 500;
clusterChart.Height = 500;
clusterChart.WithYTitle("Income");
clusterChart.WithXTitle("Spending Score");
clusterChart.WithLegend(false);
display(clusterChart);

### Predict cluster

In [13]:
// Customer to classify: annual income 70, spending score 70.
var clusteringData = new ClusteringData() { AnnualIncome = 70, SpendingScore = 70 };

// Build a prediction engine from the trained model and score the single customer.
var predictionEngine = mlContext.Model.CreatePredictionEngine<ClusteringData, ClusteringPrediction>(model);
var result = predictionEngine.Predict(clusteringData);

// Show the predicted cluster together with the score values and the input features.
display(h4("Prediction"));
display(result);

PredictedCluster,Distances,AnnualIncome,SpendingScore
5,"[ 635.68945, 2047.7524, 3128.1382, 4318.84, 420.61523 ]",70,70


### Visualize predicted cluster

In [14]:
// Echo the prediction computed in the previous step.
display(result);

// Wrap the single prediction in a list so it can be projected like the cluster rows.
var results = new List<ClusteringPrediction> { result };

// Large star-diamond marker, coloured on the same 1..5 "Jet" scale as the cluster markers.
var predictionMarker = new Graph.Marker
{
    color = results.Select(p => p.PredictedCluster),
    colorscale = "Jet",
    size = 20,
    symbol = 22, // Star Diamond
    cmin = 1,
    cmax = 5
};

// Trace holding only the predicted point. No explicit mode is set on this trace.
var predictionScatter = new Graph.Scattergl
{
    x = results.Select(p => p.SpendingScore),
    y = results.Select(p => p.AnnualIncome),
    marker = predictionMarker
};

// Predicted point first, then the existing cluster trace.
var scatters = new List<Graph.Scattergl> { predictionScatter, clusterScatter };

var clusterChart = Chart.Plot(scatters);
clusterChart.WithLayout(new Layout.Layout() { title = "Prediction" });
clusterChart.Width = 500;
clusterChart.Height = 500;
clusterChart.WithXTitle("Spending Score");
clusterChart.WithYTitle("Income");
clusterChart.WithLegend(false);
display(clusterChart);

PredictedCluster,Distances,AnnualIncome,SpendingScore
5,"[ 635.68945, 2047.7524, 3128.1382, 4318.84, 420.61523 ]",70,70
