### ML.NET Multiclass Classification

Recognizes whether a piece of text is written in German, English, French, Italian, Romanian or Spanish (listed in the order of the numeric class ids 0–5 used below).

### NuGet package installation

In [1]:
#r "nuget:Microsoft.ML, 1.4.0"
#r "nuget:XPlot.Plotly, 3.0.1"

### Namespaces

In [2]:
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using XPlot.Plotly;

### Input class definition

In [3]:
/// <summary>
/// One input row: a piece of text together with its numeric language class.
/// </summary>
public class MulticlassClassificationData
{
    /// <summary>
    /// Numeric class read from column 0 of the input file and exposed to the
    /// pipeline under the column name "Label".
    /// </summary>
    [LoadColumn(0)]
    [ColumnName("Label")]
    public float LanguageClass;

    /// <summary>
    /// The text to classify, read from column 1 of the input file.
    /// </summary>
    [LoadColumn(1)]
    public string Text;

    /// <summary>
    /// Creates a row that carries only text; <see cref="LanguageClass"/> keeps its default value.
    /// </summary>
    /// <param name="text">The text to classify.</param>
    public MulticlassClassificationData(string text) => Text = text;
}

### Output class definition

In [4]:
/// <summary>
/// Prediction for one piece of text: the predicted numeric class, the per-class
/// scores, and two convenience members that turn them into a language name and
/// a percentage.
/// </summary>
public class MulticlassClassificationPrediction
{
    // Shared by all instances instead of being re-allocated for every prediction.
    // The position of each name is the numeric class id it stands for.
    private static readonly string[] ClassNames = { "German", "English", "French", "Italian", "Romanian", "Spanish" };

    /// <summary>The text that was classified.</summary>
    public string Text;

    /// <summary>Numeric class taken from the "PredictedLabel" column.</summary>
    [ColumnName("PredictedLabel")]
    public float Class;

    /// <summary>Per-class scores taken from the "Score" column.</summary>
    [ColumnName("Score")]
    public float[] Confidences;

    /// <summary>
    /// Name of the predicted language, or "Unknown (value)" when <see cref="Class"/>
    /// is NaN or does not correspond to one of the known class names.
    /// </summary>
    public string PredictedLanguage
    {
        get
        {
            if (float.IsNaN(Class))
            {
                return $"Unknown ({Class})";
            }

            int index = (int)Class;
            return index >= 0 && index < ClassNames.Length
                ? ClassNames[index]
                : $"Unknown ({Class})";
        }
    }

    /// <summary>
    /// Score of the predicted class as a whole-number percentage (truncated),
    /// or 0 when no scores are available.
    /// </summary>
    /// <remarks>
    /// The predicted class is the one with the highest score, so the maximum of
    /// <see cref="Confidences"/> is used. Indexing the score vector with
    /// <see cref="Class"/> would mix two numbering schemes: <see cref="Class"/>
    /// holds the original label value, while the score slots follow the key
    /// order assigned during training, and the two need not coincide.
    /// </remarks>
    public int Confidence
    {
        get
        {
            if (Confidences == null || Confidences.Length == 0)
            {
                return 0;
            }

            float best = Confidences[0];
            for (int i = 1; i < Confidences.Length; i++)
            {
                if (Confidences[i] > best)
                {
                    best = Confidences[i];
                }
            }

            return (int)(best * 100);
        }
    }
}

### Read the raw data

In [5]:
// No seed is supplied, so the context is not pinned to a fixed random seed.
var mlContext = new MLContext();

// Load the training sentences; only the path is given, so the loader's default
// separator/header settings apply.
var trainingData = mlContext.Data.LoadFromTextFile<MulticlassClassificationData>(
    path: "./Sentences_Training.tsv");

### Prepare the data and define the training pipeline

In [6]:
// Learning pipeline:
//   1. Map the numeric "Label" values to keys. Keys are assigned in sorted value
//      order (ByValue) rather than in order of first occurrence in the training
//      file, so that slot i of the score vector and of the per-class metrics
//      corresponds to label value i. The class-name arrays used elsewhere in this
//      notebook rely on that correspondence (assumes the labels are exactly 0..5).
//   2. Turn the "Text" column into a numeric "Features" vector.
//   3. Train an L-BFGS maximum entropy multiclass classifier.
//   4. Map the predicted key back to the original label value.
var pipeline = mlContext.Transforms.Conversion.MapValueToKey(
        "Label",
        keyOrdinality: Microsoft.ML.Transforms.ValueToKeyMappingEstimator.KeyOrdinality.ByValue)
    .Append(mlContext.Transforms.Text.FeaturizeText("Features", "Text"))
    .Append(mlContext.MulticlassClassification.Trainers.LbfgsMaximumEntropy())
    .Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

### Train the model

In [7]:
// Fit the pipeline on the training data. The resulting model is reused below
// for scoring the test set (Transform) and for single predictions (prediction engine).
var model = pipeline.Fit(trainingData);

### Evaluate the model

In [8]:
// Load the test sentences; only the path is given, so the loader's default
// separator/header settings apply.
var testData = mlContext.Data.LoadFromTextFile<MulticlassClassificationData>(
    path: "./Sentences_Test.tsv");

// Score the test set with the trained model.
var scoredData = model.Transform(testData);

// Compute the multiclass quality metrics on the scored rows and show them.
var qualityMetrics = mlContext.MulticlassClassification.Evaluate(scoredData);
display(qualityMetrics);

LogLoss,LogLossReduction,MacroAccuracy,MicroAccuracy,TopKAccuracy,TopKPredictionCount,PerClassLogLoss,ConfusionMatrix
0.1061367279736205,0.940763892992148,0.9824893988572754,0.9824865730393302,0,0,"[ 0.17754890146426386, 0.05881043090926731, 0.07523981505497156, 0.1630929051034134, 0.09504758946231957, 0.06692201731154722 ]","{ Microsoft.ML.Data.ConfusionMatrix: PerClassPrecision: [ 0.9950525664811379, 0.9913896676011213, 0.9905069682892345, 0.9410080183276059, 0.9838, 0.9957515678737608 ], PerClassRecall: [ 0.9654, 0.99000199960008, 0.9861250754071988, 0.9856028794241152, 0.9836032793441312, 0.9842031593681264 ], Counts: [ [ 4827, 34, 15, 111, 12, 1 ], [ 9, 4951, 5, 32, 2, 2 ], [ 7, 7, 4904, 32, 13, 10 ], [ 4, 0, 16, 4929, 46, 6 ], [ 1, 1, 4, 74, 4919, 2 ], [ 3, 1, 7, 60, 8, 4922 ] ], NumberOfClasses: 6 }"


### Visualize logarithmic loss per class

In [12]:
// Bar labels, one per class.
// NOTE(review): this assumes the entries of PerClassLogLoss come in the same order
// as these names — confirm against the key order produced by the pipeline.
var classNames = new[] { "German", "English", "French", "Italian", "Romanian", "Spanish" };

// One bar per language: x holds the names, y the per-class logarithmic loss.
var graph = new Graph.Bar
{
    x = classNames,
    y = qualityMetrics.PerClassLogLoss
};

var layout = new Layout.Layout
{
    title = "Logarithmic Loss per Language (less is better)"
};

// Build the chart from the bar trace, attach the title, and render it.
var chart = Chart.Plot(graph);
chart.WithLayout(layout);

display(chart);

### Predict language

In [10]:
// Prediction engine that maps one input row to one prediction using the trained model.
var predictionEngine = mlContext.Model
    .CreatePredictionEngine<MulticlassClassificationData, MulticlassClassificationPrediction>(model);

// First sample sentence.
var firstSample = new MulticlassClassificationData("Ceci n'est pas une pipe");
var prediction = predictionEngine.Predict(firstSample);
display(prediction);

// Second sample sentence; the same variable is reused for the result.
var secondSample = new MulticlassClassificationData("Guten Morgen liebe Freunde");
prediction = predictionEngine.Predict(secondSample);
display(prediction);

PredictedLanguage,Confidence,Text,Class,Confidences
French,86,Ceci n'est pas une pipe,2,"[ 0.031047437, 0.027249021, 0.86063504, 0.030808335, 0.03533128, 0.014928936 ]"


PredictedLanguage,Confidence,Text,Class,Confidences
German,81,Guten Morgen liebe Freunde,0,"[ 0.8122419, 0.05640031, 0.022422126, 0.051359497, 0.027108163, 0.030468086 ]"
