# Iris data set

In [1]:
#i "nuget:/Users/peet/Sources/csml/CsML/bin/Release"
// #i "nuget:C:\Users\peet.vermeulen\source\repos\csml\CsML\bin\Release"
#r "nuget:CsML,*-*"
using CsML;
using CsML.Utility;
using CsML.Probability;

### Load the data

In [2]:
// Map the species names in column 4 to numeric class labels so the CSV can
// be loaded into a single double matrix.
var mapping = new Dictionary<int, Dictionary<string, double>>
{
    [4] = new Dictionary<string, double>
    {
        ["versicolor"] = 0,
        ["virginica"] = 1,
        ["setosa"] = 2,
    },
};
string inputPath = "/Users/peet/Sources/csml/Tests/Data/iris.csv";
// string inputPath = @"C:\Users\peet.vermeulen\source\repos\csml\Tests\Data\iris.csv";
// NOTE(review): loadFromRow: 1 presumably skips a header row -- confirm
// against Matrix.FromCSV.
double[,] data = Matrix.FromCSV(inputPath, mapping, loadFromRow: 1);

Separate features and target columns:

In [3]:
// Split the loaded matrix: columns 0-3 are copied into the feature matrix
// and column 4 into the target vector.
int dataLength = data.GetLength(0);
var features = new double[dataLength, 4];
var target = new double[dataLength];
for (int row = 0; row < dataLength; row++)
{
    int col = 0;
    while (col < 4)
    {
        features[row, col] = data[row, col];
        col++;
    }
    target[row] = data[row, 4];
}

### Profile data

In [4]:
// Display the per-class summary of the target vector. The output has three
// tuple items per class (presumably label, count, proportion -- confirm
// against CsML.Utility.Features.ClassProportions).
Features.ClassProportions<double>(target)

index,Item1,Item2,Item3
0,0,50,0.3333333333333333
1,1,50,0.3333333333333333
2,2,50,0.3333333333333333


In [5]:
// Build a profiler from the feature matrix and target vector; it is reused
// by the following cells.
var profiler = new Features.Profiler(features, target);
// Display the result of the outlier check on the feature matrix.
profiler.HasOutliers(features)

In [6]:
// Display the columns the profiler reports as containing outliers
// (presumably zero-based column indices -- confirm against CsML).
profiler.ColumnsWithOutliers(features)

index,value
0,1


In [7]:
// Display the outlier positions the profiler reports for feature column 1.
// NOTE(review): the second argument (1) looks like the column index the
// values came from -- confirm against Profiler.OutlierIndex.
profiler.OutlierIndex(Matrix.GetColumn(features, 1), 1)

index,value
0,15
1,32
2,33
3,60


### Random classifier

In [27]:
// Replace features and target with the shuffled pair returned by
// Features.Shuffle (presumably rows stay aligned with their labels --
// confirm). No seed is passed here, so the order may differ between runs.
(features, target) = Features.Shuffle(features, target);

In [24]:
// Cross-validate the random classifier as a baseline: one accuracy value is
// collected per mask produced by KFoldIterator(dataLength, 10).
var results = new List<double>();
var iter = new KFoldIterator(dataLength, 10);
double[,] ftrain, ftest;
double[] ttrain, ttest;
foreach (bool[] foldMask in iter)
{
    // One progress dot per fold.
    Console.Write(".");
    (ftrain, ftest) = Matrix.Split(features, foldMask);
    (ttrain, ttest) = Array.Split(target, foldMask);
    var classifier = new Classification.RandomClassifier<double>();
    classifier.Train(ftrain, ttrain);
    double[] predicted = classifier.Predict(ftest);
    var foldAccuracy = Array.ClassificationAccuracy(ttest, predicted);
    results.Add(foldAccuracy);
}
// Take the mean before rounding so the reported average keeps full precision;
// only the per-fold listing is rounded to four decimals.
var mean = results.Average();
results = results.ConvertAll(x => Math.Round(x, 4));
Console.WriteLine("");
Console.WriteLine(string.Join(", ", results));
Console.WriteLine($"Average {mean}");

..........
0.2667, 0.2, 0.5333, 0.3333, 0.3333, 0.4667, 0.4, 0.2667, 0.3333, 0.3333
Average 0.3466666666666667


### Single decision tree

In [28]:
// Cross-validate a single Gini-based classification tree and print, for each
// fold, the class proportions of the training split and the test accuracy.
List<double> results = new List<double>();
double[,] ftrain, ftest;
double[] ttrain, ttest;
// Size the folds from the loaded data (dataLength) instead of a hard-coded
// row count, matching the random classifier cell; a literal 150 would stop
// matching features/target if the data set changed.
var iter = new KFoldIterator(dataLength, 10);
int fold = 1;
foreach (bool[] f in iter)
{
    Console.Write($"Fold {fold}: ");
    (ftrain, ftest) = Matrix.Split(features, f);
    (ttrain, ttest) = Array.Split(target, f);
    // Item3 of each ClassProportions tuple, rounded to four decimals, is
    // printed so the class balance of the training split is visible per fold.
    var props = CsML.Utility.Features.ClassProportions<double>(ttrain)
        .Select(x => Math.Round(x.Item3, 4));
    Console.Write(String.Join(",", props));
    var tree = new CsML.Tree.BinaryTree("classify", Statistics.Gini);
    tree.maxdepth = 15;
    tree.minrows = 3;
    tree.Train(ftrain, ttrain);
    double[] predictions = tree.Predict(ftest);
    var accuracy = Array.ClassificationAccuracy(ttest, predictions);
    Console.WriteLine($" Accuracy: {accuracy:0.0000}");
    results.Add(accuracy);
    fold++;
}
// Mean accuracy across all folds.
var mean = results.Average();
Console.WriteLine($"Average {mean:0.0000}");

Fold 1: 0.3407,0.3481,0.3111 Accuracy: 0.9333
Fold 2: 0.3407,0.3333,0.3259 Accuracy: 0.9333
Fold 3: 0.3111,0.3481,0.3407 Accuracy: 0.7333
Fold 4: 0.3481,0.3111,0.3407 Accuracy: 0.8667
Fold 5: 0.3185,0.3407,0.3407 Accuracy: 1.0000
Fold 6: 0.3259,0.3333,0.3407 Accuracy: 1.0000
Fold 7: 0.3407,0.3407,0.3185 Accuracy: 1.0000
Fold 8: 0.3259,0.3333,0.3407 Accuracy: 1.0000
Fold 9: 0.3556,0.3259,0.3185 Accuracy: 1.0000
Fold 10: 0.3259,0.3185,0.3556 Accuracy: 0.9333
Average 0.9400


### Random Forest

In [29]:
// Cross-validate a Gini-based random forest classifier and print, for each
// fold, the class proportions of the training split and the test accuracy.
List<double> results = new List<double>();
double[,] ftrain, ftest;
double[] ttrain, ttest;
// Size the folds from the loaded data (dataLength) instead of a hard-coded
// row count, matching the random classifier cell; a literal 150 would stop
// matching features/target if the data set changed.
var iter = new KFoldIterator(dataLength, 10);
int fold = 1;
foreach (bool[] f in iter)
{
    Console.Write($"Fold {fold}: ");
    (ftrain, ftest) = Matrix.Split(features, f);
    (ttrain, ttest) = Array.Split(target, f);
    // Item3 of each ClassProportions tuple, rounded to four decimals, is
    // printed so the class balance of the training split is visible per fold.
    var props = CsML.Utility.Features.ClassProportions<double>(ttrain)
        .Select(x => Math.Round(x.Item3, 4));
    Console.Write(String.Join(",", props));
    // Named "forest" (not "tree") because this is a RandomForest instance.
    var forest = new CsML.Tree.RandomForest("classify", Statistics.Gini);
    forest.Train(ftrain, ttrain);
    double[] predictions = forest.Predict(ftest);
    var accuracy = Array.ClassificationAccuracy(ttest, predictions);
    Console.WriteLine($" Accuracy: {accuracy:0.0000}");
    results.Add(accuracy);
    fold++;
}
// Mean accuracy across all folds.
var mean = results.Average();
Console.WriteLine($"Average {mean:0.0000}");

Fold 1: 0.3407,0.3481,0.3111 Accuracy: 0.9333
Fold 2: 0.3407,0.3333,0.3259 Accuracy: 0.9333
Fold 3: 0.3111,0.3481,0.3407 Accuracy: 0.8667
Fold 4: 0.3481,0.3111,0.3407 Accuracy: 0.8667
Fold 5: 0.3185,0.3407,0.3407 Accuracy: 1.0000
Fold 6: 0.3259,0.3333,0.3407 Accuracy: 0.9333
Fold 7: 0.3407,0.3407,0.3185 Accuracy: 1.0000
Fold 8: 0.3259,0.3333,0.3407 Accuracy: 1.0000
Fold 9: 0.3556,0.3259,0.3185 Accuracy: 1.0000
Fold 10: 0.3259,0.3185,0.3556 Accuracy: 0.9333
Average 0.9467
