In [22]:
#r "nuget:Microsoft.ML"
#r "nuget:Microsoft.ML.Recommender"
#r "nuget:Microsoft.Data.Analysis"
#r "nuget:SandDance.InteractiveExtension"
#r "nuget:CsvHelper"

In [23]:
using System.Text.Json.Serialization;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using System;
using System.IO;
using System.Globalization;
using Microsoft.Data.Analysis;
using SandDance.InteractiveExtension;
using Microsoft.DotNet.Interactive.Formatting.TabularData;
using CsvHelper;

In [24]:
/// <summary>
/// Invokes an ML.NET <see cref="ValueGetter{T}"/> and returns the value it produced.
/// </summary>
/// <typeparam name="T">Type of the value the getter writes.</typeparam>
/// <param name="valueGetter">Getter to invoke; may be <c>null</c>.</param>
/// <returns>
/// The value written by the getter, or <c>default</c> when <paramref name="valueGetter"/> is <c>null</c>.
/// </returns>
static T? GetValue<T>(ValueGetter<T>? valueGetter)
{
    if (valueGetter is null)
    {
        return default;
    }

    // The getter fills the variable through its ref parameter.
    T? result = default;
    valueGetter(ref result!);
    return result;
}

/// <summary>
/// Reads every row of <paramref name="source"/> into memory and wraps the result in a
/// <see cref="TabularDataResource"/> (one dictionary per row, keyed by column name).
/// </summary>
/// <param name="source">Data view whose schema and rows are copied.</param>
/// <returns>A tabular resource holding one field per schema column and one entry per row.</returns>
/// <remarks>
/// Text values (<see cref="ReadOnlyMemory{T}"/> of <c>char</c>) are converted to <see cref="string"/>.
/// Column names must be unique: a duplicate name makes the dictionary construction throw.
/// </remarks>
static TabularDataResource ToTabularDataResource(this IDataView source)
{
    var fields = source.Schema.ToDictionary(column => column.Name, column => column.Type.RawType);
    var data = new List<Dictionary<string, object?>>();

    // The cursor is disposable; release it even when reading a row throws.
    using (var cursor = source.GetRowCursor(source.Schema))
    {
        // Column types are only known at run time, so the generic GetGetter<T> is bound via
        // reflection. This is done once per column here instead of once per column per row.
        var getGetterDefinition = cursor.GetType().GetMethod(nameof(cursor.GetGetter));
        var columnGetters = new List<(string Name, object? Getter)>();

        foreach (var column in source.Schema)
        {
            var getter = getGetterDefinition
                ?.MakeGenericMethod(column.Type.RawType)
                .Invoke(cursor, new object[] { column });

            columnGetters.Add((column.Name, getter));
        }

        while (cursor.MoveNext())
        {
            var rowObj = new Dictionary<string, object?>(columnGetters.Count);

            foreach (var (name, getter) in columnGetters)
            {
                // dynamic dispatch lets the runtime infer T of GetValue<T> from the getter's type.
                object? value = GetValue((dynamic)getter!);

                if (value is ReadOnlyMemory<char> text)
                {
                    value = text.ToString();
                }

                rowObj.Add(name, value);
            }

            data.Add(rowObj);
        }
    }

    var schema = new TableSchema();

    foreach (var (fieldName, fieldType) in fields)
    {
        schema.Fields.Add(new TableSchemaFieldDescriptor(fieldName, fieldType.ToTableSchemaFieldType()));
    }

    return new TabularDataResource(schema, data);
}










In [25]:
/// <summary>
/// Writes <paramref name="ratings"/> as a CSV file (a header row followed by one record per item)
/// into the "Data/Result" folder under the current working directory.
/// </summary>
/// <typeparam name="T">Record type; the header is generated from this type.</typeparam>
/// <param name="fileName">Name of the file to create inside the output folder. An existing file is overwritten.</param>
/// <param name="ratings">Records to write.</param>
/// <returns>The full path of the written file.</returns>
async Task<string> WriteCsv<T>(string fileName,List<T> ratings)
{
    var outputDir = Path.Combine(Directory.GetCurrentDirectory(), "Data", "Result");

    // CreateDirectory does nothing when the directory already exists, so no Exists check is needed.
    Directory.CreateDirectory(outputDir);

    var outputPath = Path.Combine(outputDir, fileName);

    // Scoped disposal guarantees the file handle is released even when a write fails.
    // InvariantCulture keeps the delimiter and number format independent of the machine's
    // regional settings, so the file can be read back with a fixed ',' separator.
    await using (var stream = new FileStream(outputPath, FileMode.Create))
    await using (var writerStream = new StreamWriter(stream, Encoding.Unicode))
    await using (var csvWriter = new CsvWriter(writerStream, CultureInfo.InvariantCulture))
    {
        csvWriter.WriteHeader<T>();
        await csvWriter.NextRecordAsync();
        await csvWriter.WriteRecordsAsync(ratings);
        await csvWriter.FlushAsync();
    }

    // Reported only after the file has been flushed and closed.
    Console.WriteLine("Ghi file thành công");
    return outputPath;
}

In [26]:
/// <summary>
/// One row of the input review file. Each property is bound to a column of the text file
/// through its <c>LoadColumn</c> index.
/// </summary>
public class AmazonRating
{
    /// <summary>Value loaded from column 0.</summary>
    [LoadColumn(0)]
    public string UserId { get; set; }

    /// <summary>Value loaded from column 1.</summary>
    [LoadColumn(1)]
    public string Asin { get; set; }

    /// <summary>Value loaded from column 2.</summary>
    [LoadColumn(2)]
    public string ParentAsin { get; set; }

    /// <summary>Value loaded from column 3.</summary>
    [LoadColumn(3)]
    public string Title { get; set; }

    /// <summary>Value loaded from column 4.</summary>
    [LoadColumn(4)]
    public string Text { get; set; }

    /// <summary>Value loaded from column 5.</summary>
    [LoadColumn(5)]
    public float Rating { get; set; }

    /// <summary>Value loaded from column 6.</summary>
    [LoadColumn(6)]
    public long Timestamp { get; set; }

    /// <summary>Value loaded from column 7.</summary>
    [LoadColumn(7)]
    public int HelpfulVote { get; set; }

    /// <summary>Value loaded from column 8.</summary>
    [LoadColumn(8)]
    public bool VerifiedPurchase { get; set; }
}

In [27]:
// Shared ML.NET context used by every later cell.
var mlContext = new MLContext();

// The input file is expected at <current directory>/Data/Amazon_Fashion.csv.
var fileName = "Amazon_Fashion.csv";
var path = Path.Combine(Directory.GetCurrentDirectory(), "Data", fileName);

// Comma-separated text with a header row; quoted fields and sparse rows are allowed.
var rawData = mlContext.Data.LoadFromTextFile<AmazonRating>(
    path: path,
    hasHeader: true,
    separatorChar: ',',
    allowQuoting: true,
    allowSparse: true);

// reuseRowObject is false, so every yielded row is a separate AmazonRating instance.
var rawDataEnumerable = mlContext.Data.CreateEnumerable<AmazonRating>(rawData, reuseRowObject: false);

Console.WriteLine($"Số dòng trong dữ liệu đầu vào: {rawDataEnumerable.LongCount()}");

Số dòng trong dữ liệu đầu vào: 2500939


In [None]:
// Minimum number of helpful votes a review needs in order to be kept.
var helpfulVoteGreater = 1;
// Minimum number of qualifying reviews a product (Asin) needs in order to be kept.
var userRatingHelpfulVoteGreater = 10;

// NOTE(review): not used in this cell; kept in case another cell references it.
var filterFunc = (AmazonRating rating) => 
        rating.HelpfulVote >= userRatingHelpfulVoteGreater;

// Products that have at least `userRatingHelpfulVoteGreater` reviews passing the helpful-vote filter.
var filteredAsins = from rating in rawDataEnumerable
                    where rating.HelpfulVote >= helpfulVoteGreater
                    group rating by rating.Asin into g
                    where g.Count() >= userRatingHelpfulVoteGreater
                    select g.Key;

// Keep only the most recent review per (UserId, Asin) pair.
// Rows are sorted newest-first before grouping and grouping preserves that order inside each
// group, so the first element of a group is its latest review; no extra rank computation is needed.
// The result is materialized once: it is reused several times below and in later cells, and a
// deferred query would re-read and re-sort the whole input file on every use.
var latestRankedList =
    (from rating in rawDataEnumerable
     join asin in filteredAsins on rating.Asin equals asin
     where rating.HelpfulVote >= helpfulVoteGreater
     orderby rating.Timestamp descending
     group rating by new { rating.UserId, rating.Asin } into g
     select g.First())
    .ToList();

var filteredData = mlContext.Data.LoadFromEnumerable<AmazonRating>(latestRankedList);

Console.WriteLine($"Số bản ghi còn lại sau khi loại bỏ: {latestRankedList.Count}");

In [None]:
/// <summary>
/// Reduced rating record fed to the recommender: only the user, the product and the rating.
/// </summary>
class AmazonRatingInput
{
    /// <summary>Identifier of the user who gave the rating.</summary>
    public string UserId { get; set; }
    /// <summary>Identifier of the rated product.</summary>
    public string Asin { get; set; }
    /// <summary>Rating value; used as the label column when training.</summary>
    public float Rating { get; set; }
}

In [None]:
// Project each filtered review onto the three columns the recommender needs.
var inputEnumerable =
    from rating in latestRankedList
    select new AmazonRatingInput
    {
        UserId = rating.UserId,
        Asin = rating.Asin,
        Rating = rating.Rating
    };

// Expose the projected rows as an IDataView for the ML.NET pipeline.
var inputData = mlContext.Data.LoadFromEnumerable<AmazonRatingInput>(inputEnumerable);

In [None]:
// Shuffle the rows, then hold out 10% of them as the test set.
// NOTE(review): no seed is supplied to MLContext or ShuffleRows, so the split presumably differs between runs.
var shuffledData = mlContext.Data.ShuffleRows(inputData);
var split = mlContext.Data.TrainTestSplit(shuffledData, testFraction: 0.1);

var trainData = split.TrainSet;
var testData = split.TestSet;

// Row counts are obtained by enumerating each set once.
Console.WriteLine($"Số bản ghi train: {mlContext.Data.CreateEnumerable<AmazonRatingInput>(trainData,false).LongCount()}");
// The original last statement had no terminating ';', which is only accepted as the trailing expression of a cell.
Console.WriteLine($"Số bản ghi test: {mlContext.Data.CreateEnumerable<AmazonRatingInput>(testData,false).LongCount()}");

In [None]:
// Column names shared by the key mapping and the trainer options.
var userIdColumn = nameof(AmazonRatingInput.UserId);
var asinColumn = nameof(AmazonRatingInput.Asin);

// Replace the UserId and Asin columns in place (output name == input name) with key-typed values.
var pipeline = mlContext.Transforms.Conversion
    .MapValueToKey(outputColumnName: userIdColumn, inputColumnName: userIdColumn)
    .Append(mlContext.Transforms.Conversion
        .MapValueToKey(outputColumnName: asinColumn, inputColumnName: asinColumn));

// Matrix factorization setup: users are the matrix columns, products the rows, Rating the label.
var options = new MatrixFactorizationTrainer.Options();
options.MatrixColumnIndexColumnName = userIdColumn;
options.MatrixRowIndexColumnName = asinColumn;
options.LabelColumnName = nameof(AmazonRatingInput.Rating);
options.NumberOfIterations = 20;
options.ApproximationRank = 100;

var matrixFactorization = mlContext.Recommendation().Trainers.MatrixFactorization(options);
var trainerEstimator = pipeline.Append(matrixFactorization);

// Fit the key mappings and the recommender on the training set.
ITransformer model = trainerEstimator.Fit(trainData);

In [None]:
Console.WriteLine("Kết quả sau khi chạy: ");

// Score the held-out test set and evaluate the predictions as a regression against "Rating".
var prediction = model.Transform(testData);
var metrics = mlContext.Regression.Evaluate(prediction, labelColumnName: "Rating");

Console.WriteLine($"Root Mean Squared Error : {metrics.RootMeanSquaredError}");
Console.WriteLine($"RSquared: {metrics.RSquared}");

In [None]:
/// <summary>
/// Output record of the prediction engine for a single (user, product) pair.
/// </summary>
class AmazonRatingPrediction
{
    /// <summary>Rating value carried alongside the prediction.</summary>
    public float Rating { get; set; }

    /// <summary>Predicted score; compared against the recommendation threshold by the caller.</summary>
    public float Score { get; set; }
}
/// <summary>
/// Predicts the score for a single (user, product) pair, displays the raw prediction and prints
/// whether the product should be recommended to the user.
/// </summary>
/// <param name="mlContext">ML.NET context used to create the prediction engine.</param>
/// <param name="model">Trained model to score with.</param>
/// <param name="inputRating">Record carrying the <c>UserId</c> and <c>Asin</c> to score.</param>
void UseModelForSinglePrediction(MLContext mlContext, ITransformer model,AmazonRating inputRating)
{
    // PredictionEngine is IDisposable; release it when the method returns.
    using var predictionEngine = mlContext.Model.CreatePredictionEngine<AmazonRating, AmazonRatingPrediction>(model);

    var resultRatingPrediction = predictionEngine.Predict(inputRating);

    display(resultRatingPrediction);

    // Recommend when the score, rounded to one decimal, reaches 3.
    // A NaN score compares false and therefore takes the "do not recommend" branch.
    var shouldRecommend = Math.Round(resultRatingPrediction.Score, 1) >= 3;

    if (shouldRecommend)
    {
        Console.WriteLine("Sản phẩm  " + inputRating.Asin + " nên gợi ý cho " + inputRating.UserId);
    }
    else
    {
        Console.WriteLine("Sản phẩm " + inputRating.Asin + " không nên gợi ý cho " + inputRating.UserId);
    }
}

In [None]:
var random = new Random();

// Materialize once: counting and indexing a lazy enumerable would re-run the whole
// upstream pipeline for every access.
var filteredDataEnumerable = mlContext.Data.CreateEnumerable<AmazonRatingInput>(filteredData,false).ToList();

var filteredDataCount = filteredDataEnumerable.Count;

if (filteredDataCount == 0)
{
    throw new InvalidOperationException("Không có bản ghi nào sau khi lọc nên không thể chọn mẫu ngẫu nhiên.");
}

// Random.Next(max) excludes max, so passing the count makes every record (including the last) eligible.
// The product and the user are drawn independently, so the pair may never have occurred together in the data.
var randomAsin = filteredDataEnumerable[random.Next(filteredDataCount)].Asin;
var randomUserId = filteredDataEnumerable[random.Next(filteredDataCount)].UserId;
var input = new AmazonRating()
{
    Asin = randomAsin,
    UserId = randomUserId
};
UseModelForSinglePrediction(mlContext,model,input);

In [21]:
// Only visualize when the filtered data set has at most 50000 records.
if (filteredDataCount <= 50000)
{
    // NOTE(review): despite the "train.csv" name, the file receives the whole filtered review set
    // (latestRankedList), not the training split — confirm this is intended.
    // The previously unused materialization of the training set was removed; it cost a full pass over the data.
    var trainCsvPath = await WriteCsv<AmazonRating>("train.csv", latestRankedList.ToList());

    // Read the file back as a DataFrame and open it in SandDance.
    var df = DataFrame.LoadCsv(trainCsvPath);
    df.ToTabularDataResource().ExploreWithSandDance().Display();
}
else
{
    Console.WriteLine("Dữ liệu quá lớn, không thể vẽ biểu đồ");
}

Ghi file thành công
