In [None]:
#r "nuget: ExcelDataReader, 3.6.0"
#r "nuget: ExcelDataReader.DataSet, 3.6.0"
#r "nuget: ManuscriptsProcessor, 0.4.0.1"

In [None]:
using ExcelDataReader;
using System.IO;
using Newtonsoft.Json;
using CorpusDraftCSharp;
using System.Data;
using System;

In [None]:
// Make the legacy code-page encodings available to this process.
// NOTE(review): presumably needed because ExcelDataReader decodes some
// workbooks with non-Unicode code pages, which .NET Core does not register
// by default — confirm against the ExcelDataReader documentation.
var codePageProvider = System.Text.CodePagesEncodingProvider.Instance;
System.Text.Encoding.RegisterProvider(codePageProvider);

In [None]:
// The workbook to preprocess is looked up in the "files" subdirectory of the
// current working directory (normally the folder this notebook lives in).
string filename = "new@HR-DAZD-336_1.xlsx";
string fileDirPath = Path.Combine(Environment.CurrentDirectory, "files");

In [None]:
// Load every worksheet of the workbook into a DataSet (one DataTable per
// sheet). The first row of each sheet is used as the column header row.
System.Data.DataSet ds;

// Both the file stream and the Excel reader are IDisposable; stacking the
// using statements releases the reader before the underlying stream.
using (var stream = File.Open(Path.Combine(fileDirPath, filename), FileMode.Open, FileAccess.Read))
using (IExcelDataReader reader = ExcelDataReader.ExcelReaderFactory.CreateReader(stream))
{
    var conf = new ExcelDataSetConfiguration()
    {
        ConfigureDataTable = (tableReader) => new ExcelDataTableConfiguration()
        {
            UseHeaderRow = true
        }
    };

    // AsDataSet materializes all rows, so `ds` stays usable after the
    // reader and stream are closed.
    ds = reader.AsDataSet(conf);
}

In [None]:
// Preview the first worksheet. `Head` is not defined in this notebook;
// judging by the recorded output it shows the first 5 rows.
// NOTE(review): the meaning of the second argument (`true`) is not visible
// here — confirm against the ManuscriptsProcessor package.
var firstSheet = ds.Tables[0];
firstSheet.Head(5, true);

1 mi Noi mi zam. ličn. — — — mn. 1. l. — — — N 1  
2 knez Conte knez im. — — — — jd. — — — m. N 4  
3 zadarski di Zara zadarski prid. — — — — jd. — — — m. N 6  
4 šuci giudice sudac im. — — — — mn. — — — m. N 8  
5 od di od prij. — — — — — — — — — — 9  


In [None]:
// Descriptive data for the document being imported.
string documentName = "Указ князя Анджело Барбаро судьям Бибине и Боканяца";

// Both paths are filled with the placeholder "_".
string googleDocPath = "_";
string filePath = "_";

// Document-level metadata: field name -> single textual value.
Dictionary<string, string> fields = new()
{
    ["Тип памятника"] = "Рукопись",
    ["Объем"] = "1 лист",
    ["Материал"] = "бумага",
    ["Датировка"] = "1727 г.",
    ["Место создания"] = "Задар",
    ["Место хранения"] = "Государственный архив Задара",
    ["Инвентарный номер"] = "HR-DAZD-336",
};

In [None]:
// Create the document with ID "2" and give it a metadata list holding a
// single, initially empty field dictionary.
var doc = new Document("2", documentName, filePath, googleDocPath)
{
    documentMetaData = new() { new Dictionary<string, List<Value>>() }
};


In [None]:
// Copy every metadata field into the document's first metadata dictionary,
// wrapping each string in a one-element list of Value objects.
var docMetaData = doc.documentMetaData[0];
foreach (var (fieldName, fieldValue) in fields) {
    docMetaData[fieldName] = new List<Value> { new Value(fieldValue) };
}

In [None]:
// Name and metadata (script, language) of the Croatian text.
string croatianTextName = "Croatian text";
Dictionary<string, string> croatianTextFields = new()
{
    ["Письмо"] = "глаголица",
    ["Язык"] = "хорватский",
};

// Name and metadata (script, language) of the Italian text.
string italianTextName = "Italian text";
Dictionary<string, string> italianTextFields = new()
{
    ["Письмо"] = "латиница",
    ["Язык"] = "итальянский",
};

In [None]:
// Create the two texts of document "2": text "0" (Croatian) and text "1"
// (Italian), each with a metadata list holding one field dictionary.
var txtCr = new Text("2", "0", croatianTextName, filePath);
var txtIt = new Text("2", "1", italianTextName, filePath);

// Croatian text metadata.
txtCr.textMetaData = new();
txtCr.textMetaData.Add(new Dictionary<string, List<Value>>());
foreach (KeyValuePair<string, string> kv in croatianTextFields) {
    txtCr.textMetaData[0][kv.Key] = new List<Value> {new Value(kv.Value)};
}

// Italian text metadata. This must read from italianTextFields; the previous
// version iterated croatianTextFields here, so the Italian text was labelled
// with the Croatian script and language.
txtIt.textMetaData = new();
txtIt.textMetaData.Add(new Dictionary<string, List<Value>>());
foreach (KeyValuePair<string, string> kv in italianTextFields) {
    txtIt.textMetaData[0][kv.Key] = new List<Value> {new Value(kv.Value)};
}

In [None]:
/// <summary>
/// Languages of the texts in the document. The numeric values are compared
/// against the worksheet index when clauses are assigned to a text.
/// </summary>
public enum Languages
{
    /// <summary>Croatian text (value 0).</summary>
    CROATIAN = 0,

    /// <summary>Italian text (value 1).</summary>
    ITALIAN = 1,
}

In [None]:
// Convert every worksheet into one Clause and attach it to a text.
// Each row becomes a Realization (one token); each character of the token
// becomes a Grapheme; every other non-placeholder cell becomes a field.
for (int i = 0; i < ds.Tables.Count; i++) {
    DataTable table = ds.Tables[i];
    string clauseId = i.ToString();

    // Columns that are passed to the Realization constructor directly and
    // therefore must not be repeated as annotation fields. Built once per
    // sheet instead of once per cell.
    var constructorColumns = new HashSet<string> { "Id", "Token" };

    // The clause text is all tokens of the sheet joined by single spaces.
    var tokens = new List<string>(table.Rows.Count);
    foreach (DataRow row in table.Rows) {
        tokens.Add(row["Token"].ToString());
    }

    // NOTE(review): the literal "0" below (and in the Realization constructor)
    // is used for every sheet, while txtIt was created with text ID "1".
    // If this argument is the text ID, the Italian sheet probably needs "1" —
    // confirm against the Clause/Realization constructor signatures.
    var cls = new Clause("2", "0", filePath, clauseId, string.Join(" ", tokens));

    foreach (DataRow row in table.Rows) {
        string token = row["Token"].ToString();
        var currentRealization = new Realization("2", filePath, "0", clauseId, row["Id"].ToString(), token, token);
        currentRealization.realizationFields = new();
        currentRealization.realizationFields.Add(new Dictionary<string, List<Value>>());

        foreach (DataColumn column in table.Columns) {
            if (constructorColumns.Contains(column.ColumnName)) {
                continue;
            }
            // "—" marks a cell with no value; such cells are not stored.
            string cellText = row[column].ToString();
            if (cellText != "—") {
                currentRealization.realizationFields[0][column.ColumnName] = new List<Value> { new Value(cellText) };
            }
        }

        // One Grapheme per character of the realization's first lexeme.
        for (int j = 0; j < currentRealization.lexemeOne.Length; j++) {
            currentRealization.letters.Add(new Grapheme(currentRealization, j.ToString(), currentRealization.lexemeOne[j].ToString()));
        }
        cls.realizations.Add(currentRealization);
    }

    // The sheet index selects the target text; anything that is not the
    // Italian sheet is stored in the Croatian text.
    switch (i) {
        case (int) Languages.ITALIAN:
            txtIt.clauses.Add(cls);
            break;
        case (int) Languages.CROATIAN:
        default:
            txtCr.clauses.Add(cls);
            break;
    }
}


In [None]:
// Attach both texts to the document, Croatian first, then Italian.
foreach (var text in new[] { txtCr, txtIt }) {
    doc.texts.Add(text);
}

In [None]:
// Serialize the assembled document with the library's Jsonize() method;
// the resulting string is written to disk in the next cell.
string jsonizedDoc = doc.Jsonize();

In [None]:
// Write the serialized document to files/Output/<documentID>_<documentName>.json
// under the current working directory, creating the folder if needed and
// overwriting any existing file of the same name.
string finalFileName = $"{doc.documentID}_{doc.documentName}.json";
string outputDir = Path.Combine(Directory.GetCurrentDirectory(), "files", "Output");
Directory.CreateDirectory(outputDir);
string finalPath = Path.Combine(outputDir, finalFileName);
using (var outputStream = new FileStream(finalPath, FileMode.Create, FileAccess.Write))
using (var sw = new StreamWriter(outputStream)) {
    sw.WriteLine(jsonizedDoc);
}