In [None]:
#r "nuget: ExcelDataReader, 3.6.0"
#r "nuget: ExcelDataReader.DataSet, 3.6.0"
#r "nuget: ManuscriptsProcessor, 0.4.0.1"

In [None]:
using ExcelDataReader;
using System.IO;
using Newtonsoft.Json;
using CorpusDraftCSharp;
using System.Data;
using System.Text;

In [None]:
// Register the code-page encoding provider so that Encoding can resolve the encodings it supplies.
// NOTE(review): presumably required so ExcelDataReader can decode strings in legacy workbooks — confirm.
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

In [None]:
// The file to preprocess must be placed in the "files" subdirectory of the current working
// directory (normally the directory this notebook lives in). The directory is created if it
// does not exist yet.
string fileDirPath = Path.Combine(Directory.GetCurrentDirectory(), "files");
Directory.CreateDirectory(fileDirPath);
string filename = await GetInputAsync("Insert the name of a file you want to preprocess");

// Fail fast on a blank or mistyped name. Otherwise a blank answer makes Path.Combine return the
// directory itself and the error only surfaces later, when the file is opened, with a far less
// helpful message.
if (string.IsNullOrWhiteSpace(filename) || !File.Exists(Path.Combine(fileDirPath, filename)))
{
    throw new FileNotFoundException(
        $"File '{filename}' was not found in '{fileDirPath}'. Put the file there and run this cell again.",
        filename);
}

In [None]:
// Ask for the values that identify the manuscript and build the Document from them.
// The answer to the first question is passed on unchanged as the first Document argument.
var documentID = await GetInputAsync("How many manuscripts do you have in the database?");
var documentName = await GetInputAsync("Insert the name of a current manuscript");
var googleDocPath = await GetInputAsync("Insert a link to the pdf file in google docs or type \"_\"");
var filePath = Path.Combine(fileDirPath, filename);
var doc = new Document(documentID, documentName, filePath, googleDocPath);

// Metadata keys paired with the prompt used to obtain each value.
// The user is asked in exactly this order.
var documentMetaPrompts = new (string Key, string Prompt)[]
{
    ("Тип памятника", "Insert a manuscript type"),
    ("Объем", "Insert a manuscript size"),
    ("Материал", "Insert a manuscript material"),
    ("Датировка", "Insert a manuscript creation date"),
    ("Место создания", "Insert a manuscript creation place"),
    ("Место хранения", "Insert a manuscript storage place"),
    ("Инвентарный номер", "Insert a manuscript inventory number"),
};

// All document-level metadata lives in a single dictionary at index 0.
doc.documentMetaData = new();
doc.documentMetaData.Add(new Dictionary<string, List<Value>>());
foreach (var (metaKey, metaPrompt) in documentMetaPrompts)
{
    var answer = await GetInputAsync(metaPrompt);
    doc.documentMetaData[0][metaKey] = new List<Value> { new Value(answer) };
}

In [None]:
// Holds every worksheet of the workbook as a DataTable; filled in below and consumed by later cells.
System.Data.DataSet ds;

using (var stream = File.Open(filePath, FileMode.Open, FileAccess.Read))
// The reader is IDisposable as well, so it is released together with the stream.
using (var reader = ExcelDataReader.ExcelReaderFactory.CreateReader(stream))
{
    // Use the first row of each sheet as column names; later cells address
    // columns by name ("Id", "Token").
    var conf = new ExcelDataSetConfiguration()
    {
        ConfigureDataTable = (tableReader) => new ExcelDataTableConfiguration()
        {
            UseHeaderRow = true
        }
    };
    ds = reader.AsDataSet(conf);
}

In [None]:
// Convert every worksheet into one Text that contains a single Clause; each row of the
// sheet becomes a Realization inside that clause.
for (int sheetIndex = 0; sheetIndex < ds.Tables.Count; sheetIndex++)
{
    DataTable sheet = ds.Tables[sheetIndex];
    // The sheet index doubles as the identifier passed to the Text, Clause and Realization constructors.
    string sheetId = sheetIndex.ToString();

    // Show a preview of the sheet so the user knows which text they are describing.
    Console.WriteLine("The next text looks like this:");
    sheet.Head(5, true);

    var textName = await GetInputAsync("Insert a name for the text");
    var text = new Text(documentID, sheetId, textName, filePath);

    // Text-level metadata is kept in a single dictionary at index 0.
    text.textMetaData = new();
    text.textMetaData.Add(new Dictionary<string, List<Value>>());
    var writingSystem = await GetInputAsync("Insert a writing system of the manuscript");
    text.textMetaData[0]["Письмо"] = new List<Value> { new Value(writingSystem) };
    var language = await GetInputAsync("Insert a manuscript language");
    text.textMetaData[0]["Язык"] = new List<Value> { new Value(language) };

    // The clause text is all tokens of the sheet joined with single spaces.
    var tokens = new List<string>(sheet.Rows.Count);
    foreach (DataRow row in sheet.Rows)
    {
        tokens.Add(row["Token"].ToString());
    }
    var clause = new Clause(documentID, sheetId, filePath, sheetId, string.Join(" ", tokens));

    foreach (DataRow row in sheet.Rows)
    {
        string token = row["Token"].ToString();
        var realization = new Realization(documentID, filePath, sheetId, sheetId, row["Id"].ToString(), token, token);
        realization.realizationFields = new();
        realization.realizationFields.Add(new Dictionary<string, List<Value>>());

        foreach (DataColumn column in sheet.Columns)
        {
            string columnName = column.ColumnName;
            // "Id" and "Token" were already handed to the constructor; they are not annotation fields.
            if (columnName == "Id" || columnName == "Token")
            {
                continue;
            }

            string cell = row[columnName].ToString();
            // Cells holding an em dash are skipped, so no field is stored for them.
            if (cell == "—")
            {
                continue;
            }

            realization.realizationFields[0][columnName] = new List<Value> { new Value(cell) };
        }

        // One Grapheme per element of lexemeOne, identified by its position.
        for (int position = 0; position < realization.lexemeOne.Length; position++)
        {
            var grapheme = new Grapheme(realization, position.ToString(), realization.lexemeOne[position].ToString());
            realization.letters.Add(grapheme);
        }

        clause.realizations.Add(realization);
    }

    text.clauses.Add(clause);
    doc.texts.Add(text);
}

The next text looks like this:
1 mi Noi mi zam. ličn. — — — mn. 1. l. — — — N 1  
2 knez Conte knez im. — — — — jd. — — — m. N 4  
3 zadarski di Zara zadarski prid. — — — — jd. — — — m. N 6  
4 šuci giudice sudac im. — — — — mn. — — — m. N 8  
5 od di od prij. — — — — — — — — — — 9  
The next text looks like this:
1 Noi mi noi zam. — — — — — 1  
2 Angelo — Angelo im. — — sg. — m. —  
3 Barbaro — Barbaro im. — — sg. — m. —  
4 Conte knez conte im. — — sg. — m. 2  
5 di — di prij. — — — — — —  


In [None]:
// Produce the string form of the populated document via its Jsonize() method;
// the result is written to a .json file in the following cell.
string jsonizedDoc = doc.Jsonize();

In [None]:
// The output file is named "<documentID>_<documentName>.json". Both parts come from free-text
// user input, so every character that is not allowed in a file name is replaced with '_';
// otherwise a name containing e.g. '/' or ':' would make file creation fail or end up in a
// different directory. Names that are already valid are left unchanged.
string rawFileName = doc.documentID + "_" + doc.documentName + ".json";
string finalFileName = string.Join("_", rawFileName.Split(Path.GetInvalidFileNameChars()));

// Results go to "files/Output" under the current working directory (created on demand).
string outputDirPath = Path.Combine(Directory.GetCurrentDirectory(), "files", "Output");
Directory.CreateDirectory(outputDirPath);
string finalPath = Path.Combine(outputDirPath, finalFileName);

// Overwrite any existing file with the same name; the JSON is followed by a newline.
using (var sw = new StreamWriter(finalPath, append: false))
{
    sw.WriteLine(jsonizedDoc);
}