In [None]:
#r "nuget: ExcelDataReader, 3.6.0"
#r "nuget: ExcelDataReader.DataSet, 3.6.0"
#r "nuget: ManuscriptsProcessor, 0.4.0.1"

In [None]:
using ExcelDataReader;
using System.IO;
using Newtonsoft.Json;
using CorpusDraftCSharp;
using System.Data;
using System.Text;

In [None]:
// Register the code-page encoding provider so that legacy (non-Unicode) encodings can be resolved.
// NOTE(review): presumably required by ExcelDataReader when it reads older Excel/CSV files on .NET Core — confirm.
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

In [None]:
// Input files are expected in the "files" subdirectory of the current working directory;
// the directory is created here if it does not exist yet.
string fileDirPath = Path.Combine(Directory.GetCurrentDirectory(), "files");
Directory.CreateDirectory(fileDirPath);
// Ask for the name of the file to preprocess; a later cell combines it with fileDirPath.
// GetInputAsync is not defined in this notebook — presumably the interactive kernel's prompt helper.
string filename = await GetInputAsync("Insert the name of a file you want to preprocess");

In [None]:
// Collect the identifying data for the manuscript and create the Document that will hold its texts.
var documentID = await GetInputAsync("How many manuscripts do you have in the database?");
var documentName = await GetInputAsync("Insert the name of a current manuscript");
var googleDocPath = await GetInputAsync("Insert a link to the pdf file in google docs or type \"_\"");
var filePath = Path.Combine(fileDirPath, filename);
var doc = new Document(documentID, documentName, filePath, googleDocPath);

// All manuscript-level metadata is kept in one dictionary stored as the first list entry.
doc.documentMetaData = new();
doc.documentMetaData.Add(new Dictionary<string, List<Value>>());

// Metadata key paired with the prompt shown to the user; the prompts are asked in exactly this order.
foreach (var (metaKey, prompt) in new[]
{
    ("Тип памятника", "Insert a manuscript type"),
    ("Объем", "Insert a manuscript size"),
    ("Материал", "Insert a manuscript material"),
    ("Датировка", "Insert a manuscript creation date"),
    ("Место создания", "Insert a manuscript creation place"),
    ("Место хранения", "Insert a manuscript storage place"),
    ("Инвентарный номер", "Insert a manuscript inventory number"),
})
{
    doc.documentMetaData[0][metaKey] = new List<Value> { new Value(await GetInputAsync(prompt)) };
}

In [None]:
// Load every worksheet of the input workbook into a DataSet (one DataTable per sheet).
System.Data.DataSet ds = new();

// Both the file stream and the reader are disposable; the original code never disposed
// the reader, so it is now wrapped in its own using statement.
using (var stream = File.Open(filePath, FileMode.Open, FileAccess.Read))
using (IExcelDataReader reader = ExcelDataReader.ExcelReaderFactory.CreateReader(stream))
{
    var conf = new ExcelDataSetConfiguration()
    {
        ConfigureDataTable = (tableReader) => new ExcelDataTableConfiguration()
        {
            // Treat the first row of every sheet as column names ("Id", "Token", ...).
            UseHeaderRow = true
        }
    };
    ds = reader.AsDataSet(conf);
}

In [None]:
// Convert every worksheet into one Text that holds a single Clause; each row of the sheet
// becomes a Realization (token) of that clause and each character of the token a Grapheme.
for (int i = 0; i < ds.Tables.Count; i++) {
    DataTable table = ds.Tables[i];
    // The sheet index is used for every text/clause ID argument below.
    string sheetIndex = i.ToString();

    Console.WriteLine("The next text looks like this:");
    // Head is not defined in this notebook; judging by the recorded output it previews the first rows.
    table.Head(5, true);

    var textName = await GetInputAsync("Insert a name for the text");
    var text = new Text(documentID, sheetIndex, textName, filePath);
    text.textMetaData = new();
    text.textMetaData.Add(new Dictionary<string, List<Value>>());
    text.textMetaData[0]["Письмо"] = new List<Value> {new Value(await GetInputAsync("Insert a writing system of the manuscript"))};
    text.textMetaData[0]["Язык"] = new List<Value> {new Value(await GetInputAsync("Insert a manuscript language"))};

    // The clause text is simply all tokens of the sheet joined by single spaces.
    string clauseText = string.Join(" ", table.Rows.Cast<DataRow>().Select(row => row["Token"].ToString()));
    var cls = new Clause(documentID, sheetIndex, filePath, sheetIndex, clauseText);

    foreach (DataRow r in table.Rows) {
        string token = r["Token"].ToString();
        // The token is passed for both lexeme arguments.
        var currentRealization = new Realization(documentID, filePath, sheetIndex, sheetIndex, r["Id"].ToString(), token, token);

        var annotations = new Dictionary<string, List<Value>>();
        currentRealization.realizationFields = new();
        currentRealization.realizationFields.Add(annotations);
        foreach (DataColumn column in table.Columns) {
            // "Id" and "Token" are consumed above and are not annotation fields.
            if (column.ColumnName == "Id" || column.ColumnName == "Token") {
                continue;
            }
            string cell = r[column].ToString();
            // An em dash marks "no value" in the sheet, so such cells are skipped.
            if (cell != "—") {
                annotations[column.ColumnName] = new List<Value>{new Value(cell)};
            }
        }

        // One Grapheme per UTF-16 char of the token.
        for (int j = 0; j < currentRealization.lexemeOne.Length; j++) {
            currentRealization.letters.Add(new Grapheme(currentRealization, j.ToString(), currentRealization.lexemeOne[j].ToString()));
        }
        cls.realizations.Add(currentRealization);
    }
    text.clauses.Add(cls);
    doc.texts.Add(text);
}

The next text looks like this:
1 Noi noi pron. — — — — —    
2 Angelo Angelo noun — — sg. — m.    
3 Barbaro Barbaro noun — — sg. — m.    
4 Conte conte noun — — sg. — m.    
5 di di prep. — — — — —    
The next text looks like this:
1 mi mi pron. pers. — — — pl. 1. p. — — — N    
2 knez knez noun — — — — sg. — — — m. N    
3 zadarski zadarski adj. — — — — sg. — — — m. N    
4 šuci sudac noun — — — — pl. — — — m. N    
5 od od prep. — — — — — — — — — —    


In [None]:
// Serialize the assembled document to a JSON string (Jsonize is defined by the Document type, outside this notebook).
// This cell must run before the cell that writes jsonizedDoc to disk.
string jsonizedDoc = doc.Jsonize();

In [None]:
// Write the serialized document to files/Output/<documentID>_<documentName>.json,
// creating the output directory on demand and overwriting an existing file.
string finalFileName = $"{doc.documentID}_{doc.documentName}.json";
string finalPath = Path.Combine(Directory.GetCurrentDirectory(), "files", "Output", finalFileName);
Directory.CreateDirectory(Path.Combine(Directory.GetCurrentDirectory(), "files", "Output"));
using (var outputStream = new FileStream(finalPath, FileMode.Create, FileAccess.Write))
using (var sw = new StreamWriter(outputStream))
{
    sw.WriteLine(jsonizedDoc);
}

Error: (5,18): error CS0103: Имя "jsonizedDoc" не существует в текущем контексте.

In [None]:
// Name (without ".json") of a previously exported document in files/Output that should be parallelized.
string docName = await GetInputAsync("Insert the name of a file (without extension)");

In [None]:
// Placeholder instance; the following cell replaces it with the document deserialized from JSON.
Document docToParallelize = new();

In [None]:

// Read files/Output/<docName>.json and deserialize it into docToParallelize.
using (var jsonStream = new FileStream(Path.Combine(Directory.GetCurrentDirectory(), "files", "Output", docName + ".json"), FileMode.Open, FileAccess.Read))
using (var jsonReader = new StreamReader(jsonStream))
{
    string json = jsonReader.ReadToEnd();
    docToParallelize = JsonConvert.DeserializeObject<Document>(json);
}

In [None]:
/// <summary>
/// A list of realizations that additionally carries document, text and clause identifiers.
/// </summary>
/// <remarks>
/// No constructor is declared: the compiler-generated public parameterless constructor
/// creates an empty group with all identifiers left at <c>null</c>.
/// </remarks>
public class RealizationGroup : List<Realization>
{
    /// <summary>Document identifier of the group.</summary>
    public string documentID { get; set; }

    /// <summary>Text identifier of the group.</summary>
    public string textID { get; set; }

    /// <summary>Clause identifier of the group.</summary>
    public string clauseID { get; set; }
}

In [None]:
/// <summary>
/// A list of realization groups that are treated as parallels of one another.
/// </summary>
public class ParallelToken : List<RealizationGroup>
{
    /// <summary>Creates an empty parallel token.</summary>
    public ParallelToken()
    {
    }

    /// <summary>
    /// Returns every group of this token except <paramref name="source"/> itself.
    /// </summary>
    /// <param name="source">The group whose parallels are requested; compared by reference.</param>
    /// <returns>A new list with the remaining groups, in their original order.</returns>
    public List<RealizationGroup> GetParallels(RealizationGroup source)
    {
        var others = new List<RealizationGroup>();
        foreach (var group in this)
        {
            if (!ReferenceEquals(group, source))
            {
                others.Add(group);
            }
        }
        return others;
    }
}

In [None]:
/// <summary>
/// A document arranged for parallel display: a matrix of clauses plus a list of aligned tokens.
/// </summary>
[Serializable]
public class ParallelDocument {
    /// <summary>Identifier of the parallel document.</summary>
    [JsonProperty] public string id;

    /// <summary>Name of the parallel document.</summary>
    [JsonProperty] public string name;

    /// <summary>Document-level metadata.</summary>
    [JsonProperty] public List<Dictionary<string, List<Value>>> documentMetaData;

    /// <summary>Two-dimensional array of parallel clauses.</summary>
    [JsonProperty] public ParallelClause[,] parallelClauses;

    /// <summary>Aligned tokens.</summary>
    // NOTE(review): unlike the other members this field carries no [JsonProperty];
    // presumably it is still serialized as a public field under Json.NET defaults — confirm.
    public List<ParallelToken> parallelTokens;

    /// <summary>Serializes this instance to indented JSON.</summary>
    /// <returns>The JSON text.</returns>
    public string Jsonize() => JsonConvert.SerializeObject(this, Formatting.Indented);
}

In [None]:
[Serializable]
    /// <summary>
    /// One token of a clause together with its annotation fields and its graphemes.
    /// Two realizations are considered equal when their document, text, clause and
    /// realization identifiers are all equal.
    /// </summary>
    public class Realization : IEquatable<Realization>
    {

        #region objectValues
        [JsonProperty]
        public string documentID;
        [JsonProperty]
        public string filePath;
        [JsonProperty]
        public string textID;
        [JsonProperty]
        public string clauseID;
        [JsonProperty]
        public List<Dictionary<string, List<Value>>> realizationFields;
        [JsonProperty]
        public string realizationID;
        [JsonProperty]
        public string lexemeOne;
        [JsonProperty]
        public string lexemeTwo;
        [JsonProperty]
        public List<Grapheme> letters = new List<Grapheme>();
        #endregion

        #region Constructors

        /// <summary>Creates a realization from explicit values for every member.</summary>
        [JsonConstructor]
        public Realization(string _documentID, string _filePath, string _textID, string _clauseID, List<Dictionary<string, List<Value>>> _fields, string _realizationID, string _lexemeOne, string _lexemeTwo, List<Grapheme> _letters)
        {
            this.documentID = _documentID;
            this.filePath = _filePath;
            this.textID = _textID;
            this.clauseID = _clauseID;
            this.realizationFields = _fields;
            this.realizationID = _realizationID;
            this.lexemeOne = _lexemeOne;
            this.lexemeTwo = _lexemeTwo;
            // Keep the empty list from the field initializer when no list is supplied,
            // so that the other members never see a null grapheme list.
            this.letters = _letters ?? new List<Grapheme>();
        }

        /// <summary>Creates a realization that inherits its document, file, text and clause identifiers from <paramref name="clause"/>.</summary>
        public Realization(Clause clause, string _realizationID, string _lexemeOne, string _lexemeTwo)
        {
            this.documentID = clause.documentID;
            this.filePath = clause.filePath;
            this.textID = clause.textID;
            this.clauseID = clause.clauseID;
            this.realizationID = _realizationID;
            this.lexemeOne = _lexemeOne;
            this.lexemeTwo = _lexemeTwo;
        }


        /// <summary>Creates a realization from explicit identifiers; annotation fields stay unset and the grapheme list empty.</summary>
        public Realization(string _documentID, string _filePath, string _textID, string _clauseID, string _realizationID, string _lexemeOne, string _lexemeTwo)
        {
            this.documentID = _documentID;
            this.filePath = _filePath;
            this.textID = _textID;
            this.clauseID = _clauseID;
            this.realizationID = _realizationID;
            this.lexemeOne = _lexemeOne;
            this.lexemeTwo = _lexemeTwo;
        }

        /// <summary>Creates an empty realization.</summary>
        public Realization()
        {

        }


        #endregion

        #region publicMethods

        /// <summary>Serializes this realization to indented JSON.</summary>
        public string Jsonize()
        {
            return JsonConvert.SerializeObject(this, Formatting.Indented);
        }

        /// <summary>
        /// Renders the realization as an HTML <c>span</c> whose content is the concatenated
        /// output of its graphemes, ordered numerically by their identifiers.
        /// </summary>
        /// <returns>The span markup; its attributes are empty when the annotation fields cannot be rendered.</returns>
        public string Output()
        {
            return WrapInSpan(CollectGraphemes());
        }

        /// <summary>
        /// Renders the realization as an HTML <c>span</c> whose content is <see cref="lexemeTwo"/>.
        /// </summary>
        /// <returns>The span markup; its attributes are empty when the annotation fields cannot be rendered.</returns>
        public string KeyOutput()
        {
            return WrapInSpan(this.lexemeTwo);
        }

        /// <summary>Compares two realizations by document, text, clause and realization identifiers.</summary>
        /// <param name="other">The realization to compare with; <c>null</c> is never equal.</param>
        public bool Equals(Realization other)
        {
            if (other is null)
            {
                return false;
            }
            return documentID == other.documentID
                && textID == other.textID
                && clauseID == other.clauseID
                && realizationID == other.realizationID;
        }

        /// <summary>Object-level equality, kept consistent with <see cref="Equals(Realization)"/>.</summary>
        public override bool Equals(object obj)
        {
            return Equals(obj as Realization);
        }

        /// <summary>
        /// Hash code over the same four identifiers that define equality, so that hash-based
        /// collections and LINQ set operations agree with <see cref="Equals(Realization)"/>.
        /// The identifiers are mutable: do not change them while the instance is stored in a hash-based collection.
        /// </summary>
        public override int GetHashCode()
        {
            return HashCode.Combine(documentID, textID, clauseID, realizationID);
        }

        #endregion

        #region privateMethods

        /// <summary>Concatenates the output of all graphemes in numeric identifier order; a missing list yields an empty string.</summary>
        private string CollectGraphemes()
        {
            if (letters == null)
            {
                return string.Empty;
            }

            var collected = new StringBuilder();
            var ordered = letters
                .OrderBy(grapheme => Convert.ToInt32(grapheme.documentID))
                .ThenBy(grapheme => Convert.ToInt32(grapheme.textID))
                .ThenBy(grapheme => Convert.ToInt32(grapheme.clauseID))
                .ThenBy(grapheme => Convert.ToInt32(grapheme.realizationID))
                .ThenBy(grapheme => Convert.ToInt32(grapheme.graphemeID));
            foreach (var letter in ordered)
            {
                collected.Append(letter.Output());
            }
            return collected.ToString();
        }

        /// <summary>
        /// Renders annotation fields as text: every field becomes <c>key:value1,value2;</c> followed by
        /// a line break, and every non-empty dictionary is terminated by <c>***</c>.
        /// </summary>
        private static string FieldsAsRawText(List<Dictionary<string, List<Value>>> fields)
        {
            var result = new StringBuilder();
            foreach (var optionalTagging in fields)
            {
                if (optionalTagging.Count == 0)
                {
                    continue;
                }
                foreach (var field in optionalTagging)
                {
                    result.Append(field.Key);
                    result.Append(":");
                    for (int i = 0; i < field.Value.Count; i++)
                    {
                        result.Append(field.Value[i].name);
                        if (i < field.Value.Count - 1)
                        {
                            result.Append(",");
                        }
                    }
                    result.Append(";\n");
                }
                result.Append("***");
            }
            return result.ToString();
        }

        /// <summary>
        /// Builds the span element shared by <see cref="Output"/> and <see cref="KeyOutput"/>:
        /// <c>title</c> holds the raw field text, <c>data-content</c> the same text with line
        /// breaks replaced by <c>&lt;br /&gt;</c>, and <c>id</c> the four identifiers joined by <c>|</c>.
        /// </summary>
        /// <param name="content">Text placed inside the span.</param>
        private string WrapInSpan(string content)
        {
            string attributes;
            try
            {
                string rawText = FieldsAsRawText(realizationFields);
                attributes = "title=\"" + rawText + "\" data-content=\"" + rawText.Replace("\n", "<br />") + "\"";
            }
            catch (Exception)
            {
                // Deliberate best effort: missing or malformed annotation fields
                // (e.g. realizationFields was never assigned) produce empty attributes.
                attributes = "title= \"\" data-content=\"\"";
            }

            // NOTE(review): field values and content are inserted without HTML escaping;
            // left as is to keep the produced markup unchanged.
            return "<span " + attributes + " class=\"word\" id=\"" + this.documentID + "|" + this.textID + "|" + this.clauseID + "|" + this.realizationID + "\"> " + content + "</span>";
        }

        #endregion

    }

In [None]:
// Prepare the output directory for parallelized documents and start a ParallelDocument
// whose id is the number of files already present in that directory.
var dirTexts = Path.Combine(Directory.GetCurrentDirectory(), "files", "Output", "parallelizedDocuments");
Directory.CreateDirectory(dirTexts);
DirectoryInfo directoryTextsInfo = new DirectoryInfo(dirTexts);
ParallelDocument parallelDocument = new ParallelDocument
{
    id = directoryTextsInfo.GetFiles().Length.ToString(),
    name = docToParallelize.documentName,
    documentMetaData = docToParallelize.documentMetaData
};

// Matrix layout: rows are clause positions, columns are texts. Texts with fewer clauses
// than the longest one get cells whose clause is null.
int maxClausesNumber = docToParallelize.texts.Max(t => t.clauses.Count);
ParallelClause[,] parallelMatrix = new ParallelClause[maxClausesNumber, docToParallelize.texts.Count];
for (int textIndex = 0; textIndex < docToParallelize.texts.Count; textIndex++)
{
    var sourceText = docToParallelize.texts[textIndex];
    for (int clauseIndex = 0; clauseIndex < maxClausesNumber; clauseIndex++)
    {
        parallelMatrix[clauseIndex, textIndex] = new ParallelClause
        {
            textName = sourceText.textName,
            textMetaData = sourceText.textMetaData,
            clause = clauseIndex < sourceText.clauses.Count ? sourceText.clauses[clauseIndex] : null
        };
    }
}
parallelDocument.parallelClauses = parallelMatrix;

// Snapshot of the document as JSON at this point (taken before the token alignment is filled in).
string documentInJSON = JsonConvert.SerializeObject(parallelDocument, Formatting.Indented);

In [None]:
// Name (without extension) of the CSV file holding the token alignment data.
string parallelDataInfo = await GetInputAsync("Insert a name of a document that contains the data for the parallelization");
// The file is looked up as files/<name>.csv below the current working directory.
string parallelDataDoc = Path.Combine(Directory.GetCurrentDirectory(), "files", parallelDataInfo + ".csv");

In [None]:
// Load the alignment CSV into a DataSet; its first (and only used) table holds the alignment rows.
System.Data.DataSet ds_parallel = new();

// Both the file stream and the reader are disposable; the original code never disposed
// the reader, so it is now wrapped in its own using statement.
using (var stream = File.Open(parallelDataDoc, FileMode.Open, FileAccess.Read))
using (IExcelDataReader reader = ExcelDataReader.ExcelReaderFactory.CreateCsvReader(stream))
{
    var conf = new ExcelDataSetConfiguration()
    {
        ConfigureDataTable = (tableReader) => new ExcelDataTableConfiguration()
        {
            // Treat the first CSV row as column names.
            UseHeaderRow = true
        }
    };
    ds_parallel = reader.AsDataSet(conf);
}

In [None]:
// The alignment data is taken from the first table of the loaded CSV data set.
var parallelData = ds_parallel.Tables[0];
// Head is not defined in this notebook; judging by the recorded output it previews the first rows.
parallelData.Head(5, true);

1 Noi 1 mi  
2 Angelo – –  
3 Barbaro – –  
4 Conte 2 knez  
5, 6 di Zara 3 zadarski  


In [None]:
// Build the token-level alignment. Every CSV row describes one alignment; judging by the
// preview output, columns come in pairs (token IDs, token text) with one pair per text,
// so only every second column is read. A cell may list several IDs separated by commas,
// and an en dash marks "no counterpart in this text".
parallelDocument.parallelTokens = new();
for (int i = 0; i < parallelData.Rows.Count; i++)
{
    var token2Add = new ParallelToken();
    // Index the row directly instead of creating a new DataView for every cell access.
    DataRow alignmentRow = parallelData.Rows[i];

    for (int j = 0; j < parallelData.Columns.Count; j = j + 2)
    {
        int textIndex = j / 2;
        RealizationGroup currentGroup = new RealizationGroup();

        // Trim before filtering so that whitespace-only entries (e.g. the tail of "5, ")
        // are dropped instead of being looked up as an empty ID.
        var singleTokenIds = alignmentRow[j].ToString()
            .Split(',')
            .Select(t => t.Trim())
            .Where(t => t != "" && t != "–")
            .ToList();

        // Only the first clause row of the matrix is aligned.
        var sourceClause = parallelDocument.parallelClauses[0, textIndex].clause;
        foreach (var id in singleTokenIds)
        {
            var singleToken = sourceClause?.realizations.FirstOrDefault(r => r.realizationID == id);
            if (singleToken is null)
            {
                // Fail with a message that points at the offending cell rather than a bare NullReferenceException.
                throw new InvalidOperationException(
                    $"Alignment row {i}, column {j}: no realization with ID \"{id}\" found in text {textIndex}.");
            }

            // Re-label the realization for the parallel document.
            // NOTE(review): textID is fixed to "0" and clauseID receives the text index — confirm this mapping is intended.
            singleToken.documentID = parallelDocument.id;
            singleToken.textID = "0";
            singleToken.clauseID = textIndex.ToString();
            if (!currentGroup.Contains(singleToken))
            {
                currentGroup.Add(singleToken);
            }
        }
        token2Add.Add(currentGroup);
    }

    // Keep only alignments in which every text contributes at least one token.
    if (token2Add.All(rg => rg.Count > 0))
    {
        parallelDocument.parallelTokens.Add(token2Add);
    }
}

In [None]:
// Persist the parallel document as <fileCount>_<documentName>.json in the parallelizedDocuments
// directory; the prefix is the number of files present in that directory at this moment.
if (parallelDocument is not null)
{
    string serialized = parallelDocument.Jsonize();
    string targetName = $"{directoryTextsInfo.GetFiles().Length}_{docToParallelize.documentName}.json";
    string targetPath = Path.Combine(dirTexts, targetName);
    using (var output = new StreamWriter(new FileStream(targetPath, FileMode.Create)))
    {
        output.Write(serialized);
    }
}