# Importing libraries

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

# Defining the list of file names

In [2]:
# Base names of the source datasets, built from (exam formula, update year)
# pairs; every name follows "<formula> - szkoły (aktualizacja 09.<year>)".
files = [
    f"{formula} - szkoły (aktualizacja 09.{year})"
    for formula, year in (
        ("EM2015", 2021),
        ("EM2015", 2022),
        ("EM2015", 2023),
        ("EM2023", 2023),
    )
]

# Combining columns with correlated exams

In [3]:
def extractExams(file: str):
    """
    Read the exam names from the first line of a semicolon-separated header file.

    Empty cells and the "dla całego egzaminu dojrzałości" cell are skipped, and
    a trailing " (E)" or " (M)" marker is removed from every remaining name.

    Parameters
    ----------
    file : str
        Path to the header file (read as UTF-8; a leading BOM is ignored).

    Returns
    -------
    list of str
        Exam names in the order in which they appear in the header line.
    """
    with open(file, "r", encoding="utf-8-sig") as f:
        # Drop the line terminator up front so it can neither survive as a
        # separate cell nor stay glued to the last name and hide its marker.
        head = f.readline().rstrip("\n")

    skipped = ("", "dla całego egzaminu dojrzałości")
    exams = []
    for element in head.split(";"):
        if element in skipped:
            continue
        # Remove the markers as whole suffixes. str.rstrip(" (E)") treats its
        # argument as a set of characters, so it would also eat a trailing
        # "E", "M", "(" or ")" that belongs to the exam name itself.
        for marker in (" (E)", " (M)"):
            element = element.rstrip(" ")
            if element.endswith(marker):
                element = element[: -len(marker)]
        exams.append(element.rstrip(" "))
    return exams


# Give every column of each raw file a unique, descriptive name and save the
# result next to the raw file (same base name, without the " - raw" part).
for file in files:
    df = pd.read_csv(f"data/matura/{file} - raw.csv", sep=";")
    columns = df.columns.to_list()
    exames = extractExams(f"data/matura/{file} - columns.csv")
    newColumns = []
    pos = 0

    # Columns before the first "* liczba zdających" header are not tied to any
    # exam: keep their names, only dropping the leading "*" and spaces.
    # The bounds check guards against a file without any such header.
    while pos < len(columns) and columns[pos].split(".")[0] != "* liczba zdających":
        column = columns[pos]
        column = column.lstrip("*").lstrip(" ")
        newColumns.append(column)
        pos += 1

    # Every "* liczba zdających" header opens the column group of the next exam.
    for exam in exames:
        if pos >= len(columns):
            # Previously surfaced as a bare IndexError on columns[pos].
            raise ValueError(
                f"{file}: no columns left for exam '{exam}' "
                "(more exam names than column groups)"
            )

        # Opening column of the group. Everything after the first "." is
        # discarded (presumably the ".N" suffix pandas appends to repeated
        # header names - confirm against the raw files).
        column = columns[pos]
        column = column.split(".")[0].lstrip("*").strip(" ")
        column = f"{exam} | {column}"
        newColumns.append(column)
        pos += 1

        # Remaining columns of the group: up to the next opener or the end of
        # the table. Checking the bound in the condition (instead of after the
        # increment) also covers a last group that has only its opening column.
        while pos < len(columns) and columns[pos].split(".")[0] != "* liczba zdających":
            column = columns[pos]
            column = column.split(".")[0].lstrip("*").strip(" ")
            column = f"{exam} | {column}"
            newColumns.append(column)
            pos += 1

    df.columns = newColumns
    df.to_csv(f"data/matura/{file}.csv", index=False, sep=";")

# Combining repeating columns

In [4]:
def combineCols(df: pd.DataFrame, col1: str, col2: str):
    """
    Merge column `col2` into column `col1` and drop `col2`, modifying `df` in place.

    `col1` has priority: only the rows in which `col1` is null take their value
    from `col2`.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to modify in place.
    col1 : str
        Name of the column that is kept.
    col2 : str
        Name of the column used to fill the gaps of `col1`; removed afterwards.

    Returns
    -------
    None
    """
    # Vectorised fill: unlike a `df.loc[i, ...]` loop over range(len(df)) it
    # does not require the default 0..n-1 index and avoids per-row Python work.
    df[col1] = df[col1].fillna(df[col2])
    df.drop(columns=[col2], inplace=True)

### EM2015 - 2021

In [5]:
df = pd.read_csv("data/matura/EM2015 - szkoły (aktualizacja 09.2021).csv", sep=";")
# Collect the columns whose name ENDS with ".1" (the suffix pandas' read_csv
# gives to a repeated header). A plain `".1" in col` test would also pick up
# names that merely contain ".1", e.g. "x.10".
columns = [col for col in df.columns.to_list() if col.endswith(".1")]

columns

[]

### EM2015 - 2022

In [6]:
df = pd.read_csv("data/matura/EM2015 - szkoły (aktualizacja 09.2022).csv", sep=";")
# Collect the columns whose name ENDS with ".1" (the suffix pandas' read_csv
# gives to a repeated header). A plain `".1" in col` test would also pick up
# names that merely contain ".1", e.g. "x.10".
columns = [col for col in df.columns.to_list() if col.endswith(".1")]

In [7]:
for col in columns:
    # Cut off exactly the ".1" suffix to get the name of the primary column.
    # str.rstrip(".1") strips every trailing "." and "1" character instead
    # ("x1.1" -> "x"); for a name not ending in ".1" it returns the name
    # unchanged, which made combineCols drop that column.
    if col.endswith(".1"):
        combineCols(df, col[: -len(".1")], col)

In [8]:
# Save the frame as a semicolon-separated CSV without the index column.
df.to_csv(
    "data/matura/EM2015 - szkoły (aktualizacja 09.2022).csv",
    index=False,
    sep=";",
)

### EM2015 - 2023

In [9]:
df = pd.read_csv("data/matura/EM2015 - szkoły (aktualizacja 09.2023).csv", sep=";")
# Collect the columns whose name ENDS with ".1" (the suffix pandas' read_csv
# gives to a repeated header). A plain `".1" in col` test would also pick up
# names that merely contain ".1", e.g. "x.10".
columns = [col for col in df.columns.to_list() if col.endswith(".1")]

In [10]:
for col in columns:
    # Cut off exactly the ".1" suffix to get the name of the primary column.
    # str.rstrip(".1") strips every trailing "." and "1" character instead
    # ("x1.1" -> "x"); for a name not ending in ".1" it returns the name
    # unchanged, which made combineCols drop that column.
    if col.endswith(".1"):
        combineCols(df, col[: -len(".1")], col)

In [35]:
# Save the frame as a semicolon-separated CSV without the index column.
df.to_csv(
    "data/matura/EM2015 - szkoły (aktualizacja 09.2023).csv",
    index=False,
    sep=";",
)

### EM2023 - 2023

In [36]:
df = pd.read_csv("data/matura/EM2023 - szkoły (aktualizacja 09.2023).csv", sep=";")
# Collect the columns whose name ENDS with ".1" (the suffix pandas' read_csv
# gives to a repeated header). A plain `".1" in col` test would also pick up
# names that merely contain ".1", e.g. "x.10".
columns = [col for col in df.columns.to_list() if col.endswith(".1")]

In [37]:
for col in columns:
    # Cut off exactly the ".1" suffix to get the name of the primary column.
    # str.rstrip(".1") strips every trailing "." and "1" character instead
    # ("x1.1" -> "x"); for a name not ending in ".1" it returns the name
    # unchanged, which made combineCols drop that column.
    if col.endswith(".1"):
        combineCols(df, col[: -len(".1")], col)

In [38]:
# Save the frame as a semicolon-separated CSV without the index column.
df.to_csv(
    "data/matura/EM2023 - szkoły (aktualizacja 09.2023).csv",
    index=False,
    sep=";",
)

# Combining datasets

In [20]:
# (path, value for "rok", value for "formuła") of every cleaned dataset,
# listed in the order in which the frames are stacked.
sources = [
    ("data/matura/EM2015 - szkoły (aktualizacja 09.2021).csv", 2021, "EM2015"),
    ("data/matura/EM2015 - szkoły (aktualizacja 09.2022).csv", 2022, "EM2015"),
    ("data/matura/EM2015 - szkoły (aktualizacja 09.2023).csv", 2023, "EM2015"),
    ("data/matura/EM2023 - szkoły (aktualizacja 09.2023).csv", 2023, "EM2023"),
]

frames = []
for path, year, formula in sources:
    frame = pd.read_csv(path, sep=";")
    # Tag every row with the dataset it comes from.
    frame["rok"] = year
    frame["formuła"] = formula
    frames.append(frame)

# Keep the individual frames available under their original names.
df1, df2, df3, df4 = frames

# Stack the frames and renumber the rows from 0.
df = pd.concat(frames).reset_index(drop=True)

# Lower-case all column names.
columns = [name.lower() for name in df.columns.to_list()]
df.columns = columns

df.to_csv("data/matura/matura_szkoly.csv", sep=";", index=False)

### Combining duplicated columns

In [21]:
df = pd.read_csv("data/matura/matura_szkoly.csv", sep=";")
# Collect the columns whose name ENDS with ".1" (the suffix pandas' read_csv
# gives to a repeated header). A plain `".1" in col` test would also pick up
# names that merely contain ".1", e.g. "x.10".
columns = [col for col in df.columns.to_list() if col.endswith(".1")]

In [22]:
for col in columns:
    # Cut off exactly the ".1" suffix to get the name of the primary column.
    # str.rstrip(".1") strips every trailing "." and "1" character instead
    # ("x1.1" -> "x"); for a name not ending in ".1" it returns the name
    # unchanged, which made combineCols drop that column.
    if col.endswith(".1"):
        combineCols(df, col[: -len(".1")], col)

### Converting column names

In [23]:
def convertColumns(columns: list):
    """
    Normalise column names.

    Each name is lower-cased, its Polish diacritic letters are replaced with
    plain Latin ones, every run of whitespace becomes a single underscore
    (leading and trailing whitespace is dropped) and dashes become underscores.

    Parameters
    ----------
    columns : list
        Column names (strings). The list itself is not modified.

    Returns
    -------
    list
        New list with the converted names, in the same order.
    """
    # Translation table: Polish diacritic letter -> plain Latin counterpart
    table = str.maketrans(dict(zip("ąęćżźśńłó", "aeczzsnlo")))

    converted = []
    for name in columns:
        # lowercase first, so the lowercase-only table covers capital letters too
        name = name.lower().translate(table)
        # whitespace runs -> single underscore
        name = "_".join(name.split())
        # dashes -> underscores
        converted.append(name.replace("-", "_"))

    return converted

In [24]:
# Replace the headers with the names returned by convertColumns.
current_names = df.columns.to_list()
df.columns = convertColumns(current_names)

In [25]:
# Save the frame as a semicolon-separated CSV without the index column.
df.to_csv(
    "data/matura/matura_szkoly.csv",
    sep=";",
    index=False,
)