In [7]:
import pandas as pd
import numpy as np
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np


class Transformation(ABC):
    """Abstract base class for transformations.

    Subclasses must override :meth:`transform`; because that method is
    abstract, this class cannot be instantiated directly.
    """

    @abstractmethod
    def transform(self) -> pd.DataFrame:
        """Run the transformation and return its result."""


class TransformationSequence(Transformation):
    """Composite transformation that runs several transformations in order."""

    def __init__(self, transformations):
        """Store the transformations to run.

        Args:
            transformations: Iterable of objects exposing a no-argument
                ``transform()`` method, listed in execution order.
        """
        self.transformationSequence = transformations

    def transform(self) -> pd.DataFrame:
        """Run every stored transformation in order.

        Returns:
            Whatever the last transformation returned, or ``None`` when the
            sequence is empty.
        """
        result = None
        for t in self.transformationSequence:
            # ``t.transform`` is already bound to ``t``. Passing ``self`` as
            # well would hand the sequence object to a method that, per the
            # ``transform(self)`` contract, accepts no extra argument and
            # would therefore raise TypeError.
            result = t.transform()
        return result
def filterDataFrame(data: pd.DataFrame, filters: dict = None) -> pd.DataFrame:
    """Keep only the rows of ``data`` that match every equality filter.

    Args:
        data: Frame to filter. It is never modified.
        filters: Mapping of column name -> required value. Entries whose
            value is ``None`` are skipped. ``None`` or an empty mapping means
            "no filtering".

    Returns:
        The rows of ``data`` for which every active filter column equals its
        required value. When no filter is active, ``data`` itself is returned.

    Raises:
        KeyError: If an active filter names a column that ``data`` lacks.
    """
    # ``None`` replaces the former mutable ``{}`` default, which is shared
    # between calls.
    if filters is None:
        return data

    filteredData: pd.DataFrame = data
    for column, wanted in filters.items():
        # Identity check on purpose: ``wanted != None`` is evaluated
        # element-wise for array-like values and cannot be used as a condition.
        if wanted is not None:
            filteredData = filteredData.loc[filteredData[column] == wanted]

    return filteredData


def columnUniqueValues(data: pd.DataFrame, column: str, filters: dict = None):
    """Return the distinct values of ``column`` among the rows passing ``filters``.

    Args:
        data: Frame to read from.
        column: Name of the column whose distinct values are wanted.
        filters: Optional mapping of column name -> required value, handed to
            ``filterDataFrame``. ``None`` means "no filtering".

    Returns:
        The result of ``Series.unique()`` on the filtered column.
    """
    # ``None`` replaces the former mutable ``{}`` default. A fresh empty dict
    # is passed down so the helper always receives a mapping.
    activeFilters = {} if filters is None else filters
    return filterDataFrame(data, activeFilters)[column].unique()

from enum import Enum

class ColName(Enum):
    """Column labels referenced by the transformations; ``.value`` is the label string."""

    # The four members below are only renamed after the merge in
    # StudentMovements.transform (``<label>_x`` / ``<label>_y`` columns).
    SEX = "sex"
    UNIT = "unit"
    INSC_TYPE = "insc"  # presumably the inscription type -- TODO confirm
    OFFER = "offer"  # column compared between the two tables to detect a change
    ID = "id"  # key used to merge the two tables

import os

class StudentMovements(Transformation):
    """Counts how the ids of one inscription table reappear in a second table.

    Both entry points return a dict with four counters:

    * ``"Enrolled"``   - distinct ids in the (filtered) first table.
    * ``"Reenrolled"`` - distinct ids present in both tables with the same
      offer at least once.
    * ``"Movements"``  - distinct ids present in both tables, but never with
      the same offer.
    * ``"NoData"``     - the remaining ids of the first table, i.e. those with
      no row in the (filtered) second table.

    Note: unlike the abstract ``Transformation.transform(self)``, the methods
    here take the tables, filters and output folder as explicit arguments.
    """

    @staticmethod
    def _dumpToExcel(data, outputFolder, fileName) -> None:
        """Write ``data`` to ``outputFolder/fileName``, wrapping it in a DataFrame if needed."""
        frame = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
        frame.to_excel(os.path.join(outputFolder, fileName))

    def transform1(self, table1, table2, filters1, filters2, outputFolder) -> dict:
        """Compute the counters with two merges and write nothing to disk.

        Args:
            table1: First inscription table (must have the id and offer columns).
            table2: Second inscription table (same requirement).
            filters1: Equality filters applied to ``table1`` (see ``filterDataFrame``).
            filters2: Equality filters applied to ``table2``.
            outputFolder: Unused; kept so both entry points share a signature.

        Returns:
            Dict with the ``Enrolled`` / ``Reenrolled`` / ``Movements`` /
            ``NoData`` counters described on the class.
        """
        table1 = filterDataFrame(table1, filters1)
        table2 = filterDataFrame(table2, filters2)

        # Ids present in both tables, whatever the offer.
        activeOnAnyOffer = table1.merge(table2, on=ColName.ID.value, how="inner")

        # Ids present in both tables with an identical offer.
        activeOnSameOffer = table1.merge(
            table2, on=[ColName.ID.value, ColName.OFFER.value], how="inner"
        )

        activeOnAnyOffer = columnUniqueValues(activeOnAnyOffer, ColName.ID.value)
        activeOnSameOffer = columnUniqueValues(activeOnSameOffer, ColName.ID.value)

        # In both tables, but never on the same offer.
        movements = np.setdiff1d(activeOnAnyOffer, activeOnSameOffer)

        # table1 is already filtered, so no filters are passed here.
        year1Enrolled = columnUniqueValues(table1, ColName.ID.value)

        return {
            "Enrolled": year1Enrolled.size,
            "Reenrolled": activeOnSameOffer.size,
            "Movements": movements.size,
            "NoData": year1Enrolled.size - activeOnSameOffer.size - movements.size,
        }

    def transform(self, table1, table2, filters1, filters2, outputFolder) -> dict:
        """Compute the counters and dump every intermediate step to Excel.

        Args:
            table1: First inscription table (must have the id and offer columns).
            table2: Second inscription table (same requirement).
            filters1: Equality filters applied to ``table1`` (see ``filterDataFrame``).
            filters2: Equality filters applied to ``table2``.
            outputFolder: Folder receiving the ``*.xlsx`` dumps; it is created
                when missing.

        Returns:
            Dict with the ``Enrolled`` / ``Reenrolled`` / ``Movements`` /
            ``NoData`` counters described on the class. The same numbers are
            written to ``result.xlsx``.
        """
        # Create the dump folder up front so the first export cannot fail on a
        # missing directory. An empty folder name means the current directory.
        if outputFolder:
            os.makedirs(outputFolder, exist_ok=True)

        table1 = filterDataFrame(table1, filters1)
        table2 = filterDataFrame(table2, filters2)

        self._dumpToExcel(table1, outputFolder, "table1.xlsx")
        self._dumpToExcel(table2, outputFolder, "table2.xlsx")

        activity = table1.merge(table2, on=ColName.ID.value, how="inner")

        # The merge suffixes clashing columns with "_x" (table1) and "_y"
        # (table2); give those columns explicit names. Labels missing from the
        # merged frame are ignored by ``rename``.
        renamedColumns = {}
        for col in (ColName.UNIT, ColName.OFFER, ColName.INSC_TYPE, ColName.SEX):
            renamedColumns[col.value + "_x"] = "DX[" + col.value + "]"
            renamedColumns[col.value + "_y"] = "DXY[" + col.value + "]"
        activity = activity.rename(columns=renamedColumns)

        self._dumpToExcel(activity, outputFolder, "activity.xlsx")

        offerInTable1 = "DX[" + ColName.OFFER.value + "]"
        offerInTable2 = "DXY[" + ColName.OFFER.value + "]"

        # NOTE(review): if the offer columns are categorical with different
        # category sets, pandas may refuse to compare them -- confirm the
        # dtypes of the input tables (an earlier version cast them to str).
        differentActivity: pd.DataFrame = activity.loc[
            activity[offerInTable1] != activity[offerInTable2]
        ]
        self._dumpToExcel(differentActivity, outputFolder, "differentActivity.xlsx")

        sameActivity: pd.DataFrame = activity.loc[
            activity[offerInTable1] == activity[offerInTable2]
        ]
        self._dumpToExcel(sameActivity, outputFolder, "sameActivity.xlsx")

        differentActivityIds = columnUniqueValues(differentActivity, ColName.ID.value)
        self._dumpToExcel(differentActivityIds, outputFolder, "differentActivityIds.xlsx")

        sameActivityIds = columnUniqueValues(sameActivity, ColName.ID.value)
        self._dumpToExcel(sameActivityIds, outputFolder, "sameActivityIds.xlsx")

        inscriptions = columnUniqueValues(table1, ColName.ID.value)
        self._dumpToExcel(inscriptions, outputFolder, "inscriptions.xlsx")

        # An id with both a matching and a non-matching offer pair counts as
        # re-enrolled, not as a movement.
        differentActivityIds = np.setdiff1d(differentActivityIds, sameActivityIds)
        self._dumpToExcel(differentActivityIds, outputFolder, "differentActivityIds2.xlsx")

        result = {
            "Enrolled": inscriptions.size,
            "Reenrolled": sameActivityIds.size,
            "Movements": differentActivityIds.size,
            "NoData": inscriptions.size
            - differentActivityIds.size
            - sameActivityIds.size,
        }

        # One-row sheet: each counter becomes a single-element column.
        resultForExcel = {name: [count] for name, count in result.items()}
        self._dumpToExcel(resultForExcel, outputFolder, "result.xlsx")

        return result


In [8]:
transformer = StudentMovements()

# NOTE: unpickling can execute arbitrary code -- only load trusted files.
tab1 = pd.read_pickle('./test_files/student_inscriptions_xxxx.pickle')
tab2 = pd.read_pickle('./test_files/student_inscriptions_yyyy.pickle')

# Create the folder for the Excel dumps. ``exist_ok=True`` makes the cell safe
# to re-run and avoids the check-then-create race.
outputFolder = os.path.join("./test_output", "student_movements_test")
os.makedirs(outputFolder, exist_ok=True)

# Last expression of the cell: the resulting counters are displayed as output.
transformer.transform(tab1, tab2, {}, {}, outputFolder)

{'Enrolled': 10, 'Reenrolled': 1, 'Movements': 6, 'NoData': 3}