In [9]:
import pandas as pd
import math as Math

from abc import ABC, abstractmethod

class Transformation(ABC):
    """Abstract base for the data transformations defined in this notebook.

    Concrete subclasses must override :meth:`transform`; the class itself
    cannot be instantiated.

    NOTE(review): ``OddsRatio.transform`` in this notebook takes extra
    arguments and returns a dict, so the signature declared here is a
    convention rather than an enforced contract.
    """

    @abstractmethod
    def transform(self) -> pd.DataFrame:
        """Run the transformation and return its result.

        The base implementation does nothing and returns ``None``.
        """

def filterDataFrame(data: pd.DataFrame, filters: "dict | None" = None) -> pd.DataFrame:
    """Keep only the rows of ``data`` whose columns equal the requested values.

    Args:
        data: Frame to filter. It is never modified.
        filters: Mapping of column name -> required value. Entries whose
            value is ``None`` are skipped. ``None`` or an empty mapping
            applies no filter at all.

    Returns:
        ``data`` itself when no filter is applied; otherwise a new frame with
        the rows that satisfy every active filter.

    Raises:
        KeyError: If an active filter names a column missing from ``data``.
    """
    filteredData: pd.DataFrame = data

    if not filters:
        return filteredData

    for column, wanted in filters.items():
        # Identity check on purpose: `!= None` would call the value's own
        # comparison operator, which yields an array (and an ambiguous truth
        # value) when the filter value is array-like.
        if wanted is not None:
            filteredData = filteredData.loc[filteredData[column] == wanted]

    return filteredData

from enum import Enum

class ColName(Enum):
    """Labels of the DataFrame columns referenced by this notebook.

    Read the literal column label with ``ColName.<MEMBER>.value``.
    """

    # Not referenced by the code visible in this notebook section.
    SEX = "sex"
    UNIT = "unit"
    INSC_TYPE = "insc"

    # Compared across two tables to decide whether a student changed offer.
    OFFER = "offer"

    # Key used to match rows between tables.
    ID = "id"

import os

class OddsRatio(Transformation):
    """Odds ratio between holding a scholarship and changing offer.

    The statistics are derived from this 2x2 contingency table:

        |                | Changed offer | Didn't change offer | Total |
        | Scholarship    | a             | b                   | a + b |
        | No scholarship | c             | d                   | c + d |
        | Total          | a + c         | b + d               |       |

        OR            = (a * d) / (b * c)
        Relative risk = (a / (a + b)) / (c / (c + d))
    """

    def transform(
        self,
        table1: pd.DataFrame,
        table2: pd.DataFrame,
        scholarships1: pd.DataFrame,
        scholarships2: pd.DataFrame,
        outputFolder: str,
    ) -> dict:
        """Build the contingency table and compute the association statistics.

        Args:
            table1: Inscriptions of the first period. Must contain the id and
                offer columns named by ``ColName``.
            table2: Inscriptions of the second period, same columns.
            scholarships1: Scholarship holders of the first period.
                Currently unused: scholarship status is taken from
                ``scholarships2`` only.
            scholarships2: Scholarship holders of the second period. Must
                contain the id column.
            outputFolder: Currently unused; nothing is written to disk.

        Returns:
            A dict with the set sizes (``totalIds``, ``haveScholarship``,
            ``haveNoScholarship``, ``sameOffer``, ``changedOffer``), the table
            cells (``a``, ``b``, ``c``, ``d``) and the statistics
            (``oddsRatio``, ``relativeRisk``, ``standardDeviation``,
            ``confidenceInterval``). When the statistics cannot be computed
            (a cell of the table is zero) they are reported as ``0`` /
            ``(0, 0)``; treat that as "not available", not as an estimate.

        Raises:
            Any error raised by pandas while merging or selecting columns
            (for example ``KeyError`` for a missing id column) propagates to
            the caller instead of being hidden behind the zero fallback.
        """
        idColumn = ColName.ID.value
        offerColumn = ColName.OFFER.value

        # Ids present in both periods. The inner join discards anyone missing
        # from either table (e.g. dropouts), whatever their offer.
        totalIds = set(
            table1.merge(table2, on=[idColumn], how="inner")[idColumn]
        )

        # Scholarship status comes from the second period only; intersecting
        # with totalIds keeps every set consistent with the population above.
        haveScholarship = set(scholarships2[idColumn]).intersection(totalIds)
        haveNoScholarship = totalIds.difference(haveScholarship)

        # An id counts as "same offer" when at least one (id, offer) pair
        # appears in both tables; everybody else changed offer.
        sameOffer = set(
            table1.merge(
                table2, on=[idColumn, offerColumn], how="inner"
            )[idColumn]
        )
        changedOffer = totalIds.difference(sameOffer)

        a = len(haveScholarship.intersection(changedOffer))    # scholarship, changed
        b = len(haveScholarship.intersection(sameOffer))       # scholarship, stayed
        c = len(haveNoScholarship.intersection(changedOffer))  # no scholarship, changed
        d = len(haveNoScholarship.intersection(sameOffer))     # no scholarship, stayed

        # Start from the fallback values so the counts are always reported,
        # even when the statistics below are undefined.
        result = {
            "totalIds": len(totalIds),
            "haveScholarship": len(haveScholarship),
            "haveNoScholarship": len(haveNoScholarship),
            "sameOffer": len(sameOffer),
            "changedOffer": len(changedOffer),
            "a": a,
            "b": b,
            "c": c,
            "d": d,
            "oddsRatio": 0,
            "relativeRisk": 0,
            "standardDeviation": 0,
            "confidenceInterval": (0, 0),
        }

        try:
            oddsRatio = (a * d) / (b * c)
            relativeRisk = (a / (a + b)) / (c / (c + d))

            # This is the SE term of the interval formula below. The
            # "standardDeviation" key name is kept for compatibility.
            standardDeviation = Math.sqrt(1 / a + 1 / b + 1 / c + 1 / d)

            # CI_min = exp(log(OR) - Z * SE), CI_max = exp(log(OR) + Z * SE)
            Z = 1.96  # 95% confidence interval
            logOddsRatio = Math.log(oddsRatio)
            lowerLimit = Math.exp(logOddsRatio - Z * standardDeviation)
            upperLimit = Math.exp(logOddsRatio + Z * standardDeviation)
        except (ArithmeticError, ValueError):
            # Only arithmetic/domain failures (an empty cell in the table) are
            # expected here; anything else is a real bug and must surface.
            print('there was an error')
            return result

        result.update(
            {
                "oddsRatio": oddsRatio,
                "relativeRisk": relativeRisk,
                "standardDeviation": standardDeviation,
                "confidenceInterval": (lowerLimit, upperLimit),
            }
        )
        return result


In [10]:

# Test the OddsRatio class using files from test_files/original_files folder

# Yearly inscription files and Progresar scholarship files. Both lists must be
# in the same chronological order: entry i of each list is used together.
inscFilesList = [
    "student_inscriptions_2018.pickle",
    "student_inscriptions_2019.pickle",
    "student_inscriptions_2020.pickle",
    "student_inscriptions_2021.pickle",
    "student_inscriptions_2022.pickle",
    "student_inscriptions_2023.pickle"
]
progresarSchFilesList = [
    "progresar_2018.pickle",
    "progresar_2019.pickle",
    "progresar_2020.pickle",
    "progresar_2021.pickle",
    "progresar_2022.pickle",
    "progresar_2023.pickle"
]

# Compare every pair of consecutive entries (i, i + 1) and print the results

for i in range(len(progresarSchFilesList) - 1):
    # NOTE(review): unpickling can execute code embedded in the file; only
    # load pickles that come from a trusted source.
    orRes = OddsRatio().transform(
        pd.read_pickle(f"test_files/original_files/{inscFilesList[i]}"),
        pd.read_pickle(f"test_files/original_files/{inscFilesList[i + 1]}"),
        pd.read_pickle(f"test_files/original_files/{progresarSchFilesList[i]}"),
        pd.read_pickle(f"test_files/original_files/{progresarSchFilesList[i + 1]}"),
        "test_files"
    )
    print(f"Results for {inscFilesList[i]} and {inscFilesList[i + 1]}")

    # orRes holds: "totalIds", "haveScholarship", "haveNoScholarship",
    # "sameOffer", "changedOffer", "a", "b", "c", "d", "oddsRatio",
    # "relativeRisk", "standardDeviation", "confidenceInterval".
    #
    # Print the contingency table. Each value is left-aligned and padded to
    # the width of its header (13, 19 and 5 characters) so the columns line
    # up whatever the number of digits:
    # |               | Changed offer | Didn't change offer | Total |
    # | Scholarship   | a             | b                   | a + b |
    # | No scholarship| c             | d                   | c + d |
    # | Total         | a + c         | b + d               |       |

    print("|               | Changed offer | Didn't change offer | Total |")
    print(f"| Scholarship   | {orRes['a']:<13} | {orRes['b']:<19} | {orRes['a'] + orRes['b']:<5} |")
    print(f"| No scholarship| {orRes['c']:<13} | {orRes['d']:<19} | {orRes['c'] + orRes['d']:<5} |")
    print(f"| Total         | {orRes['a'] + orRes['c']:<13} | {orRes['b'] + orRes['d']:<19} |       |")

    # print the odds ratio, relative risk, standard deviation and confidence interval
    print(f"Odds Ratio: {orRes['oddsRatio']}")
    print(f"Relative Risk: {orRes['relativeRisk']}")
    print(f"Standard Deviation: {orRes['standardDeviation']}")
    print(f"Confidence Interval: {orRes['confidenceInterval']}")
    print("\n")


Results for student_inscriptions_2018.pickle and student_inscriptions_2019.pickle
|               | Changed offer | Didn't change offer | Total |
| Scholarship   | 30             | 1250                   | 1280 |
| No scholarship| 154             | 8517                   | 8671 |
| Total         | 184         | 9767               |       |
Odds Ratio: 1.3273246753246752
Relative Risk: 1.3196530032467533
Standard Deviation: 0.20185205488474633
Confidence Interval: (0.8936289510950695, 1.9715014733652303)


Results for student_inscriptions_2019.pickle and student_inscriptions_2020.pickle
|               | Changed offer | Didn't change offer | Total |
| Scholarship   | 42             | 1674                   | 1716 |
| No scholarship| 155             | 8548                   | 8703 |
| Total         | 197         | 10222               |       |
Odds Ratio: 1.3836512891663777
Relative Risk: 1.3742612226483195
Standard Deviation: 0.1759985645039109
Confidence Interval: (0.9799719602918645, 

In [None]:

# Yearly inscription files and Belgrano scholarship files. Both lists must be
# in the same chronological order: entry i of each list is used together.
# NOTE: this rebinds inscFilesList, which an earlier cell also defines.
inscFilesList = [
    "student_inscriptions_2021.pickle",
    "student_inscriptions_2022.pickle",
    "student_inscriptions_2023.pickle"
]

belgranoSchFilesList = [
    "belgrano_2021.pickle",
    "belgrano_2022.pickle",
    "belgrano_2023.pickle"
]


# Compare every pair of consecutive entries (i, i + 1) and print the results

for i in range(len(belgranoSchFilesList) - 1):
    # NOTE(review): unpickling can execute code embedded in the file; only
    # load pickles that come from a trusted source.
    orRes = OddsRatio().transform(
        pd.read_pickle(f"test_files/original_files/{inscFilesList[i]}"),
        pd.read_pickle(f"test_files/original_files/{inscFilesList[i + 1]}"),
        pd.read_pickle(f"test_files/original_files/{belgranoSchFilesList[i]}"),
        pd.read_pickle(f"test_files/original_files/{belgranoSchFilesList[i + 1]}"),
        "test_files"
    )
    print(f"Results for {inscFilesList[i]} and {inscFilesList[i + 1]}")

    # orRes holds: "totalIds", "haveScholarship", "haveNoScholarship",
    # "sameOffer", "changedOffer", "a", "b", "c", "d", "oddsRatio",
    # "relativeRisk", "standardDeviation", "confidenceInterval".
    #
    # Print the contingency table. Each value is left-aligned and padded to
    # the width of its header (13, 19 and 5 characters) so the columns line
    # up whatever the number of digits:
    # |               | Changed offer | Didn't change offer | Total |
    # | Scholarship   | a             | b                   | a + b |
    # | No scholarship| c             | d                   | c + d |
    # | Total         | a + c         | b + d               |       |

    print("|               | Changed offer | Didn't change offer | Total |")
    print(f"| Scholarship   | {orRes['a']:<13} | {orRes['b']:<19} | {orRes['a'] + orRes['b']:<5} |")
    print(f"| No scholarship| {orRes['c']:<13} | {orRes['d']:<19} | {orRes['c'] + orRes['d']:<5} |")
    print(f"| Total         | {orRes['a'] + orRes['c']:<13} | {orRes['b'] + orRes['d']:<19} |       |")

    # print the odds ratio, relative risk, standard deviation and confidence interval
    print(f"Odds Ratio: {orRes['oddsRatio']}")
    print(f"Relative Risk: {orRes['relativeRisk']}")
    print(f"Standard Deviation: {orRes['standardDeviation']}")
    print(f"Confidence Interval: {orRes['confidenceInterval']}")
    print("\n")

Results for student_inscriptions_2021.pickle and student_inscriptions_2022.pickle
|               | Changed offer | Didn't change offer | Total |
| Scholarship   | 9             | 455                   | 464 |
| No scholarship| 212             | 11386                   | 11598 |
| Total         | 221         | 11841               |       |
Odds Ratio: 1.0623470868753888
Relative Risk: 1.061137768379961
Standard Deviation: 0.3436767690681834
Confidence Interval: (0.5416551286054585, 2.083579150996433)


Results for student_inscriptions_2022.pickle and student_inscriptions_2023.pickle
|               | Changed offer | Didn't change offer | Total |
| Scholarship   | 3             | 451                   | 454 |
| No scholarship| 178             | 11775                   | 11953 |
| Total         | 181         | 12226               |       |
Odds Ratio: 0.4400333839906326
Relative Risk: 0.4437336039202099
Standard Deviation: 0.5841690949131648
Confidence Interval: (0.1400331486579253, 1.38