In [1]:
import pandas as pd
# Allow up to 50 columns when a DataFrame is displayed (the tables handled
# in this notebook are wide).
pd.set_option("display.max_columns", 50)

In [2]:
class Source(object):
    """Wraps one external table whose values can complete ``IR_PHA_R``."""

    def __init__(self, source_df, columns_mapping,
                 join_keys=("PHA_CIP_C13",)):
        """A class that wraps the new source data.

        Parameters
        ----------
            source_df : DataFrame
                Extra data that will be added to the IR_PHA_R table. It is
                not modified; a renamed copy is stored in ``self.df``.
            columns_mapping : dict
                Maps the columns of source_df to their corresponding
                columns in IR_PHA_R. The mapped names (in insertion order)
                become ``self.shared_columns``.
            join_keys : str or iterable of str
                IR_PHA_R column name(s) used to join to the IRPHAR table.
                Always stored as a new list in ``self.join_keys``.
        """
        self.columns_mapping = columns_mapping
        self.df = source_df.rename(columns=self.columns_mapping).copy()
        # Always store a fresh list: instances must not share (and be able to
        # corrupt) a common default, and a bare string must not be treated
        # as a sequence of one-character column names downstream.
        if isinstance(join_keys, str):
            self.join_keys = [join_keys]
        else:
            self.join_keys = list(join_keys)
        self.shared_columns = list(self.columns_mapping.values())

        
class IRPHAR:
    """Holder for the ``IR_PHA_R`` value table."""

    def __init__(self, irphar_df):
        """Keep a reference to the given table.

        Parameters
        ----------
            irphar_df: DataFrame
                The IR_PHA_R table. It is stored as is (no copy) and
                exposed through the ``df`` attribute.
        """
        self.df = irphar_df

        
class Consolidator(object):
    """Fills the missing values of the IR_PHA_R table from external sources."""

    def consolidate(self, irphar: "IRPHAR", source: "Source"):
        """Adds the missing information from the source to the IRPHAR.

        Parameters
        ----------
            irphar: IRPHAR
                Holds the IRPHAR data.
            source: Source
                Holds extra information to be added.

        Returns
        -------
        IRPHAR
            A new IRPHAR with added data. Its columns are in the same order
            as the columns of ``irphar.df``.
        """
        partition1, partition2 = self._partition_irphar(irphar, source)

        consolidated_irphar_df = self.add_information(partition1, source)

        result = partition2.join(consolidated_irphar_df).reset_index()
        # The join puts the untouched columns first; restore the original
        # column order so that consolidating never reshuffles the table.
        return IRPHAR(result[list(irphar.df.columns)])

    @staticmethod
    def _join_keys(source):
        """Returns the join keys of ``source`` as a new list of column names."""
        keys = source.join_keys
        return [keys] if isinstance(keys, str) else list(keys)

    def _partition_irphar(self, irphar, source):
        """Partitions the IRPHAR dataframe into two dataframes, column wise.

        The first partition contains the IRPHAR columns shared with the new
        source. The second partition contains the remaining columns, so that
        the original IRPHAR can be rebuilt, with the added information, by
        joining the two partitions on their index.

        Parameters
        ----------
            irphar: IRPHAR
                Holds the IRPHAR data.
            source: Source
                Holds extra information to be added.

        Returns
        -------
        tuple of DataFrame
            ``(partition1, partition2)``, both indexed by the join keys.
            The columns of each partition keep the order they have in
            ``source.shared_columns`` and ``irphar.df`` respectively.
        """
        key = self._join_keys(source)

        # Columns where new information will be added
        columns_to_keep = list(source.shared_columns)
        # Partition 1 contains parts that will be consolidated
        partition1 = irphar.df[columns_to_keep].set_index(key)

        # Partition 2 holds the remaining information. Walk the original
        # columns (instead of using a set difference) so that the column
        # order is the same on every run.
        shared = set(columns_to_keep)
        partition2_columns = [column for column in irphar.df.columns
                              if column not in shared]
        partition2 = irphar.df[partition2_columns + key].set_index(key)

        return (partition1, partition2)

    def add_information(self, partition1, source):
        """Adds the missing information in partition1 from source.

        Parameters
        ----------
            partition1: DataFrame
                IRPHAR columns shared with the source, indexed by the join
                keys; may contain missing values.
            source: Source
                External information. Its keys must be unique and must all
                exist in ``partition1``.

        Returns
        -------
            A DataFrame with the added information. This DataFrame has the
            exact same shape as partition1; rows covered by the source come
            first (sorted by key), followed by the other rows.

        Raises
        ------
        AssertionError
            If the source keys do not match the IRPHAR rows one to one, or
            if the columns of the two tables differ.
        """
        key = self._join_keys(source)
        source_values = source.df[source.shared_columns].set_index(key).sort_index()

        # A boolean mask works on every pandas version (set indexers were
        # removed from ``.loc``) and never raises on unknown keys.
        in_source = partition1.index.isin(source_values.index)
        covered = partition1[in_source].sort_index()
        uncovered = partition1[~in_source]

        nb_missing_values = covered.isnull().sum().sum()
        print("Number of missing values in IRPHARA table that are",
              "available in the new source",
              "table are {}".format(nb_missing_values))

        # Ensure that they have the exact same format. The length is checked
        # first because comparing indexes of different lengths raises.
        assert (len(covered) == len(source_values)
                and (covered.index == source_values.index).all()), (
            "source keys must be unique and all present in IRPHAR")
        assert list(covered.columns) == list(source_values.columns), (
            "source and IRPHAR must share the same columns")

        # Only missing cells are filled; existing IRPHAR values are kept.
        filled = covered.fillna(source_values)
        nb_added_values = nb_missing_values - filled.isnull().sum().sum()
        print("{} values have been added to IRPHAR".format(nb_added_values))
        consolidated = pd.concat([filled, uncovered])

        assert len(partition1) == len(consolidated)
        return consolidated


# IRPHAR

In [6]:
# Read the original IR_PHA_R table (';'-separated, latin-1 encoded) and wrap
# it so that it can be handed to the consolidator.
ir_pha_r = pd.read_csv(
    "../src/main/resources/IR_PHA_R/IR_PHA_R_original.csv",
    sep=";",
    encoding="latin-1",
)
irphar = IRPHAR(ir_pha_r)

# Consolidator 

In [7]:
# Consolidator defines no instance attributes, so this one instance can be
# shared by all the source cells that follow.
consolidator = Consolidator()

# HTA data

In [8]:
# Load the spreadsheet used as the HTA source.
hta_table = pd.read_excel("../src/main/resources/IR_PHA_R/Sources/med_cardio_01aou17.xls")

# Spreadsheet column -> IR_PHA_R column. Source renames the spreadsheet with
# this mapping and uses the mapped names as the columns shared with IR_PHA_R,
# so the mapping has to include the join key (PHA_CIP_C13 by default).
columns_mapping = {
    "code_eph": "PHA_EPH_COD",
    "CODATC2": "PHA_ATC_C03",
    "classe_eph": "PHA_EPH_LIB_DSES",
    "cip13": "PHA_CIP_C13",
    "nom_court": "PHA_MED_NOM",
    "code_atc": "PHA_ATC_C07",
    "LIBATC2": "PHA_ATC_L03",
    "top_hta": "PHA_HTA_TOP",
    "debut_rembt": "PHA_INS_DTD",
    "CIP7": "PHA_PRS_IDE",
    "top_gdcond": "PHA_CND_TOP"
}

# Fill the missing IR_PHA_R values from this source. `irphar` is rebound to
# the consolidated table, so the following sources build on this result.
hta_source = Source(hta_table, columns_mapping)
irphar = consolidator.consolidate(irphar, hta_source)

Number of missing values in IRPHARA table that are available in the new source table are 238
238 values have been added to IRPHAR


# Antidépresseurs

In [9]:
# Load the antidepressants spreadsheet.
antidepresseurs = pd.read_excel("../src/main/resources/IR_PHA_R/Sources/antidepresseurs_10jan17.xls")

# Spreadsheet column -> IR_PHA_R column. Only the mapped columns can be
# completed from this source; the mapping has to include the join key
# (PHA_CIP_C13 by default).
columns_mapping = {
    "cip13": "PHA_CIP_C13",
    "nom_court": "PHA_MED_NOM",
    "code_eph": "PHA_EPH_COD",
    "classe_eph": "PHA_EPH_LIB_DSES",
    "code_atc": "PHA_ATC_C07",
    "classe_atc": "PHA_ATC_L07",
    "CODATC2": "PHA_ATC_C03",
    "LIBATC2": "PHA_ATC_L03",
    "CIP7": "PHA_PRS_IDE",
}

# Fill the missing IR_PHA_R values from this source; `irphar` is rebound to
# the consolidated table.
antidepresseur = Source(antidepresseurs, columns_mapping)
irphar = consolidator.consolidate(irphar, antidepresseur)

Number of missing values in IRPHARA table that are available in the new source table are 0
0 values have been added to IRPHAR


#  Anxiolytiques

In [10]:
# Load the anxiolytics / hypnotics spreadsheet.
anxiolytiques = pd.read_excel("../src/main/resources/IR_PHA_R/Sources/anxiohypno_10jan17.xls")

# Spreadsheet column -> IR_PHA_R column. This mapping has no PHA_ATC_C03 /
# PHA_ATC_L03 entries, so those two columns are not completed from this
# source. The mapping has to include the join key (PHA_CIP_C13 by default).
columns_mapping = {
    "cip13": "PHA_CIP_C13",
    "nom_court": "PHA_MED_NOM",
    "code_eph": "PHA_EPH_COD",
    "classe_eph": "PHA_EPH_LIB_DSES",
    "code_atc": "PHA_ATC_C07",
    "classe_atc": "PHA_ATC_L07",
    "CIP7": "PHA_PRS_IDE",
}

anxio = Source(anxiolytiques, columns_mapping)

# NOTE(review): a Consolidator already exists from an earlier cell and the
# class defines no instance attributes, so re-creating it here looks
# redundant.
consolidator = Consolidator()
# Fill the missing IR_PHA_R values from this source; `irphar` is rebound to
# the consolidated table.
irphar = consolidator.consolidate(irphar, anxio)

Number of missing values in IRPHARA table that are available in the new source table are 0
0 values have been added to IRPHAR


# Neuroleptiques

In [11]:
# Load the neuroleptics spreadsheet.
neuroleptiques = pd.read_excel("../src/main/resources/IR_PHA_R/Sources/neuroleptiques_10jan17.xls")

# Spreadsheet column -> IR_PHA_R column. Only the mapped columns can be
# completed from this source; the mapping has to include the join key
# (PHA_CIP_C13 by default).
columns_mapping = {
    "cip13": "PHA_CIP_C13",
    "nom_court": "PHA_MED_NOM",
    "code_eph": "PHA_EPH_COD",
    "classe_eph": "PHA_EPH_LIB_DSES",
    "code_atc": "PHA_ATC_C07",
    "classe_atc": "PHA_ATC_L07",
    "CODATC2": "PHA_ATC_C03",
    "LIBATC2": "PHA_ATC_L03",
    "CIP7": "PHA_PRS_IDE",
}

# Fill the missing IR_PHA_R values from this source; `irphar` is rebound to
# the consolidated table.
neuro = Source(neuroleptiques, columns_mapping)
irphar = consolidator.consolidate(irphar, neuro)

Number of missing values in IRPHARA table that are available in the new source table are 0
0 values have been added to IRPHAR


# HTA II 

In [9]:
# Load the second HTA spreadsheet.
hta2 = pd.read_excel("../src/main/resources/IR_PHA_R/Sources/meedicaments_hta_10jan17.xls")

# Spreadsheet column -> IR_PHA_R column. Same entries as the mapping of the
# first HTA source, except that "debut_rembt" (PHA_INS_DTD) is not mapped
# here. The mapping has to include the join key (PHA_CIP_C13 by default).
columns_mapping = {
    "code_eph": "PHA_EPH_COD",
    "CODATC2": "PHA_ATC_C03",
    "classe_eph": "PHA_EPH_LIB_DSES",
    "cip13": "PHA_CIP_C13",
    "nom_court": "PHA_MED_NOM",
    "code_atc": "PHA_ATC_C07",
    "LIBATC2": "PHA_ATC_L03",
    "top_hta": "PHA_HTA_TOP",
    "CIP7": "PHA_PRS_IDE",
    "top_gdcond": "PHA_CND_TOP"
}

# Fill the missing IR_PHA_R values from this source; `irphar` is rebound to
# the consolidated table.
hta_source2 = Source(hta2, columns_mapping)
irphar = consolidator.consolidate(irphar, hta_source2)

Number of missing values in IRPHARA table that are available in the new source table are 0
0 values have been added to IRPHAR


# Store new IRPHAR 

In [12]:
# Store the consolidated table. index=False keeps the DataFrame's row numbers
# out of the file: without it, to_csv writes them as an extra unnamed first
# column that the table loaded at the start of the notebook does not have.
irphar.df.to_csv(
    "../src/main/resources/IR_PHA_R/IR_PHA_R.csv",
    encoding="utf-8",
    sep=";",
    index=False,
)