### Read data

In [1]:
# Load the drug reference table: a ";"-delimited CSV whose first row is the header;
# column types are inferred by Spark rather than declared.
IRPHAR = (
    spark.read
    .option("header", True)
    .option("delimiter", ";")
    .option("inferSchema", True)
    .csv("D:/firas/fallscripts/new/ir_pha_r.csv")
)

# Keep only the three columns the rest of this notebook reads:
# the ATC code, the CIP13 product code and the product/substance name.
reduced_IRPHAR = IRPHAR.select(
    "PHA_ATC_C07",
    "PHA_CIP_C13",
    "PHA_NOM_PA",
)
reduced_IRPHAR.show(5)

+-----------+-------------+------------+
|PHA_ATC_C07|  PHA_CIP_C13|  PHA_NOM_PA|
+-----------+-------------+------------+
|    N05CF01|3400935482471|   ZOPICLONE|
|    N05CF02|3400934658570|    ZOLPIDEM|
|    N05CF02|3400936450516|    ZOLPIDEM|
|    N05CF01|3400935877734|   ZOPICLONE|
|    N05CD06|3400934966743|LORMETAZEPAM|
+-----------+-------------+------------+
only showing top 5 rows



### Define drug classes (ATC prefixes and exceptions per pharmaceutical family)

In [2]:
# One row per pharmaceutical family: (family name, ATC prefixes, excluded ATC codes).
#
# * family name  - "<therapeutic class>_<sub-family>"; the text before the first "_"
#                  is later extracted as the therapeutic class.
# * ATC prefixes - a drug belongs to the family when its ATC code starts with any of these.
# * exclusions   - ATC codes removed from the family; get_family compares them with
#                  exact equality, not as prefixes.
#
# Keeping the three pieces of information on the same row guarantees that the derived
# lists below (names, ATCCodes, ATCExceptions) always stay aligned index by index.
_DRUG_FAMILIES = [
    # --- Antidepresseurs ---
    ("Antidepresseurs_Tricycliques", ["N06AA"], ["N06AA06"]),
    ("Antidepresseurs_ISRS", ["N06AB"], []),
    ("Antidepresseurs_ISRSN", ["N06AX11", "N06AX16", "N06AX17", "N06AX21", "N06AX26"], []),
    ("Antidepresseurs_IMAO_AB", ["N06AF"], []),
    ("Antidepresseurs_IMAO_A", ["N06AG"], []),
    ("Antidepresseurs_autres", ["N06AX03", "N06AX09", "N06AX14", "N06AX22", "N06AA06"], []),
    # --- Antihypertenseurs ---
    ("Antihypertenseurs_SARTANS", ["C09C", "C09D"], []),
    ("Antihypertenseurs_IEC", ["C09A", "C09B"], []),
    ("Antihypertenseurs_Diuretiques", ["C03"], []),
    ("Antihypertenseurs_Betabloquants", ["C07"], []),
    ("Antihypertenseurs_Inhibiteurs_calciques", ["C08"], []),
    ("Antihypertenseurs_Autres", ["C02", "C09XA", "C10BX03"], []),
    # --- Hypnotiques ---
    ("Hypnotiques_Benzodiazepine_anxiolytique", ["N05BA"], []),
    ("Hypnotiques_Autre_anxiolytique", ["N05BB", "N05BC", "N05BE", "N05BX"], ["N05BC51"]),
    ("Hypnotiques_Benzodiazepine_hypnotique", ["N05CD"], ["N05CD08"]),
    ("Hypnotiques_Autre_hypnotique", ["N05CF", "N05BC51", "N05CM11", "N05CM16", "N05CX"], []),
    # --- Neuroleptiques ---
    (
        "Neuroleptiques_Neuroleptiques_Atypiques",
        ["N05A"],
        ["N05AL06", "N05AN01", "N05AA", "N05AH02", "N05AH03", "N05AL05", "N05AX08", "N05AX12", "N05AA07"],
    ),
    (
        "Neuroleptiques_Autre_neuroleptique",
        ["N05AA", "N05AH02", "N05AH03", "N05AL05", "N05AX08", "N05AX12"],
        [],
    ),
]

# Parallel lists consumed by get_family / udf_family (same names, types and order as before).
names, ATCCodes, ATCExceptions = (list(column) for column in zip(*_DRUG_FAMILIES))
    


###  Add utilities

In [3]:
def get_family(ATCCodes, ATCExceptions, names, atc):
    """Return the pharmaceutical family label(s) matching an ATC code.

    The family at position ``i`` is selected when ``str(atc)`` starts with at
    least one prefix of ``ATCCodes[i]`` and ``atc`` is not listed in
    ``ATCExceptions[i]``.

    Parameters
    ----------
    ATCCodes : list of list of str
        For each family, the ATC prefixes that belong to it.
    ATCExceptions : list of list of str
        For each family, the ATC codes excluded from it. Exclusions are
        compared with exact equality, not as prefixes: an exclusion
        ``"N05AA"`` does not exclude ``"N05AA01"``.
    names : list of str
        For each family, its label.
    atc : str or None
        ATC code of the drug. It is converted with ``str`` for prefix matching,
        so a missing value (``None``) matches no family.

    Returns
    -------
    str
        Every selected label, each preceded by ``"_"``, concatenated in list
        order (e.g. ``"_A_B"`` when two families match); ``""`` when no
        family matches.
    """
    code = str(atc)
    family = ""
    for index, prefixes in enumerate(ATCCodes):
        # Skip families none of whose prefixes matches the code.
        if not any(code.startswith(prefix) for prefix in prefixes):
            continue
        # Skip families that explicitly exclude this exact code.
        if atc in ATCExceptions[index]:
            continue
        family = family + "_" + names[index]
    return family

def udf_family(ATCCodes, ATCExceptions, names):
    """Build a one-argument Spark UDF that maps an ATC code to its family label(s).

    The three classification tables are captured by the returned UDF, so the
    caller only has to pass the column holding the ATC code. The labelling
    itself is delegated to ``get_family``.
    """
    classify = lambda atc_code: get_family(ATCCodes, ATCExceptions, names, atc_code)
    return udf(classify)
    

from pyspark.sql.functions import udf, StringType
# NOTE(review): get_family takes four parameters (ATCCodes, ATCExceptions, names, atc),
# so applying this UDF to a single column would fail with a missing-arguments error.
# The visible cells never reference family_udf; the transform cell uses
# udf_family(ATCCodes, ATCExceptions, names) instead. Candidate for removal - confirm
# that no cell outside this view relies on it.
# NOTE(review): StringType is documented under pyspark.sql.types; the import above takes
# it from pyspark.sql.functions - TODO confirm this works on the target PySpark version.
family_udf = udf(get_family, StringType())


###  Transform data

In [4]:
# Tag every drug with the family label(s) derived from its ATC code.
# Drugs whose code matches no family get an empty string.
data = reduced_IRPHAR.withColumn(
    "pharmaceutic_family",
    udf_family(ATCCodes, ATCExceptions, names)(reduced_IRPHAR["PHA_ATC_C07"]),
)

In [19]:
from pyspark.sql.functions import col, explode, split, trim

molecules = (
    # Keep only the drugs that matched at least one pharmaceutical family.
    data.filter(data.pharmaceutic_family != "")
    # PHA_NOM_PA may hold several names separated by " ET " or "+" (presumably
    # combination products - TODO confirm); split them and emit one row per name.
    # The pattern is a raw string because "\+" is not a valid Python escape sequence.
    .withColumn("molecule", split(col("PHA_NOM_PA"), r" ET |\+"))
    .withColumn("molecule", explode("molecule"))
    .withColumn("molecule", trim(col("molecule")))
    # get_family prefixes every label with "_"; substr is 1-based, so starting at
    # position 2 drops that leading underscore (and keeps at most 100 characters).
    .withColumn("pharmaceutic_family", col("pharmaceutic_family").substr(2, 100))
    # One row per (family, molecule) pair. Which product code (PHA_CIP_C13) survives
    # for a given pair is whichever row Spark happens to keep.
    .drop_duplicates(["pharmaceutic_family", "molecule"])
    # The therapeutic class is the text before the first "_" of the family label,
    # e.g. "Antidepresseurs" for "Antidepresseurs_Tricycliques".
    .withColumn("therapeutic", split(col("pharmaceutic_family"), "_")[0])
)

In [20]:
# Final layout of the classification table: product code, ATC code, therapeutic class,
# pharmaceutical family and molecule name.
drugs_classified = molecules.select(
    "PHA_CIP_C13",
    "PHA_ATC_C07",
    "therapeutic",
    "pharmaceutic_family",
    "molecule",
)
drugs_classified.show(20)

+-------------+-----------+-----------------+--------------------+-------------------+
|  PHA_CIP_C13|PHA_ATC_C07|      therapeutic| pharmaceutic_family|           molecule|
+-------------+-----------+-----------------+--------------------+-------------------+
|3400930573198|    N06AA09|  Antidepresseurs|Antidepresseurs_T...|      AMITRIPTYLINE|
|3400939709864|    C09BA06|Antihypertenseurs|Antihypertenseurs...|          QUINAPRIL|
|3400931300458|    N05AD05|   Neuroleptiques|Neuroleptiques_Ne...|        PIPAMPERONE|
|3400939433059|    N06AX22|  Antidepresseurs|Antidepresseurs_a...|        AGOMELATINE|
|3400931037460|    C02LA01|Antihypertenseurs|Antihypertenseurs...|BENDROFLUMETHIAZIDE|
|3400932606702|    N05BA21|      Hypnotiques|Hypnotiques_Benzo...|        CLOTIAZEPAM|
|3400930777992|    N05CX01|      Hypnotiques|Hypnotiques_Autre...|        MEPROBAMATE|
|3400931507284|      N05CX|      Hypnotiques|Hypnotiques_Autre...|     ACEPROMETAZINE|
|3400932531097|    N05BA01|      Hypnotique

In [21]:
# Number of rows in the classified table, i.e. distinct (pharmaceutic_family, molecule)
# pairs left after drop_duplicates.
drugs_classified.count()

192

### Write data

In [22]:
# Export the classification as a ";"-delimited CSV with a header row.
# coalesce(1) collapses the frame to a single partition so that the output directory
# contains a single part file.
# NOTE(review): no save mode is set, so Spark's default applies - presumably the write
# fails when the target directory already exists; confirm before re-running.
(
    drugs_classified
    .coalesce(1)
    .write
    .option("delimiter", ";")
    .option("header", True)
    .csv("D:/firas/fallscripts/new/drugs_classification5")
)