In [1]:
import pandas as pd
import numpy as np
import matplotlib

# IR_PHA_R

The goal of this notebook is to explore the IR_PHA_R value table.
In particular, we are interested in 4 classes of drugs :
- Antidépresseurs
- Neuroleptiques
- Anxiolytiques
- HTA

For each class, we want to answer the following questions:

- Do we have all the drugs corresponding to the provided description ?
- What are the missing elements and why ?

Also we need to check the consistency between a reference csv version of this value table and the parquet file for each project.

## Reading data

- read data from IR_PHA_R file (updated August 2017)
- read data files for each drug family provided by DEPP department at CNAM

In [2]:
# Load the IR_PHA_R value table from its CSV export: semicolon-delimited,
# first row used as header, column types inferred by Spark.
IRPHAR = (
    spark.read
    .option("header", True)
    .option("delimiter", ";")
    .option("inferSchema", True)
    .csv("D:/firas/fallscripts/ir_pha_r.csv")
)

In [3]:
# Reference drug lists (one semicolon-delimited CSV with a header row per file).
# The antidepressant and HTA lists are read without schema inference, so Spark
# loads every column of those two files as a string.
AntidpresseursCnam = (
    spark.read
    .option("header", "true")
    .option("delimiter", ";")
    .csv("D:/firas/fallscripts/csv/antidepresseurs_10jan17.csv")
)
AnxiolyiqueCnam = (
    spark.read
    .option("header", True)
    .option("delimiter", ";")
    .option("inferSchema", True)
    .csv("D:/firas/fallscripts/csv/anxiohypno.csv")
)
NeuroleptiquesCnam = (
    spark.read
    .option("header", True)
    .option("delimiter", ";")
    .option("inferSchema", True)
    .csv("D:/firas/fallscripts/csv/neuroleptiques.csv")
)
HTACnam = (
    spark.read
    .option("header", "true")
    .option("delimiter", ";")
    .csv("D:/firas/fallscripts/csv/hta.csv")
)
# The option name must be spelled "inferSchema": an unknown option key such as
# "inferScema" is silently ignored by the reader, so no schema inference
# would take place for this file.
CardioDepp = (
    spark.read
    .option("header", "true")
    .option("delimiter", ";")
    .option("inferSchema", True)
    .csv("D:/firas/fallscripts/csv/med_cardio_01aou17.csv")
)

## Families definitions

In [45]:
# Parallel lists describing each drug family: entry k of every list below
# applies to families[k].
# NOTE: "HTA" has no entry in the other lists here; its ATC definition is
# declared separately in the HTA section of this notebook.
families = ["Antidepresseurs", "Neuroleptiques", "Anxiolyique", "HTA"]

# ATC codes / ATC code prefixes that define each family
# (the duplicated "N05CM11" entry has been removed).
definitions = [
    ["N06A"],
    ["N05A"],
    ["N05CD", "N05CF", "N05CX", "N05B", "N05CM11", "N05CM16"],
]

# 5-character ATC prefixes to exclude from each family.
ATCStartexceptions = [[], [], []]

# Drug identifiers (matched against PHA_PRS_IDE) to exclude from each family.
CIPexceptions = [["3333802"], ["3289633"], []]

# Full ATC codes (matched against PHA_ATC_C07) to exclude from each family.
ATCexceptions = [[], ["N05AL06", "N05AN01"], ["N05CD08"]]

# Alternative prefix list kept for reference (not used):
# ["N05CDF", "N05CB", "N05CC", "N05CA", "N05CH", "N05CE", "N05CX", "N05CM"]

## Antidépresseurs

### Definition :

- The "antidepresseur" family corresponds to the class ATC3 N06A
- There are some exceptions, like Levotonine, whose CIP code is 3333802
- We can find this family by filtering on the PHA_ATC_C07 column

## Comparing Antidepresseurs family

In [49]:
from pyspark.sql.functions import udf
from pyspark.sql.functions import isnan, count, when, col
from pyspark.sql.types import BooleanType

# UDF returning True when at least one code in `ls` starts with `pattern`.
# Each code is tested individually: `any(ls)` alone returns a bool, and calling
# `.startswith` on a bool raises AttributeError.
test_udf = udf(lambda pattern, ls: any(code.startswith(pattern) for code in ls), BooleanType())

i = 0  # position of the family in families / definitions / *exceptions

# Keep drugs whose ATC code starts with one of the family prefixes, then drop
# the excluded ATC codes, drug identifiers and ATC prefixes.
# Column.substr is 1-based; Spark treats a start position of 0 like 1, so
# substr(1, n) selects the same first n characters as substr(0, n).
atc_code = IRPHAR.PHA_ATC_C07
AntidepresseursTable = IRPHAR.filter(
    atc_code.substr(1, 4).isin(definitions[i])
    & ~atc_code.isin(ATCexceptions[i])
    & ~IRPHAR.PHA_PRS_IDE.isin(CIPexceptions[i])
    & ~atc_code.substr(1, 5).isin(ATCStartexceptions[i])
)

print("Statistics for", families[i], "extracted from the latest IR_PHA_R ")
print("number of lines :", AntidepresseursTable.count())

# (label, column) pairs: print the number of distinct values of each column.
distinct_count_labels = [
    ("Number of different class codes", "PHA_ATC_C03"),
    ("Number of different class labels", "PHA_ATC_L03"),
    ("Number of different sub classes labels ", "PHA_ATC_L07"),
    ("Number of different CIP13  labels ", "PHA_CIP_C13"),
    ("Number of different group names", "PHA_GRS_NOM"),
    ("Number of different labels for medical-economic class", "PHA_EPH_LIB_DSES"),
    ("Number of different labels for the commercial drugs", "PHA_MED_COM"),
]
for label, column_name in distinct_count_labels:
    print(label, AntidepresseursTable.select(column_name).distinct().count())

# Distinct pharmaceutical forms and dosage units found in the family.
AntidepresseursTable.select("PHA_FRM_LIB").distinct().show()
AntidepresseursTable.select("PHA_DOS_UNT_DSES").distinct().show()


Statistics for Antidepresseurs extracted from the latest IR_PHA_R 
number of lines : 544
Number of different class codes 1
Number of different class labels 1
Number of different sub classes labels  30
Number of different CIP13  labels  544
Number of different group names 33
Number of different labels for medical-economic class 4
Number of different labels for the commercial drugs 233
+-----------+
|PHA_FRM_LIB|
+-----------+
|     GELULE|
| SUSPENSION|
|       null|
|   COMPRIME|
|   SOLUTION|
+-----------+

+----------------+
|PHA_DOS_UNT_DSES|
+----------------+
|        MG/10 ML|
|           MG/ML|
|         MG/5 ML|
|        G/100 ML|
|         MG/2 ML|
|              MG|
|         MG/1 ML|
+----------------+



In [7]:
print("statistics from the CNAM Antidepresseurs file extracted by Jeremy")
print("number of lines : ", AntidpresseursCnam.count())
print("Distinct values for antidepresseurs extracted by cnam:")
# The loop variable is not called `col` so that it does not overwrite the
# `col` function imported from pyspark.sql.functions in other cells.
for column_name in AntidpresseursCnam.columns:
    print(column_name, AntidpresseursCnam.select(column_name).distinct().count())

statistics from the CNAM Antidepresseurs file extracted by Jeremy
number of lines :  543
Distinct values for antidepresseurs extracted by cnam:
cip13 543
CIP7 543
nom_court 516
code_eph 3
classe_eph 3
code_atc 30
classe_atc 30
CODATC2 1
LIBATC2 1
groupe 6


In [8]:
# Number of distinct CIP13 codes on each side of the comparison.
irphar_drug_count = AntidepresseursTable.select("PHA_CIP_C13").distinct().count()
cnam_drug_count = AntidpresseursCnam.select("cip13").distinct().count()
print(
    "The number of drugs in : \n IR_PHA_R =", irphar_drug_count,
    "\n Cnam data (Jeremy) =", cnam_drug_count,
)

The number of drugs in : 
 IR_PHA_R = 544 
 Cnam data (Jeremy) = 543


In [9]:
# CIP13 codes present on each side.
cipIR_PHA_R = AntidepresseursTable.select("PHA_CIP_C13")
cnamCip = AntidpresseursCnam.select("cip13")

# Show both set differences: codes only in IR_PHA_R, then codes only in the
# CNAM file.
for title, left, right in (
    ("IR_PHA_R - Jeremy", cipIR_PHA_R, cnamCip),
    ("Jeremy - IR_PHA_R", cnamCip, cipIR_PHA_R),
):
    print(title)
    left.subtract(right).show()


IR_PHA_R - Jeremy
+-------------+
|  PHA_CIP_C13|
+-------------+
|3400930078921|
+-------------+

Jeremy - IR_PHA_R
+-----+
|cip13|
+-----+
+-----+



In [12]:
print("The drug missing for Antidepresseurs in Jeremy data : ")
# Fetch (at most two) IR_PHA_R rows carrying this CIP13 code and render them
# as a pandas table.
missing_drug_rows = AntidepresseursTable.filter(
    AntidepresseursTable.PHA_CIP_C13 == 3400930078921
).head(2)
pd.DataFrame(data=missing_drug_rows, columns=AntidepresseursTable.columns)

The drug missing for Antidepresseurs in Jeremy data : 


Unnamed: 0,PHA_AGE_DTD,PHA_AGE_MAX,PHA_AGE_MIN,PHA_AST_TOP,PHA_ATC_C03,PHA_ATC_C07,PHA_ATC_L03,PHA_ATC_L07,PHA_CAR_TOP,PHA_CIP_C13,...,PHA_PRE_IND,PHA_PRI_UND,PHA_PRI_UNI,PHA_PRS_IDE,PHA_RGE_C07,PHA_RGE_C13,PHA_SEX_DTD,PHA_TAR_DAT,PHA_TAU_COD,PHA_UNT_NBR_DSES
0,,,,0,N06,N06AX16,PSYCHOANALEPTIQUES,VENLAFAXINE,0,3400930078921,...,N,19.0,560.0,3007892,,,,20170413,65,30


> ### Conclusion
> This makes sense: the drug is missing from Jeremy's data because the file is dated 10/01/17 and the drug was commercialized after that date.

## Neuroleptiques family

### Definition :
- The "Neuroleptique" family corresponds to the class ATC3 N05A
- We can find this family by filtering on the PHA_ATC_C07 column
- We should remove atypical neuroleptics with class N05AL06 or N05AN01
- We should also remove CHLORPROETHAZINE, whose CIP code is 3289633

In [48]:
from pyspark.sql.functions import udf
from pyspark.sql.functions import isnan, count, when, col
from pyspark.sql.types import BooleanType

# UDF returning True when at least one code in `ls` starts with `pattern`.
# Each code is tested individually: `any(ls)` alone returns a bool, and calling
# `.startswith` on a bool raises AttributeError.
test_udf = udf(lambda pattern, ls: any(code.startswith(pattern) for code in ls), BooleanType())

i = 1  # position of the family in families / definitions / *exceptions

# Keep drugs whose ATC code starts with one of the family prefixes, then drop
# the excluded ATC codes, drug identifiers and ATC prefixes.
# Column.substr is 1-based; Spark treats a start position of 0 like 1, so
# substr(1, n) selects the same first n characters as substr(0, n).
atc_code = IRPHAR.PHA_ATC_C07
dataTable = IRPHAR.filter(
    atc_code.substr(1, 4).isin(definitions[i])
    & ~atc_code.isin(ATCexceptions[i])
    & ~IRPHAR.PHA_PRS_IDE.isin(CIPexceptions[i])
    & ~atc_code.substr(1, 5).isin(ATCStartexceptions[i])
)

print("Statistics for", families[i], "extracted from the latest IR_PHA_R")
print("number of lines :", dataTable.count())

# (label, column) pairs: print the number of distinct values of each column.
distinct_count_labels = [
    ("Number of different class codes", "PHA_ATC_C03"),
    ("Number of different class labels", "PHA_ATC_L03"),
    ("Number of different sub classes labels ", "PHA_ATC_L07"),
    ("Number of different CIP13  labels ", "PHA_CIP_C13"),
    ("Number of different labels for medical-economic class", "PHA_EPH_LIB_DSES"),
    ("Number of different labels for the commercial drugs", "PHA_MED_COM"),
]
for label, column_name in distinct_count_labels:
    print(label, dataTable.select(column_name).distinct().count())

# Distinct pharmaceutical forms and dosage units found in the family.
dataTable.select("PHA_FRM_LIB").distinct().show()
dataTable.select("PHA_DOS_UNT_DSES").distinct().show()

print("Jeremy data for Neurolpetiques family")
print("number of lines : ", NeuroleptiquesCnam.count())
print("Distinct values :")
# The loop variable is not called `col`, which would overwrite the `col`
# function imported at the top of this cell.
for column_name in NeuroleptiquesCnam.columns:
    print(column_name, NeuroleptiquesCnam.select(column_name).distinct().count())
print("\n")

# Compare the number of distinct CIP13 codes on both sides.
drugCount1 = dataTable.select("PHA_CIP_C13").distinct().count()
drugCount2 = NeuroleptiquesCnam.select("cip13").distinct().count()
print("The number of drugs in : \n IR_PHA_R =", drugCount1 ,"\n Cnam data =", drugCount2,"\n diff =", drugCount1 - drugCount2)

Statistics for Neuroleptiques extracted from the latest IR_PHA_R
number of lines : 607
Number of different class codes 1
Number of different class labels 1
Number of different sub classes labels  29
Number of different CIP13  labels  607
Number of different labels for medical-economic class 3
Number of different labels for the commercial drugs 141
+-----------+
|PHA_FRM_LIB|
+-----------+
|     GELULE|
| SUSPENSION|
|       null|
|   COMPRIME|
|     POUDRE|
|   SOLUTION|
+-----------+

+----------------+
|PHA_DOS_UNT_DSES|
+----------------+
|        MG/10 ML|
|           MG/ML|
|         MG/5 ML|
|        G/100 ML|
|         MG/2 ML|
|              MG|
|               %|
|         MG/1 ML|
|         MG/4 ML|
+----------------+

Jeremy data for Neurolpetiques family
number of lines :  591
Distinct values :
cip13 591
CIP7 591
nom_court 543
code_eph 3
classe_eph 3
code_atc 29
classe_atc 29
CODATC2 1
LIBATC2 1
top_atyp 2


The number of drugs in : 
 IR_PHA_R = 607 
 Cnam data = 591 
 diff

In [17]:
# CIP13 codes present on each side.
cipIR_PHA_R = dataTable.select("PHA_CIP_C13")
cnamCip = NeuroleptiquesCnam.select("cip13")

# Print the size of both set differences.
for title, left, right in (
    ("IR_PHA_R - Jeremy", cipIR_PHA_R, cnamCip),
    ("Jeremy - IR_PHA_R", cnamCip, cipIR_PHA_R),
):
    print(title)
    print(left.subtract(right).count())

IR_PHA_R - Jeremy
16
Jeremy - IR_PHA_R
0


In [18]:
# CIP13 codes found in IR_PHA_R but not in the CNAM file, as a flat Python list
# (collect() already returns a list, one value per single-column row).
missingCIP = cipIR_PHA_R.subtract(cnamCip)
listCIP = missingCIP.rdd.flatMap(lambda row: row).collect()
print("The drugs missing for Neuroleptiques in CNAM data : ")
# Fetch the matching IR_PHA_R rows (at most one per missing code) and render
# them as a pandas table.
missing_rows = dataTable.filter(dataTable.PHA_CIP_C13.isin(listCIP)).head(len(listCIP))
pd.DataFrame(data=missing_rows, columns=dataTable.columns)

The drugs missing for Neuroleptiques in CNAM data : 


Unnamed: 0,PHA_AGE_DTD,PHA_AGE_MAX,PHA_AGE_MIN,PHA_AST_TOP,PHA_ATC_C03,PHA_ATC_C07,PHA_ATC_L03,PHA_ATC_L07,PHA_CAR_TOP,PHA_CIP_C13,...,PHA_PRE_IND,PHA_PRI_UND,PHA_PRI_UNI,PHA_PRS_IDE,PHA_RGE_C07,PHA_RGE_C13,PHA_SEX_DTD,PHA_TAR_DAT,PHA_TAU_COD,PHA_UNT_NBR_DSES
0,,,,0,N05,N05AX12,PSYCHOLEPTIQUES,ARIPIPRAZOLE,0,3400930027103,...,N,143.0,4000.0,3002710,,,,20171001,65,28
1,,,,0,N05,N05AX12,PSYCHOLEPTIQUES,ARIPIPRAZOLE,0,3400930026991,...,N,143.0,4000.0,3002699,,,,20171001,65,28
2,,,,0,N05,N05AX12,PSYCHOLEPTIQUES,ARIPIPRAZOLE,0,3400930083758,...,N,78.0,2190.0,3008375,,,,20180102,65,28
3,,,,0,N05,N05AX12,PSYCHOLEPTIQUES,ARIPIPRAZOLE,0,3400930083611,...,N,78.0,2190.0,3008361,,,,20180102,65,28
4,,,,0,N05,N05AX13,PSYCHOLEPTIQUES,PALIPERIDONE,0,3400930059838,...,N,81927.0,81927.0,3005983,,,,20170131,65,1
5,,,,0,N05,N05AX13,PSYCHOLEPTIQUES,PALIPERIDONE,0,3400930059821,...,N,67271.0,67271.0,3005982,,,,20170131,65,1
6,,,,0,N05,N05AX13,PSYCHOLEPTIQUES,PALIPERIDONE,0,3400930059814,...,N,52627.0,52627.0,3005981,,,,20170131,65,1
7,,,,0,N05,N05AX13,PSYCHOLEPTIQUES,PALIPERIDONE,0,3400930059845,...,N,120987.0,120987.0,3005984,,,,20170131,65,1
8,,,,0,N05,N05AX12,PSYCHOLEPTIQUES,ARIPIPRAZOLE,0,3400930083796,...,N,78.0,2190.0,3008379,,,,20180102,65,28
9,,,,0,N05,N05AX12,PSYCHOLEPTIQUES,ARIPIPRAZOLE,0,3400930079485,...,N,78.0,2190.0,3007948,,,,20180102,65,28


> ### Conclusion
> This makes sense: these drugs are missing from Jeremy's data because the file is dated 10/01/17 and the drugs were commercialized after that date.

## Comparing Anxiolytiques

### Definition :
- The "Anxiolytique" family corresponds to the class ATC3 N05B, N05CD and N05CF
- We can find this family by filtering on the PHA_ATC_C07 column
- We should also include the following drugs :
    - N05BC51 : Méprobamate en association (Mepronizine)
    - N05CM11 : Bromures (Neurocalcium, Galirene, Calcibronat)
    - N05CM16 : Niaprazine (Nopron)
    - N05CX : Clorazepate Dipotassique + Acepromazine (Noctran)

In [47]:
from pyspark.sql.functions import udf
from pyspark.sql.functions import isnan, count, when, col
from pyspark.sql.types import BooleanType

# UDF returning True when at least one code in `ls` starts with `pattern`.
# Each code is tested individually: `any(ls)` alone returns a bool, and calling
# `.startswith` on a bool raises AttributeError.
test_udf = udf(lambda pattern, ls: any(code.startswith(pattern) for code in ls), BooleanType())

i = 2  # position of the family in families / definitions / *exceptions

# The definition list mixes full ATC codes with 5- and 4-character prefixes,
# so a drug belongs to the family when any of the three forms matches.
# Column.substr is 1-based; Spark treats a start position of 0 like 1, so
# substr(1, n) selects the same first n characters as substr(0, n).
atc_code = IRPHAR.PHA_ATC_C07
in_family = (
    atc_code.isin(definitions[i])
    | atc_code.substr(1, 5).isin(definitions[i])
    | atc_code.substr(1, 4).isin(definitions[i])
)
# Drop the excluded ATC codes, drug identifiers and ATC prefixes.
dataTable = IRPHAR.filter(
    in_family
    & ~atc_code.isin(ATCexceptions[i])
    & ~IRPHAR.PHA_PRS_IDE.isin(CIPexceptions[i])
    & ~atc_code.substr(1, 5).isin(ATCStartexceptions[i])
)

print("Statistics for", families[i], "extracted from the latest IR_PHA_R : ")
print("number of lines :", dataTable.count())

# (label, column) pairs: print the number of distinct values of each column.
distinct_count_labels = [
    ("Number of different class codes", "PHA_ATC_C03"),
    ("Number of different class labels", "PHA_ATC_L03"),
    ("Number of different sub classes labels ", "PHA_ATC_L07"),
    ("Number of different CIP13  labels ", "PHA_CIP_C13"),
    ("Number of different labels for medical-economic class", "PHA_EPH_LIB_DSES"),
    ("Number of different labels for the commercial drugs", "PHA_MED_COM"),
]
for label, column_name in distinct_count_labels:
    print(label, dataTable.select(column_name).distinct().count())

# Distinct pharmaceutical forms and dosage units found in the family.
dataTable.select("PHA_FRM_LIB").distinct().show()
dataTable.select("PHA_DOS_UNT_DSES").distinct().show()

print("Cnam data (Jeremy) for AnxiolyiqueCnam family")
print("number of lines : ", AnxiolyiqueCnam.count())
print("Distinct values :")
# The loop variable is not called `col`, which would overwrite the `col`
# function imported at the top of this cell.
for column_name in AnxiolyiqueCnam.columns:
    print(column_name, AnxiolyiqueCnam.select(column_name).distinct().count())
print("\n")

# Compare the number of distinct CIP13 codes on both sides.
drugCount1 = dataTable.select("PHA_CIP_C13").distinct().count()
drugCount2 = AnxiolyiqueCnam.select("cip13").distinct().count()
print("The number of drugs in : \n IR_PHA_R =", drugCount1 ,"\n Cnam data =", drugCount2,"\n diff =", drugCount1 - drugCount2)

Statistics for Anxiolyique extracted from the latest IR_PHA_R : 
number of lines : 281
Number of different class codes 1
Number of different class labels 1
Number of different sub classes labels  31
Number of different CIP13  labels  281
Number of different labels for medical-economic class 4
Number of different labels for the commercial drugs 130
+-------------+
|  PHA_FRM_LIB|
+-------------+
|       GELULE|
|         null|
|     COMPRIME|
|       POUDRE|
|CAPSULE MOLLE|
|        SIROP|
|     SOLUTION|
|  LYOPHILISAT|
+-------------+

+----------------+
|PHA_DOS_UNT_DSES|
+----------------+
|        MG/10 ML|
|           MG/ML|
|         MG/5 ML|
|        G/100 ML|
|           MG/MG|
|         MG/2 ML|
|              MG|
|          G/5 ML|
|         G/10 ML|
|         G/15 ML|
|               G|
|        MG/MG/MG|
+----------------+

Cnam data (Jeremy) for AnxiolyiqueCnam family
number of lines :  278
Distinct values :
cip13 278
CIP7 278
nom_court 269
code_atc 30
classe_atc 30
code_e

In [32]:
# CIP13 codes present on each side.
cipIR_PHA_R = dataTable.select("PHA_CIP_C13")
cnamCip = AnxiolyiqueCnam.select("cip13")

# Print the size of both set differences.
for title, left, right in (
    ("IR_PHA_R - Jeremy", cipIR_PHA_R, cnamCip),
    ("Jeremy - IR_PHA_R", cnamCip, cipIR_PHA_R),
):
    print(title)
    print(left.subtract(right).count())



IR_PHA_R - Jeremy
3
Jeremy - IR_PHA_R
0


In [33]:
# CIP13 codes found in IR_PHA_R but not in the CNAM file, as a flat Python list
# (collect() already returns a list, one value per single-column row).
newcip = dataTable.select("PHA_CIP_C13")
cnamCip = AnxiolyiqueCnam.select("cip13")
missingCIP = newcip.subtract(cnamCip)
listCIP = missingCIP.rdd.flatMap(lambda row: row).collect()
print("The drugs missing for Anxiolytiques in CNAM data : ")
# Collect every matching row instead of capping at an arbitrary 43, so the
# table is never silently truncated if more codes are missing.
missing_rows = dataTable.filter(dataTable.PHA_CIP_C13.isin(listCIP)).collect()
pd.DataFrame(data=missing_rows, columns=dataTable.columns)

The drugs missing for Anxiolytiques in CNAM data : 


Unnamed: 0,PHA_AGE_DTD,PHA_AGE_MAX,PHA_AGE_MIN,PHA_AST_TOP,PHA_ATC_C03,PHA_ATC_C07,PHA_ATC_L03,PHA_ATC_L07,PHA_CAR_TOP,PHA_CIP_C13,...,PHA_PRE_IND,PHA_PRI_UND,PHA_PRI_UNI,PHA_PRS_IDE,PHA_RGE_C07,PHA_RGE_C13,PHA_SEX_DTD,PHA_TAR_DAT,PHA_TAU_COD,PHA_UNT_NBR_DSES
0,,,,0,N05,N05BE01,PSYCHOLEPTIQUES,BUSPIRONE,0,3400934802652,...,N,18.0,351.0,3480265,,,,20170314,65,20
1,,,,0,N05,N05BB01,PSYCHOLEPTIQUES,HYDROXYZINE,0,3400930081808,...,N,0.0,153.0,3008180,,,,20170330,65,30
2,,,,0,N05,N05CX01,PSYCHOLEPTIQUES,MEPROBAMATE + VALERIANE,0,3400930777992,...,N,,188.0,3077799,,,,20020101,35,40


- For the first two missing drugs, this is expected because they were commercialized after the creation of Jeremy's file.
- For the last one, we still have to decide whether to include it or not.

## HTA

## Definition

- C09C C09D
- C09A C09B + C10BX04
- C09X
- C03 C02LA01
- C07
- C08
- C10BX03
- C02 SAUF C02CA02 et C03XA01


For the HTA drugs, Jeremy provided us with two files:
- Antihypertensive drug family
- Cardiovascular drug family <br>
Here we compare these two files with IR_PHA_R.

In [48]:
print("Investigating the top_hta colum in IR_PHA_R")
IRPHAR.select("PHA_HTA_TOP").distinct().show()

# Number of rows per PHA_HTA_TOP value (null included as its own group).
df = IRPHAR.select("PHA_HTA_TOP").groupBy("PHA_HTA_TOP").count()

# Extract the plain integer for the null group. Printing the Row itself shows
# "Row(count=...)", and indexing an empty result raises IndexError when the
# column has no null at all, so fall back to 0 in that case.
null_rows = df.filter(df.PHA_HTA_TOP.isNull()).select("count").collect()
null_count = null_rows[0]["count"] if null_rows else 0
print("The column PHA_HTA_TOP contains ", null_count, "null values !!!")

Investigating the top_hta colum in IR_PHA_R
+-----------+
|PHA_HTA_TOP|
+-----------+
|       null|
|          1|
|          2|
+-----------+

The column PHA_HTA_TOP contains  Row(count=1017) null values !!!


## Statistics from IR_PHA_R using PHA_HTA_TOP

In [23]:
from pyspark.sql.functions import udf
from pyspark.sql.functions import isnan, count, when, col
from pyspark.sql.types import BooleanType

# UDF returning True when at least one code in `ls` starts with `pattern`.
# Each code is tested individually: `any(ls)` alone returns a bool, and calling
# `.startswith` on a bool raises AttributeError.
test_udf = udf(lambda pattern, ls: any(code.startswith(pattern) for code in ls), BooleanType())

# Select the drugs whose PHA_HTA_TOP indicator equals 1.
dataTable = IRPHAR.filter(IRPHAR.PHA_HTA_TOP == 1)

print("Statistics for", "HTA", "extracted from the latest IR_PHA_R")
print("number of lines :", dataTable.count())

# (label, column) pairs: print the number of distinct values of each column.
distinct_count_labels = [
    ("Number of different class codes", "PHA_ATC_C03"),
    ("Number of different class labels", "PHA_ATC_L03"),
    ("Number of different sub classes labels ", "PHA_ATC_L07"),
    ("Number of different CIP13  labels ", "PHA_CIP_C13"),
    ("Number of different labels for medical-economic class", "PHA_EPH_LIB_DSES"),
    ("Number of different labels for the commercial drugs", "PHA_MED_COM"),
]
for label, column_name in distinct_count_labels:
    print(label, dataTable.select(column_name).distinct().count())

# Distinct pharmaceutical forms and dosage units found in the selection.
dataTable.select("PHA_FRM_LIB").distinct().show()
dataTable.select("PHA_DOS_UNT_DSES").distinct().show()



Statistics for HTA extracted from the latest IR_PHA_R
number of lines : 4115
Number of different class codes 6
Number of different class labels 6
Number of different sub classes labels  107
Number of different CIP13  labels  4115
Number of different labels for medical-economic class 25
Number of different labels for the commercial drugs 905
+-------------+
|  PHA_FRM_LIB|
+-------------+
|       GELULE|
|         null|
|     COMPRIME|
|CAPSULE MOLLE|
|     SOLUTION|
+-------------+

+----------------+
|PHA_DOS_UNT_DSES|
+----------------+
|           MG/ML|
|         MG/5 ML|
|       MG/2,5 MG|
|           MG/MG|
|         MG/2 ML|
|              MG|
|        MG/MG/MG|
|         MG/4 ML|
+----------------+



## Statistics from HTA file

In [51]:
print("statistics from the CNAM HTA file")
print("number of lines : ", HTACnam.count())
# This cell describes the HTA file, so the message names HTA (it previously
# said "antidepresseurs", a copy-paste leftover).
print("Distinct values for HTA extracted by cnam:")
# The loop variable is not called `col` so that it does not overwrite the
# `col` function imported from pyspark.sql.functions in other cells.
for column_name in HTACnam.columns:
    print(column_name, HTACnam.select(column_name).distinct().count())

statistics from the CNAM HTA file
number of lines :  4031
Distinct values for antidepresseurs extracted by cnam:
cip13 4031
CIP7 4031
nom_court 3828
code_eph 20
classe_eph 20
code_atc 106
classe_atc 106
CODATC2 6
LIBATC2 6
CODE_ATC3 23
CODE_ATC4 33
top_hta 1
top_gdcond 2
SARTANS 2
IEC 2
AUTSRA 2
DIUR 3
THIAZ 2
ALDO 2
ANSE 2
BB 2
ICA 2
AUTHTA 2
ASSODF 2


## Statistics from Cardiovascular file

This file contains two filters for drugs having HTA effect
- classe_hta
- top_hta

In [6]:
import datetime

from pyspark.sql.functions import udf, when
from pyspark.sql.functions import to_date, unix_timestamp, from_unixtime, from_utc_timestamp
from pyspark.sql.types import DateType

# UDF parsing a "dd/mm/yyyy" string into a date (None stays None).
# `strptime` lives on the `datetime.datetime` class, not on the `datetime`
# module, and the module is imported here so the cell does not depend on a
# later cell having run.
func = udf(
    lambda x: datetime.datetime.strptime(x, "%d/%m/%Y").date() if x is not None else None,
    DateType(),
)

# Add a "date" column parsed from debut_rembt ("dd/MM/yyyy" text); rows with a
# null debut_rembt get a null date.
CardioDepp = CardioDepp.withColumn(
    "date",
    when(CardioDepp.debut_rembt.isNull(), None).otherwise(
        to_date(
            from_unixtime(
                unix_timestamp(CardioDepp.debut_rembt, "dd/MM/yyyy"),
                format="yyyy-MM-dd",
            )
        )
    ),
)


In [7]:
import datetime

# Only keep rows whose "date" column is on or before 1 August 2017.
dateLimite = datetime.datetime(2017, 8, 1)
on_or_before_limit = CardioDepp.date <= dateLimite

# HTA drugs according to the CLASSE_HTA flag.
listeHTA_cardio_c = CardioDepp.filter((CardioDepp.CLASSE_HTA == "1") & on_or_before_limit)
print("Number of HTA drugs with filter CLASSE_HTA : ", listeHTA_cardio_c.count())

# HTA drugs according to the top_hta flag.
listeHTA_cardio_t = CardioDepp.filter((CardioDepp.top_hta == "1") & on_or_before_limit)
print("Number of HTA drugs with filter TOP_HTA : ", listeHTA_cardio_t.count())

Number of HTA drugs with filter CLASSE_HTA :  4431
Number of HTA drugs with filter TOP_HTA :  4133


Let's see what are the differences between the two filters and IR_PHA_R:

In [24]:
# CIP13 codes of the IR_PHA_R selection and of the two cardio-file selections.
IRCip = dataTable.select("PHA_CIP_C13")
cardioCip_c = listeHTA_cardio_c.select("cip13")
cardioCip_t = listeHTA_cardio_t.select("cip13")

# Print the size of each set difference, in both directions.
for title, left, right in (
    ("CLasseHTACip - IR_PHA_R_hta", cardioCip_c, IRCip),
    ("Top_HTA - IR_PHA_R_hta", cardioCip_t, IRCip),
    ("IR_PHA_R_hta - CLasseHTACip", IRCip, cardioCip_c),
    ("IR_PHA_R_hta - Top_HTA", IRCip, cardioCip_t),
):
    print(title)
    print(left.subtract(right).count())

CLasseHTACip - IR_PHA_R_hta
316
Top_HTA - IR_PHA_R_hta
18
IR_PHA_R_hta - CLasseHTACip
0
IR_PHA_R_hta - Top_HTA
0


### Missing ATC codes when using PHA_HTA_TOP

In [31]:
# Distinct ATC codes flagged as HTA in the cardio file (CLASSE_HTA) ...
cardio_hta_rows = CardioDepp.filter(CardioDepp.CLASSE_HTA == "1")
codes = cardio_hta_rows.select("code_atc").distinct()

# ... and in IR_PHA_R (PHA_HTA_TOP).
irphar_hta_rows = IRPHAR.filter(IRPHAR.PHA_HTA_TOP == "1")
codes2 = irphar_hta_rows.select("PHA_ATC_C07").distinct()

# ATC codes present in the cardio file selection but absent from IR_PHA_R's.
codes.subtract(codes2).show()

+--------+
|code_atc|
+--------+
| C07AA07|
|    C01B|
| C08CA06|
| C03EB01|
| C02CA02|
| C03DA04|
| C07AG02|
| C03DA02|
| C08EA02|
+--------+



###  Construction of  HTA family with the code definition from IR_PHA_R

In [4]:
# Single-family configuration for HTA (same parallel-list layout: entry 0 of
# each list applies to families[0]).
families = ["HTA"]

# ATC prefixes (3 characters) and full ATC codes that define the family.
definitions = [
    [
        "C02",
        "C03",
        "C07",
        "C08",
        "C09",
        "C10BX04",
        "C10BX03",
    ],
]

# Nothing excluded by ATC prefix or by drug identifier.
ATCStartexceptions = [[]]
CIPexceptions = [[]]

# Full ATC codes excluded from the family.
ATCexceptions = [
    ["C02CA02", "C03XA01"],
]

In [5]:
from pyspark.sql.functions import udf
from pyspark.sql.functions import isnan, count, when, col
from pyspark.sql.types import BooleanType

# UDF returning True when at least one code in `ls` starts with `pattern`.
# Each code is tested individually: `any(ls)` alone returns a bool, and calling
# `.startswith` on a bool raises AttributeError.
test_udf = udf(lambda pattern, ls: any(code.startswith(pattern) for code in ls), BooleanType())

i = 0  # position of the family in families / definitions / *exceptions

# The definition list mixes 3-character prefixes with full ATC codes, so a
# drug belongs to the family when either form matches.
# Column.substr is 1-based; Spark treats a start position of 0 like 1, so
# substr(1, n) selects the same first n characters as substr(0, n).
atc_code = IRPHAR.PHA_ATC_C07
in_family = atc_code.substr(1, 3).isin(definitions[i]) | atc_code.isin(definitions[i])
# Drop the excluded ATC codes, drug identifiers and ATC prefixes.
HTA_IRPHAR_ATC = IRPHAR.filter(
    in_family
    & ~atc_code.isin(ATCexceptions[i])
    & ~IRPHAR.PHA_PRS_IDE.isin(CIPexceptions[i])
    & ~atc_code.substr(1, 5).isin(ATCStartexceptions[i])
)

print("Statistics for", families[i], "extracted from the latest IR_PHA_R ")
print("number of lines :", HTA_IRPHAR_ATC.count())

# (label, column) pairs: print the number of distinct values of each column.
distinct_count_labels = [
    ("Number of different class codes", "PHA_ATC_C03"),
    ("Number of different class labels", "PHA_ATC_L03"),
    ("Number of different CIP13  labels ", "PHA_CIP_C13"),
]
for label, column_name in distinct_count_labels:
    print(label, HTA_IRPHAR_ATC.select(column_name).distinct().count())

Statistics for HTA extracted from the latest IR_PHA_R 
number of lines : 4402
Number of different class codes 6
Number of different class labels 6
Number of different CIP13  labels  4402


We obtain 4402 drugs with the code definition instead of 4115 with the PHA_HTA_TOP filter.
- we're closer to 4431 (number of drugs in Jeremy file with the classe_hta filter) <br>
=> Next, we investigate the missing drugs

In [8]:
# CIP13 codes of the ATC-based IR_PHA_R selection and of the two cardio-file
# selections.
IRCip = HTA_IRPHAR_ATC.select("PHA_CIP_C13")
cardioCip_c = listeHTA_cardio_c.select("cip13")
cardioCip_t = listeHTA_cardio_t.select("cip13")

print("missing code CIP  :")
# Codes present in each cardio selection but absent from IR_PHA_R's selection.
for title, cardio_cips in (
    ("cardioCip_c - HTA_IRPHAR_CIP", cardioCip_c),
    ("cardioCip_t - HTA_IRPHAR_CIP", cardioCip_t),
):
    print(title)
    print(cardio_cips.subtract(IRCip).count())

missing code CIP  :
cardioCip_c - HTA_IRPHAR_CIP
29
cardioCip_t - HTA_IRPHAR_CIP
18


In [9]:
print("missing code CIP  :")
# Codes present in IR_PHA_R's selection but absent from each cardio selection.
for title, cardio_cips in (
    ("HTA_IRPHAR_CIP - cardioCip_c", cardioCip_c),
    ("HTA_IRPHAR_CIP - cardioCip_t", cardioCip_t),
):
    print(title)
    print(IRCip.subtract(cardio_cips).count())

missing code CIP  :
HTA_IRPHAR_CIP - cardioCip_c
0
HTA_IRPHAR_CIP - cardioCip_t
287


In [10]:
# CIP13 codes in the CLASSE_HTA selection that the ATC-based IR_PHA_R selection
# does not contain, as a flat Python list (collect() already returns a list).
missingCIP = cardioCip_c.subtract(IRCip)
listCIP = missingCIP.rdd.flatMap(lambda row: row).collect()

# Look these codes up in the full IR_PHA_R table and show their ATC code.
df = IRPHAR.filter(IRPHAR.PHA_CIP_C13.isin(listCIP))
missing_codes_with_atc = df.select("PHA_CIP_C13", "PHA_ATC_C07")
missing_codes_with_atc.show(30)

+-------------+-----------+
|  PHA_CIP_C13|PHA_ATC_C07|
+-------------+-----------+
|3400949001736|       null|
|3400949001774|       null|
|3400949001750|       null|
|3400949001811|       null|
|3400949001798|       null|
|3400949001927|       null|
|3400932926060|    C02CA02|
|3400949001743|       null|
|3400949001781|       null|
|3400949001767|       null|
|3400949001965|       null|
|3400949001941|       null|
|3400949001934|       null|
|3400949001828|       null|
|3400949001804|       null|
|3400937255318|       null|
|3400937254366|       null|
|3400937165266|       null|
|3400937164085|       null|
|3400930058046|       null|
|3400930057988|       null|
|3400930057926|       null|
|3400930058206|       null|
|3400930058145|       null|
|3400930058091|       null|
|3400930058336|       null|
|3400930058268|       null|
|3400939637914|    C02CA02|
|3400931415404|       C01B|
+-------------+-----------+



In [17]:
# Same missing CIP13 codes, looked up in the cardio file to read the ATC code
# recorded there.
missing_in_cardio = listeHTA_cardio_c.filter(listeHTA_cardio_c.cip13.isin(listCIP))
missing_in_cardio.select("cip13", "code_atc").show(29)

+-------------+--------+
|        cip13|code_atc|
+-------------+--------+
|3400931415404|    C01B|
|3400932926060| C02CA02|
|3400939637914| C02CA02|
|3400930057926| C03DA04|
|3400930057988| C03DA04|
|3400930058046| C03DA04|
|3400930058091| C03DA04|
|3400930058145| C03DA04|
|3400930058206| C03DA04|
|3400930058268| C03DA04|
|3400930058336| C03DA04|
|3400937164085| C09AA05|
|3400937165266| C09AA05|
|3400937254366| C09AA05|
|3400937255318| C09AA05|
|3400949001736| C09BB04|
|3400949001743| C09BB04|
|3400949001750| C09BB04|
|3400949001767| C09BB04|
|3400949001774| C09BB04|
|3400949001781| C09BB04|
|3400949001798| C09BB04|
|3400949001804| C09BB04|
|3400949001811| C09BB04|
|3400949001828| C09BB04|
|3400949001927| C09BB04|
|3400949001934| C09BB04|
|3400949001941| C09BB04|
|3400949001965| C09BB04|
+-------------+--------+



###  All the missing elements have a null ATC code in IR_PHA_R. That's why we miss them, except for the codes C02CA02 and C01B
=> for these two codes, we have to check with Jeremy whether we include them or not ! <br>
=> for the missing ones, the only possible solution is to complete them by hand