# Biomedical Data Analysis - Lab 4

Group members:

- Enrico Maria Marinelli
- Francesco Pio Capoccello
- Juras LukaŠevičius

### Libraries used in practical session:

In [228]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from scipy.stats import pearsonr
import regex as re

## Block I - Data loading and basic description

We load the dataset using the pandas’ load_csv function, remember to specify semicolon (;) as
the separator

In [229]:
df=pd.read_csv("Data/inadvance_synth.csv",sep=";")

In [230]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38416 entries, 0 to 38415
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           38416 non-null  int64  
 1   age                  38416 non-null  int64  
 2   barthel              10218 non-null  float64
 3   charlson             38339 non-null  float64
 4   codidiagingreso      27038 non-null  object 
 5   codservicioreal      38416 non-null  object 
 6   creatinina           30465 non-null  float64
 7   drg                  34776 non-null  float64
 8   estancias            38416 non-null  float64
 9   glucosa              29188 non-null  float64
 10  hematocrito          30114 non-null  float64
 11  leucocitos           30114 non-null  float64
 12  metastatic_tumor     38339 non-null  float64
 13  num_grupoact3_HOSP   17415 non-null  float64
 14  numurgenciasprevias  37864 non-null  float64
 15  potasio              29770 non-null 

### 1. What is the size of the dataframe?

In [231]:
# Drop the "Unnamed" column
df = df.iloc[:,1:]

In [232]:
df.shape

(38416, 21)

### 2. What is the mean age?

In [233]:
mean = df.age.mean()
print(mean)

79.4033215326947


### 3. What is the age standard deviation (std)?

In [234]:
std=df.age.std()
print(std)

8.361737254115944


### 4. Which is the variable with the most amount of missing values? Can you list the name of the variables, sorting them by number of missing values?

In [235]:
nullvalues = df.isnull().sum()
features_sorted = nullvalues.sort_values(ascending=False)
print(features_sorted)

barthel                28198
num_grupoact3_HOSP     21001
proteina_c_reactiva    18175
urea                   14260
codidiagingreso        11378
glucosa                 9228
potasio                 8646
rdw_sd                  8334
rdw_cv                  8334
hematocrito             8302
leucocitos              8302
sodio                   8142
creatinina              7951
drg                     3640
numurgenciasprevias      552
metastatic_tumor          77
charlson                  77
age                        0
estancias                  0
codservicioreal            0
label                      0
dtype: int64


### 5. Name which are the categorical variables.

In [236]:
categorical_variables = df.select_dtypes(include=['object'])
categorical_variables_names = categorical_variables.columns.tolist()
print(categorical_variables_names)

['codidiagingreso', 'codservicioreal']


In [237]:
def find_cat(x):
    if isinstance(x,str):
        pattern = r'[0-9]+.?[0-9]*'
        test = not (re.match(pattern, x))
        # print(f"{x}: {test}")  # Add this line for debugging
        return test
    # print(f"{x}: False")  # Add this line for debugging
    return False

In [238]:
def categorical_enumeration(cat,categorical_variables=categorical_variables, printing=True):
  
  if printing:
    print('Evaluation of categorical variable "' + cat + '"')

  # categories = categorical_variables[categorical_variables[cat].apply(find_cat)]
  if cat == categorical_variables_names[0]:
    categories = categorical_variables[categorical_variables[cat].apply(find_cat)]
  else:
    categories = categorical_variables[cat]
  
  unique, count = np.unique(categories, return_counts=True)

  if printing:
    print('Number of categories for "'+ cat + '":', len(unique))
  
  combined = np.column_stack((unique, count))
  sorted_combined = combined[combined[:, 1].argsort()[::-1]]
  
  if printing:
    for i in sorted_combined:
      if isinstance(i[0],float):
        print("Class %f has %d samples" % (i[0],i[1]))
      else:
        print("Class " + str(i[0]) + " has %d samples" % i[1])
  
  return np.array(sorted_combined)

In [239]:
categories_codidiagingreso = categorical_enumeration(categorical_variables_names[0])

Evaluation of categorical variable "codidiagingreso"
Number of categories for "codidiagingreso": 299
Class HMUR has 507 samples
Class HNEM has 382 samples
Class HCAR has 375 samples
Class V14.8 has 245 samples
Class HMIN has 215 samples
Class HONC has 214 samples
Class V15.82 has 195 samples
Class HNER has 183 samples
Class V14.0 has 181 samples
Class V58.61 has 121 samples
Class V14.1 has 105 samples
Class HHEM has 78 samples
Class M81403 has 76 samples
Class V58.67 has 71 samples
Class HURO has 65 samples
Class V14.6 has 63 samples
Class HMDG has 54 samples
Class V45.01 has 42 samples
Class M80003 has 40 samples
Class V60.6 has 38 samples
Class HCOT has 36 samples
Class V43.3 has 32 samples
Class HNEF has 31 samples
Class V45.81 has 30 samples
Class V46.2 has 29 samples
Class V61.49 has 28 samples
Class V12.54 has 28 samples
Class HNCG has 27 samples
Class E911 has 26 samples
Class V58.66 has 25 samples
Class V10.46 has 25 samples
Class V10.3 has 25 samples
Class HCTO has 25 samples


In [240]:
categories_codservicioreal = categorical_enumeration(categorical_variables_names[1])

Evaluation of categorical variable "codservicioreal"
Number of categories for "codservicioreal": 53
Class HMUR has 8233 samples
Class HNEM has 5422 samples
Class HCAR has 5124 samples
Class HMIN has 3633 samples
Class HNER has 2341 samples
Class HMDG has 2222 samples
Class HONC has 2041 samples
Class HUHP has 1159 samples
Class HURO has 1112 samples
Class HCDG has 1007 samples
Class HCOT has 741 samples
Class HHEM has 546 samples
Class HNEF has 522 samples
Class HCVA has 502 samples
Class HMDH has 472 samples
Class HCEP has 420 samples
Class HNCG has 381 samples
Class HUMI has 351 samples
Class HUEI has 332 samples
Class HCCV has 308 samples
Class HREU has 252 samples
Class HORL has 179 samples
Class HREA has 157 samples
Class HCTO has 154 samples
Class HCIR has 146 samples
Class HECR has 126 samples
Class HCLP has 71 samples
Class HSEP has 60 samples
Class HGIN has 55 samples
Class HUTP has 50 samples
Class HCMX has 49 samples
Class HUML has 44 samples
Class HCPL has 43 samples
Class 

In addition to the existing `codidiagingreso` and `codservicioreal` categorical variables including codes in the format 
```python 
str
```
is `metastatic_tumor`, since its only values are $0$ and $1$.

### 6. Extract the ‘label’ column to another variable. How many positive cases there are? And negatives?


In [241]:
label_var = df['label']
positive = df[label_var==1].shape[0]
negative = df[label_var==0].shape[0]
print(f"Positive cases: {positive}")
print(f"Negative cases: {negative}")

Positive cases: 13431
Negative cases: 24985


## Block II. Data preprocessing

How many samples have each set after the split?

Split the dataset in two: train (80%) and test (20%). Use a seed to allow replication.

In [242]:
seed = 42

In [243]:
X = df.drop('label', axis=1)
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

In [244]:
print(f"Train samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")

Train samples: 30732
Test samples: 7684


Implement the method to deal with categorical variables. Briefly explain the chosen alternative. What
would happen if a variable in the test set contains a category that doesn’t exist on the train set? How
would you deal with this situation?

In [245]:
def categorical_evaluation(dataset,alpha=0.5):
    
    #
    # alpha variable represents how much weight the feature "metastatic tumor"
    # has on the categorical variable.
    # This choice is given by the fact that features might need to be more spread out.
    #
    # NOTE: alpha is not an tunable hyperparameter, but might need to be suggested by a 
    # medic or someone who has knowledge in the field.
    #


    cv = dataset.select_dtypes(include=['object'])
    for cat in cv.columns.tolist():
        enum = categorical_enumeration(cat=cat,categorical_variables=cv,printing=False)
        
        # The ranking process starts here
    
        for i,e in enumerate(enum[:,0]):

            # Increase the feature "importance rank" by a slight correlation with metastatic tumor variable
            increase = np.sum(np.logical_and(dataset['metastatic_tumor']==1,dataset[cat]==e))
            
            # This increase is regulate by the aforementioned alpha parameter 
            enum[i,1] += enum[i,1]*(alpha*increase) 

            
        enum[:,1] /= np.sum(enum[:,1]) # Represent the percentage of appearance in the given dataset (instead of the frequency)
        
        for i,e in enumerate(enum[:,0]):
            # print(f"{e}={enum[i,0]}\n-> New value:{enum[i,1]}")
            dataset[dataset[cat]==e] = enum[i,1]

    return dataset


In [246]:
X_train_categorical_mapped = categorical_evaluation(X_train)
X_test_categorical_mapped = categorical_evaluation(X_test)

In [247]:
X_train_categorical_mapped

Unnamed: 0,age,barthel,charlson,codidiagingreso,codservicioreal,creatinina,drg,estancias,glucosa,hematocrito,leucocitos,metastatic_tumor,num_grupoact3_HOSP,numurgenciasprevias,potasio,proteina_c_reactiva,rdw_cv,rdw_sd,sodio,urea
29147,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072
10512,0.126710,0.126710,0.126710,0.12671,0.12671,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710
32039,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563
17925,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072,0.050072
16317,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,0.126710,0.126710,0.126710,0.12671,0.12671,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710,0.126710
11284,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996
38158,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563,0.230563
860,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996,0.108996


In [248]:
X_test_categorical_mapped

Unnamed: 0,age,barthel,charlson,codidiagingreso,codservicioreal,creatinina,drg,estancias,glucosa,hematocrito,leucocitos,metastatic_tumor,num_grupoact3_HOSP,numurgenciasprevias,potasio,proteina_c_reactiva,rdw_cv,rdw_sd,sodio,urea
36169,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464
13759,0.022942,0.022942,0.022942,0.022942,0.022942,0.022942,0.022942,0.022942,0.022942,0.022942,0.022942,0.022942,0.022942,0.022942,0.022942,0.022942,0.022942,0.022942,0.022942,0.022942
27749,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464
10013,0.229010,0.229010,0.229010,0.22901,0.22901,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010
17728,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14467,0.156856,0.156856,0.156856,0.156856,0.156856,0.156856,0.156856,0.156856,0.156856,0.156856,0.156856,0.156856,0.156856,0.156856,0.156856,0.156856,0.156856,0.156856,0.156856,0.156856
18967,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464,0.065464
14321,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129
21688,0.229010,0.229010,0.229010,0.22901,0.22901,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010,0.229010


Implement the method to deal with the missing values. Briefly explain the chosen alternative. What
would happen if a variable without missing on the train set appears to have been missing in the test
set? How would you deal with them?

The transformation applied on the training set of the data should be the same applied to the test set