In [22]:
import os 
import numpy as np
import pandas as pd 

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import make_scorer, balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load and prepare data

Load the data. The `data` folder is assumed to be located in the current working directory (the directory the notebook is run from).  

In [23]:
# Everything is resolved relative to the directory the notebook is run from.
current_dir = os.getcwd()

# <cwd>/data holds the dataset; the clinical annotations are a CSV inside it.
data_path = os.path.join(current_dir, "data")
annotation_path = os.path.join(current_dir, "data", "clinical_annotation.csv")

In [24]:
# Read the clinical annotations and discard the leftover 'Unnamed: 0' column
df = pd.read_csv(annotation_path).drop(columns="Unnamed: 0")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ID           205 non-null    object 
 1   LABEL        205 non-null    int64  
 2   GENDER       205 non-null    object 
 3   DOB          205 non-null    object 
 4   LYMPH_COUNT  205 non-null    float64
dtypes: float64(1), int64(1), object(3)
memory usage: 8.1+ KB


Now the dataset needs to be transformed so that it can be fed to our baseline model: Gaussian Naive Bayes. 

In [25]:
# Remove rows whose LABEL is negative (-1 corresponds to unlabelled data).
# .copy() makes the filtered result an independent frame, so the column
# assignments that follow do not raise pandas' SettingWithCopyWarning.
df = df[df.LABEL >= 0].copy()

# compute age 
def compute_age(x, reference_year=2024):
    """Return the age, in whole years, implied by a date-of-birth string.

    Only the birth year is used (month and day are ignored), so the result
    is simply ``reference_year - birth_year``.

    Parameters
    ----------
    x : str
        Date of birth whose last four characters are the year. Leading and
        trailing whitespace is ignored.
    reference_year : int, default 2024
        Year against which the age is computed.

    Returns
    -------
    int
        ``reference_year`` minus the birth year.

    Raises
    ------
    ValueError
        If the last four characters cannot be parsed as an integer.
    """
    # Strip first: otherwise a trailing space would shift the 4-character
    # window and silently yield a wrong year (e.g. "942 " -> 942).
    birth_year = int(x.strip()[-4:])
    return reference_year - birth_year
    
# Derive the age feature from the date-of-birth column
df["age"] = df["DOB"].map(compute_age)

# DOB is dropped once age exists; the ID column becomes the row index
df = df.drop(columns="DOB").set_index("ID")
df.head()

Unnamed: 0_level_0,LABEL,GENDER,LYMPH_COUNT,age
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
P26,1,M,11.2,91
P183,1,M,12.8,82
P89,1,M,9.6,89
P123,1,M,122.6,93
P61,1,F,11.6,93


# Model training and evaluation 

Prepare data for training

In [26]:
# Encode the gender column as integer codes.
# The stray lowercase 'f' is first rewritten to 'F' so that both spellings
# are encoded as the same category.
label_encoder = LabelEncoder()
df["GENDER"] = label_encoder.fit_transform(df["GENDER"].replace({"f": "F"}))

In [27]:
# Split the frame into the target vector and the feature matrix
y = df["LABEL"].to_numpy()
X = df.drop(columns="LABEL").to_numpy()

Define and train the model

In [28]:
# Baseline classifier: Gaussian Naive Bayes
gnb = GaussianNB()

# Wrap balanced accuracy so that cross_val_score can use it as the metric
balanced_accuracy_scorer = make_scorer(balanced_accuracy_score)

# Evaluate the classifier with 10-fold cross-validation
cv_scores = cross_val_score(
    gnb,
    X,
    y,
    scoring=balanced_accuracy_scorer,
    cv=10,
)

# Report the per-fold scores, then their mean and standard deviation
print("Cross-validation Balanced Accuracy Scores:", cv_scores)
print(f"Mean Balanced Accuracy: {round(np.mean(cv_scores), 2)} +/- {round(np.std(cv_scores), 2)}")

Cross-validation Balanced Accuracy Scores: [0.91666667 0.95833333 0.875      0.90909091 0.86363636 0.86363636
 0.86363636 0.95454545 0.71818182 0.68181818]
Mean Balanced Accuracy: 0.86 +/- 0.09


# Conclusion 

Thus, a baseline has been constructed that gives interesting results: it clearly outperforms the Kaggle baseline, and such a submission would rank in the top 15 on the leaderboard. 

# MUST BE REMOVED

In [34]:
# Leftover scratch arithmetic; appears unrelated to the analysis — TODO remove this cell
20*0.7

14.0