<a href="https://colab.research.google.com/github/VladimirVulpe/crs/blob/main/crs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get the Rhinosinusitis datasets from Google Drive

In [None]:
!pip install PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import pandas as pd
# Show up to 200 rows when a DataFrame is displayed. The fully qualified option
# name is used because the abbreviated 'max_rows' pattern can match more than
# one option in newer pandas releases, which raises an OptionError.
pd.set_option('display.max_rows', 200)

# Authenticate the Colab user and hand the resulting application-default
# credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Local file name under which the downloaded CSV is stored.
dataset_title = 'Rhinosinusitis culture results - Sheet1.csv'

# Look the file up by its Google Drive id and save its content locally.
downloaded = drive.CreateFile({'id':"1R5FNtJ0gyYTLDrt0TQnmlSnrRWcrfDTn"})   # replace the id with the id of the file you want to access
downloaded.GetContentFile(dataset_title)    # replace the file name with your file


# Group the microbes by patient



In [None]:
# Load the downloaded culture results; the first CSV column becomes the index.
sinus = pd.read_csv(dataset_title, index_col=0)

patient_code = 'patient (code)'
patient_cols = [patient_code, 'microbe']

# Distinct patient codes, ignoring rows that have no code.
patients = sinus[patient_code].dropna().unique()

# One row per patient, with that patient's 'microbe' values collected in a list.
microbes_grouped_by_patients = (
    sinus
    .groupby(patient_code)['microbe']
    .apply(list)
    .reset_index(name='microbe')
)

# Every patient receives the same disease label.
microbes_grouped_by_patients['disease'] = 'CRSwP'

print("\n\nPATIENTS: \n", patients)
microbes_grouped_by_patients

# Get microbe details

In [None]:
# Keep only the rows of the 'microbe' column that actually hold a value.
microbe_column = sinus['microbe']
microbes = microbe_column[microbe_column.notna()]
print("\n\nMICROBES: \n", microbes)

# Count how often each microbe appears; value_counts skips missing values and
# orders the result from most to least frequent by default.
microbes_sorted_by_general_occurence = microbes.value_counts()
print("\n\nMICROBES_SORTED_BY_GENERAL_OCCURENCE: \n", microbes_sorted_by_general_occurence)

# Training with microbes and patients

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


# Select data corresponding to features in feature_names
feature_names = ['patient (code)', 'microbe']
X = microbes_grouped_by_patients[feature_names]
y = microbes_grouped_by_patients['disease']
print(microbes_grouped_by_patients)

# Break off validation set from training data (80 % train / 20 % validation)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# For model reproducibility, the model gets a fixed numeric random_state
sinus_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, random_state=0)

# TODO use imputation

# TODO: use encoder
# NOTE(review): 'microbe' holds Python lists and 'disease' holds the string
# 'CRSwP'; the regressor presumably needs numeric features and targets, so the
# fit below is expected to fail until the data is encoded - confirm and encode.
#enc = OneHotEncoder(handle_unknown='ignore')
#enc.fit(X)

# Fit the model on the training split only
sinus_model.fit(X_train_full, y_train)

# Predict on the held-out validation split so that the predictions line up
# with y_valid (predicting on the training split gives a different length).
predictions = sinus_model.predict(X_valid_full)
print("\n\nPREDICTIONS: \n", predictions)

# Calculate MAE; scikit-learn expects the arguments as (y_true, y_pred)
mae = mean_absolute_error(y_valid, predictions)
print("\n\nMAE: \n", mae)