In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import catboost
from sklearn.model_selection import GridSearchCV

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
train = pd.read_csv(r"C:\Users\PC\Downloads\Data\training.csv")
test = pd.read_csv(r"C:\Users\PC\Downloads\Data\test.csv")
sample = pd.read_csv(r"C:\Users\PC\Downloads\Data\sample.csv")

In [3]:
raw_all = pd.concat([train.drop(columns='DiagPeriodL90D'), test], axis=0).reset_index()
target = train['DiagPeriodL90D']

In [4]:
train.head()

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,...,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Ozone,PM25,N02,DiagPeriodL90D
0,475714,,MEDICAID,CA,924,84,F,,C50919,Malignant neoplasm of unsp site of unspecified...,...,12.871429,22.542857,10.1,27.814286,11.2,3.5,52.23721,8.650555,18.606528,1
1,349367,White,COMMERCIAL,CA,928,62,F,28.49,C50411,Malig neoplm of upper-outer quadrant of right ...,...,8.957576,10.109091,8.057576,30.606061,7.018182,4.10303,42.301121,8.487175,20.113179,1
2,138632,White,COMMERCIAL,TX,760,43,F,38.09,C50112,Malignant neoplasm of central portion of left ...,...,11.253333,9.663333,3.356667,31.394915,15.066667,7.446667,40.108207,7.642753,14.839351,1
3,617843,White,COMMERCIAL,CA,926,45,F,,C50212,Malig neoplasm of upper-inner quadrant of left...,...,8.845238,8.688095,5.280952,27.561905,4.404762,4.809524,42.070075,7.229393,15.894123,0
4,817482,,COMMERCIAL,ID,836,55,F,,1749,"Malignant neoplasm of breast (female), unspeci...",...,15.276,11.224,1.946,26.170213,12.088,13.106,41.356058,4.110749,11.722197,0


In [5]:
print('Train shape:',train.shape)
print('Test shape:',test.shape)
print('raw_all shape:',raw_all.shape)

Train shape: (12906, 83)
Test shape: (5792, 82)
raw_all shape: (18698, 83)


## Missing Data Analysis

Calculating the percentage of missing data

In [6]:
for column in train.columns:
    null_count = train[column].isnull().sum()
    if null_count > 0:
        percent_nulls = (null_count / train.shape[0]) * 100
        print(column,':', round(percent_nulls,2),'nulls')

patient_race : 49.47 nulls
payer_type : 13.97 nulls
patient_state : 0.4 nulls
bmi : 69.46 nulls
metastatic_first_novel_treatment : 99.81 nulls
metastatic_first_novel_treatment_type : 99.81 nulls
Region : 0.4 nulls
Division : 0.4 nulls
population : 0.01 nulls
density : 0.01 nulls
age_median : 0.01 nulls
age_under_10 : 0.01 nulls
age_10_to_19 : 0.01 nulls
age_20s : 0.01 nulls
age_30s : 0.01 nulls
age_40s : 0.01 nulls
age_50s : 0.01 nulls
age_60s : 0.01 nulls
age_70s : 0.01 nulls
age_over_80 : 0.01 nulls
male : 0.01 nulls
female : 0.01 nulls
married : 0.01 nulls
divorced : 0.01 nulls
never_married : 0.01 nulls
widowed : 0.01 nulls
family_size : 0.03 nulls
family_dual_income : 0.03 nulls
income_household_median : 0.03 nulls
income_household_under_5 : 0.03 nulls
income_household_5_to_10 : 0.03 nulls
income_household_10_to_15 : 0.03 nulls
income_household_15_to_20 : 0.03 nulls
income_household_20_to_25 : 0.03 nulls
income_household_25_to_35 : 0.03 nulls
income_household_35_to_50 : 0.03 nulls

Identifying Columns with High Missing Values in the Train Dataset

In [7]:
high_nulls = []
for column in train.columns:
    null_count = train[column].isnull().sum()
    percent_nulls = (null_count / train.shape[0]) * 100
    if percent_nulls > 1:
        high_nulls.append(column)
        print(column)


patient_race
payer_type
bmi
metastatic_first_novel_treatment
metastatic_first_novel_treatment_type


## Categorical and Numerical Columns Analysis

In [8]:
categorical_cols = raw_all.select_dtypes(include=['object', 'category']).columns
numerical_cols = raw_all.select_dtypes(include=['int64', 'float64']).columns
numerical_cols

Index(['index', 'patient_id', 'patient_zip3', 'patient_age', 'bmi',
       'population', 'density', 'age_median', 'age_under_10', 'age_10_to_19',
       'age_20s', 'age_30s', 'age_40s', 'age_50s', 'age_60s', 'age_70s',
       'age_over_80', 'male', 'female', 'married', 'divorced', 'never_married',
       'widowed', 'family_size', 'family_dual_income',
       'income_household_median', 'income_household_under_5',
       'income_household_5_to_10', 'income_household_10_to_15',
       'income_household_15_to_20', 'income_household_20_to_25',
       'income_household_25_to_35', 'income_household_35_to_50',
       'income_household_50_to_75', 'income_household_75_to_100',
       'income_household_100_to_150', 'income_household_150_over',
       'income_household_six_figure', 'income_individual_median',
       'home_ownership', 'housing_units', 'home_value', 'rent_median',
       'rent_burden', 'education_less_highschool', 'education_highschool',
       'education_some_college', 'education_b

## Missing BMI Values Imputation with CatBoostRegressor

Training a CatBoost model on the concatenated dataset of train and test (raw_all) because training the model on a larger dataset leads to more robust imputations.

In [9]:
# Label Encoding Categorical Columns in raw_all
le = LabelEncoder()
for col in categorical_cols:
    raw_all[col] = le.fit_transform(raw_all[col])
    
train_copy = train.copy()
test_copy = test.copy()    
    
miss_bmi = raw_all['bmi'].isna()
train_bmi = raw_all[~miss_bmi]
impute_bmi = raw_all[miss_bmi]

X = train_bmi.drop(columns=['index', 'patient_id', 'bmi'])
y = train_bmi['bmi']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=42)

# catboost_model = catboost.CatBoostRegressor()
# param_grid = {
#     'iterations': [500, 1000],
#     'depth': [3, 4, 6],
#     'colsample_bylevel': [1.0],
#     'learning_rate': [0.05, 0.1] }
# reg_cv = GridSearchCV(catboost_model, param_grid, verbose=3)
# reg_cv.fit(X_train, y_train)

# Train the CatBoostRegressor with the best hyperparameters
catboost_bmi_pred = catboost.CatBoostRegressor(depth=3, iterations= 500, learning_rate= 0.05, colsample_bylevel=1.0)
catboost_bmi_pred.fit(X_train, y_train)

catboost_predictions = catboost_bmi_pred.predict(X_test)
catboost_bmi_pred.score(X_test, y_test)

bmi_pred = catboost_bmi_pred.predict(impute_bmi.drop(columns=['index', 'patient_id', 'bmi']))

# Update missing 'bmi' values 
missing_bmi_index = raw_all[raw_all['bmi'].isna()].index.tolist()

for count, idx in enumerate(missing_bmi_index):
    raw_all.loc[idx, 'bmi'] = bmi_pred[count]

0:	learn: 5.6254786	total: 142ms	remaining: 1m 10s
1:	learn: 5.6219263	total: 145ms	remaining: 36s
2:	learn: 5.6168841	total: 147ms	remaining: 24.3s
3:	learn: 5.6148042	total: 149ms	remaining: 18.5s
4:	learn: 5.6110595	total: 152ms	remaining: 15s
5:	learn: 5.6066225	total: 154ms	remaining: 12.7s
6:	learn: 5.6032090	total: 157ms	remaining: 11s
7:	learn: 5.5981752	total: 159ms	remaining: 9.79s
8:	learn: 5.5948064	total: 162ms	remaining: 8.82s
9:	learn: 5.5912638	total: 164ms	remaining: 8.03s
10:	learn: 5.5886452	total: 166ms	remaining: 7.37s
11:	learn: 5.5862668	total: 168ms	remaining: 6.83s
12:	learn: 5.5839846	total: 170ms	remaining: 6.38s
13:	learn: 5.5812868	total: 173ms	remaining: 5.99s
14:	learn: 5.5791238	total: 175ms	remaining: 5.65s
15:	learn: 5.5758831	total: 177ms	remaining: 5.35s
16:	learn: 5.5732952	total: 179ms	remaining: 5.08s
17:	learn: 5.5714115	total: 181ms	remaining: 4.84s
18:	learn: 5.5695245	total: 183ms	remaining: 4.62s
19:	learn: 5.5666543	total: 185ms	remaining: 4

210:	learn: 5.3577761	total: 544ms	remaining: 746ms
211:	learn: 5.3563206	total: 546ms	remaining: 742ms
212:	learn: 5.3551087	total: 548ms	remaining: 738ms
213:	learn: 5.3543482	total: 550ms	remaining: 735ms
214:	learn: 5.3528374	total: 551ms	remaining: 731ms
215:	learn: 5.3519428	total: 554ms	remaining: 728ms
216:	learn: 5.3505682	total: 556ms	remaining: 725ms
217:	learn: 5.3491780	total: 558ms	remaining: 722ms
218:	learn: 5.3482317	total: 560ms	remaining: 718ms
219:	learn: 5.3469053	total: 562ms	remaining: 715ms
220:	learn: 5.3458655	total: 564ms	remaining: 711ms
221:	learn: 5.3448548	total: 565ms	remaining: 708ms
222:	learn: 5.3436575	total: 567ms	remaining: 704ms
223:	learn: 5.3425237	total: 569ms	remaining: 701ms
224:	learn: 5.3409228	total: 571ms	remaining: 698ms
225:	learn: 5.3398275	total: 573ms	remaining: 695ms
226:	learn: 5.3386111	total: 575ms	remaining: 692ms
227:	learn: 5.3371922	total: 577ms	remaining: 688ms
228:	learn: 5.3363791	total: 582ms	remaining: 688ms
229:	learn: 

429:	learn: 5.1517786	total: 935ms	remaining: 152ms
430:	learn: 5.1511147	total: 937ms	remaining: 150ms
431:	learn: 5.1502959	total: 939ms	remaining: 148ms
432:	learn: 5.1496434	total: 941ms	remaining: 146ms
433:	learn: 5.1488067	total: 944ms	remaining: 143ms
434:	learn: 5.1480493	total: 945ms	remaining: 141ms
435:	learn: 5.1470539	total: 948ms	remaining: 139ms
436:	learn: 5.1461344	total: 950ms	remaining: 137ms
437:	learn: 5.1454024	total: 952ms	remaining: 135ms
438:	learn: 5.1448414	total: 954ms	remaining: 133ms
439:	learn: 5.1440889	total: 956ms	remaining: 130ms
440:	learn: 5.1432901	total: 958ms	remaining: 128ms
441:	learn: 5.1424554	total: 960ms	remaining: 126ms
442:	learn: 5.1419297	total: 962ms	remaining: 124ms
443:	learn: 5.1409135	total: 964ms	remaining: 122ms
444:	learn: 5.1400726	total: 966ms	remaining: 119ms
445:	learn: 5.1391947	total: 967ms	remaining: 117ms
446:	learn: 5.1382047	total: 970ms	remaining: 115ms
447:	learn: 5.1373659	total: 972ms	remaining: 113ms
448:	learn: 

After BMI Imputation: Updating Train and Test Sets with Imputed Values

In [10]:
train = raw_all[:train.shape[0]].copy()
train['DiagPeriodL90D'] = target
test = raw_all[train.shape[0]:].copy().reset_index()

train['bmi'] = round(train['bmi'], 2)
test['bmi'] = round(test['bmi'], 2)

train.drop("index",axis=1,inplace=True)
test.drop(columns=['level_0' ,'index'],inplace=True)

#Recovering Original Categorical Columns After Label Encoding
train[categorical_cols] = train_copy[categorical_cols]
test[categorical_cols] = train_copy[categorical_cols]

## Data Imputation: Iterative Imputation for Numerical Features and Mode Fill for Categorical Features 

Iterative Imputation: Machine Learning technique used for handling missing values

In [11]:
categorical_cols = train.drop('DiagPeriodL90D',axis=1).select_dtypes(include=['object', 'category']).columns
numerical_cols = train.drop('DiagPeriodL90D',axis=1).select_dtypes(include=['int64', 'float64']).columns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(random_state=0)
train[numerical_cols]= imputer.fit_transform(train[numerical_cols])
test[numerical_cols]= imputer.transform(test[numerical_cols])

for col in categorical_cols:
       train[col] = train[col].fillna(train[col].mode()[0])
       test[col] =test[col].fillna(train[col].mode()[0])




## Feature Engineering

In [12]:
train['underweight']= np.where(train['bmi']<18.5, 1, 0) 
train['obese'] = np.where(train['bmi']>30, 1, 0) 
train['young_ind'] = np.where(train['patient_age']<40, 1, 0) 

test['underweight']= np.where(test['bmi']<18.5, 1, 0) 
test['obese'] = np.where(test['bmi']>30, 1, 0) 
test['young_ind'] = np.where(test['patient_age']<40, 1, 0) 


In [13]:
train['age_ratio'] = (train['age_under_10'] + train['age_10_to_19']) / (train['age_70s'] + train['age_over_80']+1)
test['age_ratio'] = (test['age_under_10'] + test['age_10_to_19']) / (test['age_70s'] + test['age_over_80']+1)

train['air_quality'] = train['Ozone'] + train['PM25'] + train['N02']
test['air_quality'] = test['Ozone'] + test['PM25'] + test['N02']

train['race_ration'] = (train['race_black'] + train['race_asian'] + train['race_native'] + train['race_pacific']+ train['race_other']+ train['race_multiple']+ train['hispanic'])/train['race_white']
test['race_ration'] = (test['race_black'] + test['race_asian'] + test['race_native'] + test['race_pacific']+ test['race_other']+ test['race_multiple']+ test['hispanic'])/test['race_white']

 Preprocessing and Modifications in the 'breast_cancer_diagnosis_desc' Column

In [14]:
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lowercasing
    tokens = [token.lower() for token in tokens]
    
    # Remove Punctuation
    tokens = [token for token in tokens if token.isalnum()]
    
    # Remove Stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    return ' '.join(tokens)

####################################################################

print('BEFORE PREPROCESSING:')
print('Unique values of breast_cancer_diagnosis_desc column:\n', train['breast_cancer_diagnosis_desc'].unique(),'\n')
print('Number of Unique values of breast_cancer_diagnosis_desc column:', train['breast_cancer_diagnosis_desc'].nunique(),'\n\n')

train['breast_cancer_diagnosis_desc']=train['breast_cancer_diagnosis_desc'].apply(preprocess_text)
test['breast_cancer_diagnosis_desc']=test['breast_cancer_diagnosis_desc'].apply(preprocess_text)


train['breast_cancer_diagnosis_desc']=train['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('malignant', 'malig').strip())
train['breast_cancer_diagnosis_desc']=train['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('neoplm', 'neoplasm').strip())
train['breast_cancer_diagnosis_desc']=train['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('female', '').strip())
train['breast_cancer_diagnosis_desc']=train['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('breast', '').strip())
train['breast_cancer_diagnosis_desc']=train['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('male', '').strip())
train['breast_cancer_diagnosis_desc']=train['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('unspecified', 'unsp').strip())
train['breast_cancer_diagnosis_desc']=train['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('ovrlp', 'overlapping').strip())
train['breast_cancer_diagnosis_desc']=train['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('sites', '').strip())
train['breast_cancer_diagnosis_desc']=train['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('site', '').strip())
train['breast_cancer_diagnosis_desc']=train['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('  ', ' ').strip())

test['breast_cancer_diagnosis_desc']=test['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('malignant', 'malig').strip())
test['breast_cancer_diagnosis_desc']=test['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('neoplm', 'neoplasm').strip())
test['breast_cancer_diagnosis_desc']=test['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('female', '').strip())
test['breast_cancer_diagnosis_desc']=test['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('breast', '').strip())
test['breast_cancer_diagnosis_desc']=test['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('male', '').strip())
test['breast_cancer_diagnosis_desc']=test['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('unspecified', 'unsp').strip())
test['breast_cancer_diagnosis_desc']=test['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('ovrlp', 'overlapping').strip())
test['breast_cancer_diagnosis_desc']=test['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('sites', '').strip())
test['breast_cancer_diagnosis_desc']=test['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('site', '').strip())
test['breast_cancer_diagnosis_desc']=test['breast_cancer_diagnosis_desc'].apply(lambda x: x.replace('  ', ' ').strip())

print('AFTER PREPROCESSING:')
print('Unique values of breast_cancer_diagnosis_desc column:\n', train['breast_cancer_diagnosis_desc'].unique(),'\n')
print('Number of Unique values of breast_cancer_diagnosis_desc column:', train['breast_cancer_diagnosis_desc'].nunique())

BEFORE PREPROCESSING:
Unique values of breast_cancer_diagnosis_desc column:
 ['Malignant neoplasm of unsp site of unspecified female breast'
 'Malig neoplm of upper-outer quadrant of right female breast'
 'Malignant neoplasm of central portion of left female breast'
 'Malig neoplasm of upper-inner quadrant of left female breast'
 'Malignant neoplasm of breast (female), unspecified'
 'Malignant neoplasm of unspecified site of left female breast'
 'Malig neoplasm of lower-outer quadrant of left female breast'
 'Malignant neoplasm of upper-outer quadrant of female breast'
 'Malig neoplasm of upper-outer quadrant of left female breast'
 'Malignant neoplasm of ovrlp sites of left female breast'
 'Malignant neoplasm of unsp site of right female breast'
 'Malig neoplasm of lower-inner quadrant of left female breast'
 'Malig neoplm of lower-inner quadrant of right female breast'
 'Malignant neoplasm of central portion of right female breast'
 'Malignant neoplasm of central portion of female br

# Feature Selection

Dropping Unrelated columns

In [15]:
train.drop(columns= ['population','density','age_median','age_under_10','age_10_to_19','age_20s','age_30s',
                     'age_40s','age_50s','age_60s','age_70s','age_over_80','male','female','widowed','divorced',
                     'family_size','family_dual_income','income_household_under_5','income_household_5_to_10',
                     'income_household_10_to_15','income_household_15_to_20','income_household_20_to_25',
                     'income_household_25_to_35','income_household_35_to_50','income_household_50_to_75',
                     'income_household_75_to_100','income_household_100_to_150','income_household_150_over',
                     'income_household_six_figure','home_value','rent_median','rent_burden','education_bachelors',
                     'education_stem_degree','race_white','race_black','race_asian','race_native','race_pacific','race_other',
                     'race_multiple','limited_english','Ozone','PM25','N02','hispanic','education_less_highschool'],axis=1,inplace=True)

test.drop(columns= ['population','density','age_median','age_under_10','age_10_to_19','age_20s','age_30s',
                     'age_40s','age_50s','age_60s','age_70s','age_over_80','male','female','widowed','divorced',
                     'family_size','family_dual_income','income_household_under_5','income_household_5_to_10',
                     'income_household_10_to_15','income_household_15_to_20','income_household_20_to_25',
                     'income_household_25_to_35','income_household_35_to_50','income_household_50_to_75',
                     'income_household_75_to_100','income_household_100_to_150','income_household_150_over',
                     'income_household_six_figure','home_value','rent_median','rent_burden','education_bachelors',
                     'education_stem_degree','race_white','race_black','race_asian','race_native','race_pacific','race_other',
                     'race_multiple','limited_english','Ozone','PM25','N02','hispanic','education_less_highschool'],axis=1,inplace=True)

In [16]:
train.drop(columns= high_nulls , axis=1,inplace=True)
test.drop(columns= high_nulls , axis=1,inplace=True)

In [17]:
train.shape

(12906, 36)

In [18]:
test.shape

(5792, 35)

## Label Encoding Categorical Columns  

In [19]:
categorical_cols = test.select_dtypes(include=['object', 'category']).columns
numerical_cols = test.select_dtypes(include=['int64', 'float64']).columns

le = LabelEncoder()
for col in categorical_cols:
    train[col] = le.fit_transform(train[col])
    test[col] = le.fit_transform(test[col])

## Grouped Cross-Validated LightGBM Model

In [20]:
target = "DiagPeriodL90D"
y=train[target]
X=train.drop([target],axis=1)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

Hyper Parameter Tuning

In [22]:
# from sklearn.model_selection import GridSearchCV
# from lightgbm import LGBMClassifier
# from sklearn.model_selection import GroupKFold

# gf = GroupKFold(n_splits=4)

# groups = np.array(train.Division)

# param_grid = {
#     'n_estimators': [10, 100, 500],  
#     'learning_rate': [0.01, 0.03, 0.1,0.06,0.5],
#     'num_leaves': [5, 10, 20],
#     'max_depth': [3, 5, 7]
# }

# base_model = LGBMClassifier()

# grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid,
#                            scoring='roc_auc', cv=gf.split(X, y, groups=groups),
#                            verbose=1, n_jobs=-1)

# grid_search.fit(X, y, groups=groups)

# print("Best parameters found: ", grid_search.best_params_)


In [23]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import GroupKFold

gf = GroupKFold(n_splits=4)
groups = np.array(train.Division)

test_probs = []
train_probs = []

y_train = train[[target]].copy()

for i, (train_idx, val_idx) in enumerate(gf.split(X, y_train["DiagPeriodL90D"], groups)):
    X_train_fold = X.iloc[train_idx, :]
    y_train_fold = y_train.iloc[train_idx]["DiagPeriodL90D"]

    X_val_fold = X.iloc[val_idx, :]
    y_val_fold = y_train.iloc[val_idx]["DiagPeriodL90D"]

    print("Fold: ",(i + 1))

    learner = LGBMClassifier( n_estimators=500, learning_rate=0.1, num_leaves=10, max_depth=7, early_stopping_rounds=200 )
    

    learner.fit(X_train_fold, y_train_fold, eval_metric="auc", eval_set=[(X_train_fold, y_train_fold), (X_val_fold, y_val_fold)])

 
    test_probs.append(pd.Series(learner.predict_proba(test)[:, -1],
                                 index=test.index, name="fold_" + str(i)))

test_probs = pd.concat(test_probs, axis=1).mean(axis=1)


Fold:  1
[1]	training's auc: 0.797392	training's binary_logloss: 0.626469	valid_1's auc: 0.790486	valid_1's binary_logloss: 0.625504
[2]	training's auc: 0.802808	training's binary_logloss: 0.598716	valid_1's auc: 0.791036	valid_1's binary_logloss: 0.597369
[3]	training's auc: 0.803199	training's binary_logloss: 0.576258	valid_1's auc: 0.789516	valid_1's binary_logloss: 0.574645
[4]	training's auc: 0.808268	training's binary_logloss: 0.557932	valid_1's auc: 0.791708	valid_1's binary_logloss: 0.556086
[5]	training's auc: 0.810416	training's binary_logloss: 0.542635	valid_1's auc: 0.792193	valid_1's binary_logloss: 0.541355
[6]	training's auc: 0.811718	training's binary_logloss: 0.5299	valid_1's auc: 0.791977	valid_1's binary_logloss: 0.528667
[7]	training's auc: 0.814495	training's binary_logloss: 0.519235	valid_1's auc: 0.793402	valid_1's binary_logloss: 0.518346
[8]	training's auc: 0.815912	training's binary_logloss: 0.510144	valid_1's auc: 0.793028	valid_1's binary_logloss: 0.509601
[

[143]	training's auc: 0.897025	training's binary_logloss: 0.404078	valid_1's auc: 0.798536	valid_1's binary_logloss: 0.465334
[144]	training's auc: 0.897069	training's binary_logloss: 0.403791	valid_1's auc: 0.798584	valid_1's binary_logloss: 0.465386
[145]	training's auc: 0.897365	training's binary_logloss: 0.40352	valid_1's auc: 0.798657	valid_1's binary_logloss: 0.465374
[146]	training's auc: 0.897508	training's binary_logloss: 0.403207	valid_1's auc: 0.798773	valid_1's binary_logloss: 0.465573
[147]	training's auc: 0.89768	training's binary_logloss: 0.402889	valid_1's auc: 0.798581	valid_1's binary_logloss: 0.465699
[148]	training's auc: 0.897863	training's binary_logloss: 0.402587	valid_1's auc: 0.798455	valid_1's binary_logloss: 0.465695
[149]	training's auc: 0.897922	training's binary_logloss: 0.402281	valid_1's auc: 0.798528	valid_1's binary_logloss: 0.465566
[150]	training's auc: 0.898132	training's binary_logloss: 0.401936	valid_1's auc: 0.798534	valid_1's binary_logloss: 0.4

[10]	training's auc: 0.823631	training's binary_logloss: 0.499761	valid_1's auc: 0.800496	valid_1's binary_logloss: 0.488615
[11]	training's auc: 0.823408	training's binary_logloss: 0.494145	valid_1's auc: 0.798735	valid_1's binary_logloss: 0.483544
[12]	training's auc: 0.825213	training's binary_logloss: 0.48887	valid_1's auc: 0.800388	valid_1's binary_logloss: 0.478428
[13]	training's auc: 0.826924	training's binary_logloss: 0.484672	valid_1's auc: 0.79793	valid_1's binary_logloss: 0.475296
[14]	training's auc: 0.828467	training's binary_logloss: 0.480977	valid_1's auc: 0.797574	valid_1's binary_logloss: 0.472029
[15]	training's auc: 0.828846	training's binary_logloss: 0.477819	valid_1's auc: 0.797364	valid_1's binary_logloss: 0.469564
[16]	training's auc: 0.829773	training's binary_logloss: 0.474813	valid_1's auc: 0.797326	valid_1's binary_logloss: 0.467138
[17]	training's auc: 0.831055	training's binary_logloss: 0.472353	valid_1's auc: 0.797305	valid_1's binary_logloss: 0.465482
[1

[163]	training's auc: 0.900276	training's binary_logloss: 0.400869	valid_1's auc: 0.780385	valid_1's binary_logloss: 0.472319
[164]	training's auc: 0.900579	training's binary_logloss: 0.400493	valid_1's auc: 0.780628	valid_1's binary_logloss: 0.472226
[165]	training's auc: 0.900772	training's binary_logloss: 0.400132	valid_1's auc: 0.780906	valid_1's binary_logloss: 0.472121
[166]	training's auc: 0.900825	training's binary_logloss: 0.399794	valid_1's auc: 0.78085	valid_1's binary_logloss: 0.472292
[167]	training's auc: 0.90129	training's binary_logloss: 0.399313	valid_1's auc: 0.780887	valid_1's binary_logloss: 0.472265
[168]	training's auc: 0.901489	training's binary_logloss: 0.399058	valid_1's auc: 0.780845	valid_1's binary_logloss: 0.472307
[169]	training's auc: 0.902244	training's binary_logloss: 0.398634	valid_1's auc: 0.780514	valid_1's binary_logloss: 0.472549
[170]	training's auc: 0.902671	training's binary_logloss: 0.398378	valid_1's auc: 0.780588	valid_1's binary_logloss: 0.4

[95]	training's auc: 0.880588	training's binary_logloss: 0.419653	valid_1's auc: 0.776412	valid_1's binary_logloss: 0.477627
[96]	training's auc: 0.880826	training's binary_logloss: 0.419329	valid_1's auc: 0.776337	valid_1's binary_logloss: 0.477797
[97]	training's auc: 0.881163	training's binary_logloss: 0.41887	valid_1's auc: 0.776402	valid_1's binary_logloss: 0.477807
[98]	training's auc: 0.881378	training's binary_logloss: 0.418462	valid_1's auc: 0.776769	valid_1's binary_logloss: 0.477486
[99]	training's auc: 0.881684	training's binary_logloss: 0.418204	valid_1's auc: 0.777064	valid_1's binary_logloss: 0.477386
[100]	training's auc: 0.882151	training's binary_logloss: 0.417813	valid_1's auc: 0.777155	valid_1's binary_logloss: 0.477572
[101]	training's auc: 0.882465	training's binary_logloss: 0.417415	valid_1's auc: 0.777147	valid_1's binary_logloss: 0.477581
[102]	training's auc: 0.882902	training's binary_logloss: 0.416984	valid_1's auc: 0.776805	valid_1's binary_logloss: 0.47769

Fold:  4
[1]	training's auc: 0.801862	training's binary_logloss: 0.62192	valid_1's auc: 0.782899	valid_1's binary_logloss: 0.639839
[2]	training's auc: 0.805173	training's binary_logloss: 0.593786	valid_1's auc: 0.76427	valid_1's binary_logloss: 0.613932
[3]	training's auc: 0.811346	training's binary_logloss: 0.571026	valid_1's auc: 0.760666	valid_1's binary_logloss: 0.59271
[4]	training's auc: 0.816639	training's binary_logloss: 0.55238	valid_1's auc: 0.770035	valid_1's binary_logloss: 0.574787
[5]	training's auc: 0.818805	training's binary_logloss: 0.536994	valid_1's auc: 0.77696	valid_1's binary_logloss: 0.56025
[6]	training's auc: 0.82151	training's binary_logloss: 0.524088	valid_1's auc: 0.778048	valid_1's binary_logloss: 0.548002
[7]	training's auc: 0.82301	training's binary_logloss: 0.513344	valid_1's auc: 0.781184	valid_1's binary_logloss: 0.537922
[8]	training's auc: 0.824624	training's binary_logloss: 0.504278	valid_1's auc: 0.782415	valid_1's binary_logloss: 0.529711
[9]	tra

[139]	training's auc: 0.893661	training's binary_logloss: 0.401259	valid_1's auc: 0.783914	valid_1's binary_logloss: 0.494041
[140]	training's auc: 0.894087	training's binary_logloss: 0.400877	valid_1's auc: 0.783741	valid_1's binary_logloss: 0.494214
[141]	training's auc: 0.894301	training's binary_logloss: 0.400591	valid_1's auc: 0.783849	valid_1's binary_logloss: 0.493992
[142]	training's auc: 0.894367	training's binary_logloss: 0.400327	valid_1's auc: 0.783701	valid_1's binary_logloss: 0.494061
[143]	training's auc: 0.894684	training's binary_logloss: 0.400041	valid_1's auc: 0.78342	valid_1's binary_logloss: 0.494115
[144]	training's auc: 0.894751	training's binary_logloss: 0.399801	valid_1's auc: 0.782987	valid_1's binary_logloss: 0.494403
[145]	training's auc: 0.895041	training's binary_logloss: 0.399615	valid_1's auc: 0.783303	valid_1's binary_logloss: 0.494644
[146]	training's auc: 0.895342	training's binary_logloss: 0.39936	valid_1's auc: 0.784444	valid_1's binary_logloss: 0.4

In [24]:
test_probs

0       0.713014
1       0.745705
2       0.728055
3       0.773301
4       0.198957
          ...   
5787    0.754027
5788    0.780311
5789    0.806904
5790    0.748486
5791    0.773567
Length: 5792, dtype: float64

In [25]:
sample["DiagPeriodL90D"] = test_probs
sample

Unnamed: 0,patient_id,DiagPeriodL90D
0,573710,0.713014
1,593679,0.745705
2,184532,0.728055
3,447383,0.773301
4,687972,0.198957
...,...,...
5787,977076,0.754027
5788,922960,0.780311
5789,759690,0.806904
5790,911717,0.748486


In [26]:
sample.to_csv('lgbm.csv', index=False)
sample

Unnamed: 0,patient_id,DiagPeriodL90D
0,573710,0.713014
1,593679,0.745705
2,184532,0.728055
3,447383,0.773301
4,687972,0.198957
...,...,...
5787,977076,0.754027
5788,922960,0.780311
5789,759690,0.806904
5790,911717,0.748486
