Dataset Description

The dataset for this competition (both train and test) was generated from a deep learning model trained on the Kidney Stone Prediction based on Urine Analysis dataset. Feature distributions are close to, but not exactly the same, as the original. Feel free to use the original dataset as part of this competition, both to explore differences as well as to see whether incorporating the original in training improves model performance.
Files

    train.csv - the training dataset; target is the likelihood of a kidney stone being present
    test.csv - the test dataset; your objective is to predict the probability of target
    sample_submission.csv - a sample submission file in the correct format


In [1]:
import pandas as pd
import numpy as np
import sklearn


In [8]:
df = pd.read_csv('d:/Nowy folder/train.csv')

In [9]:
df.reset_index(inplace=True)
df

Unnamed: 0,index,id,gravity,ph,osmo,cond,urea,calc,target
0,0,0,1.013,6.19,443,14.8,124,1.45,0
1,1,1,1.025,5.40,703,23.6,394,4.18,0
2,2,2,1.009,6.13,371,24.5,159,9.04,0
3,3,3,1.021,4.91,442,20.8,398,6.63,1
4,4,4,1.021,5.53,874,17.8,385,2.21,1
5,5,5,1.025,6.90,947,28.4,395,2.64,1
6,6,6,1.008,5.09,371,15.5,159,2.17,1
7,7,7,1.015,5.53,450,8.1,170,1.16,0
8,8,8,1.025,7.38,736,25.3,418,1.52,0
9,9,9,1.017,6.61,527,20.0,75,4.49,1


In [10]:
df.set_index("id",inplace = True)

In [12]:
df.drop("index",axis=1,inplace=True)

In [13]:
df

Unnamed: 0_level_0,gravity,ph,osmo,cond,urea,calc,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.013,6.19,443,14.8,124,1.45,0
1,1.025,5.40,703,23.6,394,4.18,0
2,1.009,6.13,371,24.5,159,9.04,0
3,1.021,4.91,442,20.8,398,6.63,1
4,1.021,5.53,874,17.8,385,2.21,1
5,1.025,6.90,947,28.4,395,2.64,1
6,1.008,5.09,371,15.5,159,2.17,1
7,1.015,5.53,450,8.1,170,1.16,0
8,1.025,7.38,736,25.3,418,1.52,0
9,1.017,6.61,527,20.0,75,4.49,1


In [14]:
X = df.drop('target',axis=1)
y = df['target']

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
#Setting an estimator

In [36]:
clf = RandomForestClassifier()
grid_cv = {'n_estimators': np.arange(50,100,10),
           'max_features':['auto']}
    

In [44]:
df_clf = GridSearchCV(clf,param_grid = grid_cv,cv=5)

In [45]:
#Training an estimator

In [46]:
df_clf.fit(X,y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [47]:
#Downloading test data

In [49]:
df2 = pd.read_csv('d:/Nowy folder/test.csv')
df2.set_index('id',inplace=True)
df2

Unnamed: 0_level_0,gravity,ph,osmo,cond,urea,calc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
414,1.017,5.24,345,11.5,152,1.16
415,1.020,5.68,874,29.0,385,3.46
416,1.024,5.36,698,19.5,354,13.00
417,1.020,5.33,668,25.3,252,3.46
418,1.011,5.87,567,29.0,457,2.36
419,1.023,5.77,749,20.0,385,4.49
420,1.015,5.94,461,17.4,195,3.93
421,1.024,5.53,1236,29.0,620,12.68
422,1.010,6.79,242,11.2,64,0.65
423,1.005,7.20,541,17.8,195,1.03


In [52]:
#Predicting likelihood of a kidney stone being present target = 1,or not target = 0 
target  = df_clf.predict(df2)
target

array([0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0], dtype=int64)

In [54]:
df2['target'] = target

In [58]:
df2.reset_index(inplace=True)
df2

Unnamed: 0,index,id,gravity,ph,osmo,cond,urea,calc,target
0,0,414,1.017,5.24,345,11.5,152,1.16,0
1,1,415,1.020,5.68,874,29.0,385,3.46,1
2,2,416,1.024,5.36,698,19.5,354,13.00,1
3,3,417,1.020,5.33,668,25.3,252,3.46,1
4,4,418,1.011,5.87,567,29.0,457,2.36,0
5,5,419,1.023,5.77,749,20.0,385,4.49,1
6,6,420,1.015,5.94,461,17.4,195,3.93,0
7,7,421,1.024,5.53,1236,29.0,620,12.68,1
8,8,422,1.010,6.79,242,11.2,64,0.65,0
9,9,423,1.005,7.20,541,17.8,195,1.03,0


In [60]:
df3 = pd.DataFrame()
df3['id'] = df2['id']
df3['target'] = df2['target']

In [62]:
#df3 -> Data frame with demanded format of output data
df3

Unnamed: 0,id,target
0,414,0
1,415,1
2,416,1
3,417,1
4,418,0
5,419,1
6,420,0
7,421,1
8,422,0
9,423,0
