In [88]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd 
import numpy as np
from scipy import stats 

In [89]:
# Load the iris data set and keep only its first 100 samples, which (per the
# original note) cover two balanced classes, 0 and 1.
X, y = load_iris(return_X_y=True)
binary_X = X[:100]
binary_y = y[:100]

# Shapes of the full data set, before the 100-sample slice.
print(X.shape, y.shape)

(150, 4) (150,)


In [91]:
# Hold out 33% of the binary subset as a test set, then fit a logistic
# regression model on the remaining training rows.
X_train, X_test, y_train, y_test = train_test_split(
    binary_X,
    binary_y,
    test_size=0.33,
    random_state=42,
)
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

In [92]:
# Score the test set. predict_proba returns one column per class, ordered like
# clf.classes_; with labels 0/1, column 1 is the probability of class 1.
# (The previous bare clf.predict(X_test) call was dropped: its result was
# neither stored nor displayed.)
y_pred = clf.predict_proba(X_test)
# Kept as a plain list so downstream cells see the same type as before.
y_prod = list(y_pred[:, 1])

In [93]:
# Pair every predicted probability with its true label, then cut the
# probabilities into 10 quantile-based bins.
df = pd.DataFrame({'prediction': y_prod, 'labels': y_test})
df = df.assign(bins=pd.qcut(df['prediction'], q=10))
df.head()

Unnamed: 0,prediction,labels,bins
0,0.998555,1,"(0.997, 0.999]"
1,0.982543,1,"(0.967, 0.983]"
2,0.996403,1,"(0.991, 0.997]"
3,0.035725,0,"(0.032, 0.036]"
4,0.065238,0,"(0.0466, 0.967]"


## How to calculate the KS statistic
1. Sort the samples by predicted probability in descending order and group them into bins (normally 10).
2. Within each bin, count the number of 0 events and 1 events.
3. Calculate the cumulative percentage of 0 events and of 1 events up to and including each bin.
4. Calculate the difference between the two cumulative percentages in each bin; the maximum difference is the KS statistic of the sample.

In [99]:
def ks_stats(df, bin = 'bins', pred_col = 'prediction', labels = 'labels'):
    """Build a per-bin Kolmogorov-Smirnov (KS) table for a binary classifier.

    Rows of ``df`` are grouped by ``bin``. The groups are then walked from the
    last group key to the first (for ``pd.qcut`` bins that means the highest
    scores first) while the shares of true and false events are accumulated.
    The KS statistic of the sample is ``result['ks'].max()``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per scored sample.
    bin : str
        Name of the column holding each sample's bin. (The name shadows the
        builtin ``bin``; it is kept so existing keyword callers keep working.)
    pred_col : str
        Name of the column holding the predicted score.
    labels : str
        Name of the column holding the outcome; its per-bin sum is taken as
        the number of true events, so it is expected to be 0/1 or boolean.

    Returns
    -------
    pandas.DataFrame
        One row per bin, ordered from the last bin to the first, with columns
        ``bin`` (label rendered as a string), ``max``, ``true_events``,
        ``count``, ``false_events``, ``true_pct``, ``false_pct``,
        ``cum_true``, ``cum_false`` and ``ks``. The index holds each bin's
        ascending position, so it counts down.

    Notes
    -----
    If the labels contain no true events, or no false events, the matching
    percentage columns evaluate to NaN (0/0) because that class total is zero.
    """
    # observed=False keeps empty categorical bins (the historical default) and
    # is spelled out so newer pandas versions do not warn about the default.
    grouped = df.groupby(bin, observed=False)
    gdf = pd.DataFrame({
        'max': grouped[pred_col].max(),
        'true_events': grouped[labels].sum(),
        'count': grouped.size(),
    })

    # groupby returns the bins in ascending key order; reverse by position
    # rather than by the stringified label, because a string sort misplaces
    # labels such as 10 and 11 relative to 2.
    gdf = gdf.reset_index().sort_index(ascending=False)
    # Bin labels have always been returned as strings; keep that contract.
    gdf[bin] = gdf[bin].astype(str)

    gdf['false_events'] = gdf['count'] - gdf['true_events']
    gdf['true_pct'] = gdf['true_events'] / gdf['true_events'].sum()
    gdf['false_pct'] = gdf['false_events'] / gdf['false_events'].sum()
    gdf['cum_true'] = gdf['true_pct'].cumsum()
    gdf['cum_false'] = gdf['false_pct'].cumsum()

    # Gap between the two cumulative distributions; its maximum is the KS value.
    gdf['ks'] = gdf['cum_true'] - gdf['cum_false']
    return gdf

In [101]:
# Build the per-bin KS table for the test-set predictions and display it.
ks_df = ks_stats(df)
ks_df

Unnamed: 0,bins,max,true_events,count,false_events,true_pct,false_pct,cum_true,cum_false,ks
9,"(0.997, 0.999]",0.998555,4,4,0,0.285714,0.0,0.285714,0.0,0.285714
8,"(0.991, 0.997]",0.996403,3,3,0,0.214286,0.0,0.5,0.0,0.5
7,"(0.983, 0.991]",0.990842,3,3,0,0.214286,0.0,0.714286,0.0,0.714286
6,"(0.967, 0.983]",0.982543,3,3,0,0.214286,0.0,0.928571,0.0,0.928571
5,"(0.0466, 0.967]",0.965853,1,3,2,0.071429,0.105263,1.0,0.105263,0.894737
4,"(0.036, 0.0466]",0.046563,0,4,4,0.0,0.210526,1.0,0.315789,0.684211
3,"(0.032, 0.036]",0.035725,0,3,3,0.0,0.157895,1.0,0.473684,0.526316
2,"(0.0253, 0.032]",0.030046,0,3,3,0.0,0.157895,1.0,0.631579,0.368421
1,"(0.0214, 0.0253]",0.024989,0,3,3,0.0,0.157895,1.0,0.789474,0.210526
0,"(0.007019999999999999, 0.0214]",0.021217,0,4,4,0.0,0.210526,1.0,1.0,0.0


Therefore, the KS value for our test set is 0.93, which means the model is good at differentiating the two classes.
Below is the same statistic computed directly with the SciPy package. It is slightly different from the value we calculated above because SciPy does not group the samples into bins; you can think of it as treating every single sample as its own bucket.

In [98]:
from scipy.stats import ks_2samp

# Two-sample KS test on the raw, un-binned predicted probabilities:
# scores of the label-0 rows versus scores of the label-1 rows.
ks_2samp(
    df.loc[df['labels'] == 0, 'prediction'],
    df.loc[df['labels'] == 1, 'prediction'],
)

Ks_2sampResult(statistic=1.0, pvalue=2.442571478411537e-09)

In [39]:
# Reference: https://www.listendata.com/2019/07/KS-Statistics-Python.html


In [43]:
# Plot the cumulative share of true and false events across the score bins
# (in ks_df row order) and mark the bin where the gap between the two curves,
# i.e. the KS statistic, is largest.
import matplotlib.pyplot as plt

# 1-based position of each bin along the x axis, following ks_df's row order.
bin_positions = np.arange(1, len(ks_df) + 1)
# Row position (not index label) of the largest cum_true - cum_false gap.
ks_position = int(np.argmax(ks_df['ks'].values))

fig, ax = plt.subplots()
ax.step(bin_positions, ks_df['cum_true'], where='post', label='cum_true')
ax.step(bin_positions, ks_df['cum_false'], where='post', label='cum_false')
# Dashed red line spanning the largest gap between the two cumulative curves.
ax.vlines(
    bin_positions[ks_position],
    ks_df['cum_false'].iloc[ks_position],
    ks_df['cum_true'].iloc[ks_position],
    color='r',
    linestyle='dashed',
    lw=4,
    label='KS = {:.3f}'.format(ks_df['ks'].iloc[ks_position]),
)
ax.set_ylim([0, 1])
ax.set_xlabel('Bin (ks_df row order)')
ax.set_ylabel('Cumulative share of events')
ax.set_title('Cumulative true vs. false events by bin')
ax.grid(True)
ax.legend()
plt.show()