# Considered models

Let's look at the popular classification models: 
- **K-Nearest Neighbors (KNN) Classification**
- **DecisionTreeClassifier**
- **LogRegression**
- **LinearSVC**

In [98]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

In [3]:
data_clients = pd.read_csv('prepared_data.csv')
data_clients.head()

Unnamed: 0,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_credit_cost_rate,pre_util,enc_paym_5,...,enc_paym_7,enc_paym_8,enc_paym_9,enc_paym_10,enc_paym_11,enc_paym_12,enc_paym_13,enc_paym_14,enc_loans_credit_status,flag
0,8.1,7.6,7.1,7.5,11.4,10.7,9.6,8.0,16.0,0.0,...,0.0,1.5,1.5,1.5,2.5,3.0,3.0,3.0,3.0,0
1,11.428571,7.642857,6.642857,7.928571,10.071429,7.0,8.142857,4.0,16.0,0.0,...,2.5,1.5,1.0,0.0,2.5,3.0,3.0,3.0,3.0,0
2,8.333333,10.666667,7.0,6.0,5.0,9.0,2.0,4.0,6.0,0.0,...,0.0,0.0,0.0,3.0,4.0,3.0,3.0,3.0,2.0,0
3,7.0,7.333333,7.6,7.8,5.2,8.133333,9.866667,4.0,16.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,3.0,0
4,12.0,9.0,4.0,8.0,1.0,11.0,12.0,0.0,16.0,1.0,...,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,2.0,0


## Using StandardScaler or not

*The data spread is not too large, so it is unclear whether the data needs **to be scaled**, but we will try two ways.*
*Second problem - **unbalanced data**, we should try different methods to deal with it.* *Third problem - **dataset size**, let's try different sizes.*

In [10]:
data_clients.shape

(3000000, 21)

In [18]:
3000000*0.09

270000.0

In [5]:
data_clients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 21 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   pre_since_opened            float64
 1   pre_since_confirmed         float64
 2   pre_pterm                   float64
 3   pre_fterm                   float64
 4   pre_till_pclose             float64
 5   pre_till_fclose             float64
 6   pre_loans_credit_limit      float64
 7   pre_loans_credit_cost_rate  float64
 8   pre_util                    float64
 9   enc_paym_5                  float64
 10  enc_paym_6                  float64
 11  enc_paym_7                  float64
 12  enc_paym_8                  float64
 13  enc_paym_9                  float64
 14  enc_paym_10                 float64
 15  enc_paym_11                 float64
 16  enc_paym_12                 float64
 17  enc_paym_13                 float64
 18  enc_paym_14                 float64
 19  enc_loans_credit_stat

In [6]:
features_type_change = data_clients.columns

In [7]:
for i in range(len(features_type_change)):
    data_clients[features_type_change[i]] = data_clients[features_type_change[i]].astype('int')

## *K-Nearest Neighbors (KNN) Classification*

In [40]:
fractions = [.015, .025, .04]

oversample = RandomOverSampler(sampling_strategy=0.5)
undersample = RandomUnderSampler(sampling_strategy=0.5)
os= SMOTE()

balanced_methods = [oversample, undersample, os]
accuracy_scaled =[]
accuracy_not_scaled=[]
accuracy=[]

for i in range(len(fractions)):
    print(fractions[i])
    sample_prepared = data_clients.sample(frac =fractions[i]) 
    features= sample_prepared.columns.tolist()[:-1]
    target=['flag']
    
    for l in range(len(balanced_methods)):
        X_resampled, y_resampled = balanced_methods[l].fit_resample(sample_prepared[features], sample_prepared[target])
        X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)    
        
        scaler= StandardScaler()
        scaler.fit(X_train)
        X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
        X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
        knn_1=KNeighborsClassifier(n_neighbors=4)
        knn_1.fit(X_train,y_train)
        knn_2=KNeighborsClassifier(n_neighbors=4)
        knn_2.fit(X_train_scaled,y_train)
        pred_test_1 = knn_1.predict(X_test)
        pred_test_2 = knn_2.predict(X_test_scaled)
        accuracy.append([fractions[i], balanced_methods[l],'not_scaled: ', roc_auc_score(y_test, pred_test_1), 'scaled: ', roc_auc_score(y_test, pred_test_2)])
            

0.015


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


0.025


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


0.04


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


In [41]:
accuracy

[[0.015,
  RandomOverSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.9528406008679559,
  'scaled: ',
  0.9592745495351757],
 [0.015,
  RandomUnderSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.5058362675620398,
  'scaled: ',
  0.5231271697197503],
 [0.015,
  SMOTE(),
  'not_scaled: ',
  0.9034404602708566,
  'scaled: ',
  0.9035023547972515],
 [0.025,
  RandomOverSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.9547378525216457,
  'scaled: ',
  0.9594435485419186],
 [0.025,
  RandomUnderSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.5076374041915732,
  'scaled: ',
  0.5131059383044636],
 [0.025,
  SMOTE(),
  'not_scaled: ',
  0.9051054203442737,
  'scaled: ',
  0.899335992225017],
 [0.04,
  RandomOverSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.9511287673254412,
  'scaled: ',
  0.9574962915020762],
 [0.04,
  RandomUnderSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.522688436329588,
  'scaled: ',
  0.5236579275905118],
 [0.04,
  SMOTE(),
  'not_s

## *DecisionTreeClassifier*

In [33]:
fractions = [.025, .05, .07]

oversample = RandomOverSampler(sampling_strategy=0.5)
undersample = RandomUnderSampler(sampling_strategy=0.5)
os= SMOTE()

balanced_methods = [oversample, undersample, os]
accuracy_scaled =[]
accuracy_not_scaled=[]
accuracy=[]

for i in range(len(fractions)):
    print(fractions[i])
    sample_prepared = data_clients.sample(frac =fractions[i]) 
    features= sample_prepared.columns.tolist()[:-1]
    target=['flag']
    
    for l in range(len(balanced_methods)):
        X_resampled, y_resampled = balanced_methods[l].fit_resample(sample_prepared[features], sample_prepared[target])
        X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)    
        
        scaler= StandardScaler()
        scaler.fit(X_train)
        X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
        X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
        tree_1=DecisionTreeClassifier()
        tree_1.fit(X_train,y_train)
        tree_2=DecisionTreeClassifier()
        tree_2.fit(X_train_scaled,y_train)
        pred_test_1 = tree_1.predict(X_test)
        pred_test_2 = tree_2.predict(X_test_scaled)
        accuracy.append([fractions[i], balanced_methods[l],'not_scaled: ', roc_auc_score(y_test, pred_test_1), 'scaled: ', roc_auc_score(y_test, pred_test_2)])
            

0.025
0.05
0.07


In [34]:
accuracy

[[0.025,
  RandomOverSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.974723917964195,
  'scaled: ',
  0.974209479388161],
 [0.025,
  RandomUnderSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.5309754818408287,
  'scaled: ',
  0.5400350363474791],
 [0.025,
  SMOTE(),
  'not_scaled: ',
  0.9034331692940094,
  'scaled: ',
  0.9040903648978149],
 [0.05,
  RandomOverSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.9731320990656153,
  'scaled: ',
  0.9732189377887387],
 [0.05,
  RandomUnderSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.5318685514452572,
  'scaled: ',
  0.5347693551166334],
 [0.05,
  SMOTE(),
  'not_scaled: ',
  0.9064876578175179,
  'scaled: ',
  0.9057949923299337],
 [0.07,
  RandomOverSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.9732229401816097,
  'scaled: ',
  0.9736043941364259],
 [0.07,
  RandomUnderSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.5333214765164911,
  'scaled: ',
  0.5308186345532425],
 [0.07,
  SMOTE(),
  'not_scal

## *LogRegression*

In [35]:
fractions = [.025, .05, .07]

oversample = RandomOverSampler(sampling_strategy=0.5)
undersample = RandomUnderSampler(sampling_strategy=0.5)
os= SMOTE()

balanced_methods = [oversample, undersample, os]
accuracy_scaled =[]
accuracy_not_scaled=[]
accuracy=[]

for i in range(len(fractions)):
    print(fractions[i])
    sample_prepared = data_clients.sample(frac =fractions[i]) 
    features= sample_prepared.columns.tolist()[:-1]
    target=['flag']
    
    for l in range(len(balanced_methods)):
        X_resampled, y_resampled = balanced_methods[l].fit_resample(sample_prepared[features], sample_prepared[target])
        X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)    
        
        scaler= StandardScaler()
        scaler.fit(X_train)
        X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
        X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
        logreg_1=LogisticRegression()
        logreg_1.fit(X_train,y_train)
        logreg_2=LogisticRegression()
        logreg_2.fit(X_train_scaled,y_train)
        pred_test_1 = logreg_1.predict(X_test)
        pred_test_2 = logreg_2.predict(X_test_scaled)
        accuracy.append([fractions[i], balanced_methods[l],'not_scaled: ', roc_auc_score(y_test, pred_test_1), 'scaled: ', roc_auc_score(y_test, pred_test_2)])
            

0.025


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_

0.05


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_

0.07


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_

In [36]:
accuracy

[[0.025,
  RandomOverSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.530056557519137,
  'scaled: ',
  0.5318518833272412],
 [0.025,
  RandomUnderSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.5320430107526882,
  'scaled: ',
  0.5335176651305683],
 [0.025,
  SMOTE(),
  'not_scaled: ',
  0.6240580044058157,
  'scaled: ',
  0.6239911616946742],
 [0.05,
  RandomOverSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.5280802096680115,
  'scaled: ',
  0.529554868414773],
 [0.05,
  RandomUnderSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.5367955286446204,
  'scaled: ',
  0.5352170987941831],
 [0.05,
  SMOTE(),
  'not_scaled: ',
  0.6223339875753495,
  'scaled: ',
  0.6221444213675088],
 [0.07,
  RandomOverSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.5242602052324618,
  'scaled: ',
  0.5233894015532431],
 [0.07,
  RandomUnderSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.5224477747399742,
  'scaled: ',
  0.5214301356626337],
 [0.07,
  SMOTE(),
  'not_scal

## *LinearSVC*

In [38]:
fractions = [.025, .05, .07]

oversample = RandomOverSampler(sampling_strategy=0.5)
undersample = RandomUnderSampler(sampling_strategy=0.5)
os= SMOTE()

balanced_methods = [oversample, undersample, os]
accuracy_scaled =[]
accuracy_not_scaled=[]
accuracy=[]

for i in range(len(fractions)):
    print(fractions[i])
    sample_prepared = data_clients.sample(frac =fractions[i]) 
    features= sample_prepared.columns.tolist()[:-1]
    target=['flag']
    
    for l in range(len(balanced_methods)):
        X_resampled, y_resampled = balanced_methods[l].fit_resample(sample_prepared[features], sample_prepared[target])
        X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)    
        
        scaler= StandardScaler()
        scaler.fit(X_train)
        X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
        X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
        tree_1=LinearSVC()
        tree_1.fit(X_train,y_train)
        tree_2=LinearSVC()
        tree_2.fit(X_train_scaled,y_train)
        pred_test_1 = tree_1.predict(X_test)
        pred_test_2 = tree_2.predict(X_test_scaled)
        accuracy.append([fractions[i], balanced_methods[l],'not_scaled: ', roc_auc_score(y_test, pred_test_1), 'scaled: ', roc_auc_score(y_test, pred_test_2)])
            

0.025


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.05


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.07


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [39]:
accuracy

[[0.025,
  RandomOverSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.5115191007922942,
  'scaled: ',
  0.5255939081114889],
 [0.025,
  RandomUnderSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.5097461530587873,
  'scaled: ',
  0.5239423044116142],
 [0.025,
  SMOTE(),
  'not_scaled: ',
  0.5045140341715431,
  'scaled: ',
  0.6249702878604773],
 [0.05,
  RandomOverSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.50064134207467,
  'scaled: ',
  0.5255763869634296],
 [0.05,
  RandomUnderSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.5,
  'scaled: ',
  0.5160710708342381],
 [0.05,
  SMOTE(),
  'not_scaled: ',
  0.5744187124383582,
  'scaled: ',
  0.6185395983940044],
 [0.07,
  RandomOverSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.5012830747551803,
  'scaled: ',
  0.5252974586096765],
 [0.07,
  RandomUnderSampler(sampling_strategy=0.5),
  'not_scaled: ',
  0.5169423342642095,
  'scaled: ',
  0.5238338622549487],
 [0.07,
  SMOTE(),
  'not_scaled: ',
  0.5885

## Temp Conclusion
- *In the case of knn, scaled data was better trained, as for the sample size, it is better not to take too large size.  I'll leave 0.04 of the entire sample and SMOTE() method for further tuning of model.*
- *In the case of DecisionTreeClassifier, scaled data was better trained, as for the sample size, it is better not to take too large.  I'll leave 0.05 of the entire sample and RandomUnderSampler() method for further tuning of model.*
- *In the case of LogRegression, not scaled data was better trained, as for the sample size, it is better not to take too large.  I'll leave 0.025 of the entire sample and SMOTE() method for further tuning of model.*
- *In the case of LinearSVC, scaled data was better trained, as for the sample size, it is better not to take too large.  I'll leave 0.025 of the entire sample and SMOTE() method for further tuning of model.*

## Selection of hyperparameters to increase the speed

### *KNN*

In [124]:
knn_sample = data_clients.sample(frac = 0.04) 

In [125]:
features= knn_sample.columns.tolist()[:-1]
target=['flag']

In [133]:
os = SMOTE(sampling_strategy=0.5)
X_resampled, y_resampled = os.fit_resample(knn_sample[features], knn_sample[target])
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)    
scaler= StandardScaler()
scaler.fit(X_train)
X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [134]:
#Let's look at the different number of neighbors

In [135]:
score_list = []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k, weights ='distance')
    knn.fit(X_train_scaled,y_train)
    pred_test = knn.predict(X_test_scaled)
    score_list.append([k, roc_auc_score(y_test, pred_test)])

  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


In [136]:
score_df = pd.DataFrame(score_list, columns = ['k', 'roc_auc_score'])

In [137]:
score_df

Unnamed: 0,k,roc_auc_score
0,1,0.922175
1,3,0.906949
2,5,0.898316
3,7,0.893396
4,9,0.891106
5,11,0.889314
6,13,0.888297
7,15,0.887539
8,17,0.885656
9,19,0.884053


In [138]:
knn = KNeighborsClassifier(n_neighbors=3, weights ='distance')
knn.fit(X_train_scaled,y_train)

  return self._fit(X, y)


In [139]:
pred_train = knn.predict(X_train_scaled)
pred_test = knn.predict(X_test_scaled)

In [140]:
mse_train = mean_squared_error(y_train, pred_train)
mse_test = mean_squared_error(y_test, pred_test)
print(mse_train, mse_test)

0.001281497480201584 0.1166882649388049



Unfortunately, there is retraining here.



## *DecisionTreeClassifier*

In [113]:
dtc_sample = data_clients.sample(frac = 0.05) 

In [114]:
features= dtc_sample.columns.tolist()[:-1]
target=['flag']

In [115]:
under = RandomUnderSampler(sampling_strategy=0.8)
X_resampled, y_resampled = under.fit_resample(dtc_sample[features], dtc_sample[target])
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)    
scaler= StandardScaler()
scaler.fit(X_train)
X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [116]:
param_grid={'min_samples_leaf' : [4, 10, 15],
            'max_depth' : [5, 15, 20], 
            'max_leaf_nodes':[100, 200, 300, 500]
    }

In [117]:
tree =RandomizedSearchCV(DecisionTreeClassifier(), param_distributions = param_grid, scoring='roc_auc', cv = 5, n_iter=10)

In [118]:
tree_grid = tree.fit(X_train_scaled,y_train)

In [119]:
tree_grid.best_params_

{'min_samples_leaf': 4, 'max_leaf_nodes': 100, 'max_depth': 5}

In [120]:
tree=DecisionTreeClassifier(min_samples_leaf = 4, max_leaf_nodes= 100, max_depth=5)
tree.fit(X_train_scaled,y_train)
pred_test = tree.predict(X_test_scaled)
roc_auc_score(y_test, pred_test)

0.5900412427600358

In [121]:
pred_train = tree.predict(X_train_scaled)
pred_test = tree.predict(X_test_scaled)

In [122]:
mse_train = mean_squared_error(y_train, pred_train)
mse_test = mean_squared_error(y_test, pred_test)
print(mse_train, mse_test)

0.3855028075548749 0.384238464679461


## *LogRegression*

In [202]:
logreg_sample = data_clients.sample(frac = 0.025) 

In [194]:
features= logreg_sample.columns.tolist()[:-1]
target=['flag']

In [203]:
os = SMOTE(sampling_strategy=0.6)
X_resampled, y_resampled = os.fit_resample(logreg_sample[features], logreg_sample[target])
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)    

Let's look at the parameter C separately

In [204]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np

In [205]:
cv = StratifiedKFold(n_splits=5)
best_score = -1

for C in np.arange(0.1, 8.1, 0.2):
    model = LogisticRegression(
        C=C,  
        random_state=42, 
    )
    score = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=cv).mean()
    if score > best_score:
        best_score = score
        best_C = C

print('best score:', best_score)
print('best params:', best_C)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

best score: 0.6236381838286431
best params: 7.300000000000001


In [206]:
param_grid={'multi_class' : ['auto', 'ovr', 'multinomial'],
            'penalty':['l1', 'l2', 'elasticnet'], 
            'solver' : ['lbfgs', 'liblinear', 'saga', 'newton-cg']
    }

In [207]:
log_rs=RandomizedSearchCV(LogisticRegression(C=7.3, class_weight='balanced'), param_distributions = param_grid, scoring='roc_auc', cv = 5)

In [208]:
log_grid = log_rs.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [210]:
log_grid.best_params_

{'solver': 'liblinear', 'penalty': 'l1', 'multi_class': 'auto'}

In [213]:
model_logreg=LogisticRegression(C=7.3, class_weight='balanced', penalty ='l1', solver= 'liblinear', multi_class='auto')
model_logreg.fit(X_train,y_train)
pred_test = model_logreg.predict(X_test)
roc_auc_score(y_test, pred_test)

  y = column_or_1d(y, warn=True)


0.5893809263892019

In [235]:
model_logreg.coef_.tolist()

[[0.006915742965860914,
  0.3077767391053613,
  0.14204926397151216,
  -0.0706771746423015,
  0.002790224877384856,
  -0.01644823903359222,
  0.02809483334973931,
  0.06256387937390212,
  -0.061995080402440755,
  0.15502270251178923,
  0.015254590383883639,
  -0.5033073171197268]]

In [233]:
features
#let's try to remove important feature 'pre_since_confirmed' and to delete 'enc_loans_credit_status'

['pre_since_opened',
 'pre_since_confirmed',
 'pre_pterm',
 'pre_fterm',
 'pre_till_pclose',
 'pre_till_fclose',
 'pre_loans_credit_limit',
 'pre_loans_credit_cost_rate',
 'pre_util',
 'enc_paym_5',
 'enc_paym_6',
 'enc_paym_7',
 'enc_paym_8',
 'enc_paym_9',
 'enc_paym_10',
 'enc_paym_11',
 'enc_paym_12',
 'enc_paym_13',
 'enc_paym_14',
 'enc_loans_credit_status']

## *LinearSVC*

In [214]:
svc_sample = data_clients.sample(frac = 0.025) 

In [215]:
features= svc_sample.columns.tolist()[:-1]
target=['flag']

In [223]:
os = SMOTE(sampling_strategy=0.3)
X_resampled, y_resampled = os.fit_resample(svc_sample[features], svc_sample[target])
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42) 
scaler= StandardScaler()
scaler.fit(X_train)
X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [224]:
param_grid = { 'loss' : ['hinge', 'squared_hinge'],
    'C' : np.arange(0.1, 5.1, 0.5)
    
}

In [225]:
model_svc = RandomizedSearchCV(estimator=LinearSVC(class_weight='balanced'), param_distributions=param_grid, cv= 5)
model_svc.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [227]:
model_svc.best_params_

{'loss': 'hinge', 'C': 2.1}

In [229]:
model_svc=LinearSVC(class_weight='balanced', C=2.1, loss='hinge')
model_svc.fit(X_train_scaled,y_train)
pred_test = model_svc.predict(X_test_scaled)
roc_auc_score(y_test, pred_test)

  y = column_or_1d(y, warn=True)


0.6159328748138978

In [236]:
model_svc.coef_.tolist()

[[-0.22024775494968046,
  -0.22989438063522757,
  -0.12898602418205962,
  -0.19783823212783277,
  -0.03649460019959081,
  -0.3007632747339224,
  -0.03446740559550516,
  -0.176282904768111,
  -0.1134736166440285,
  -0.19817025494340143,
  0.5143275911977379,
  -0.0967283773960679,
  -0.0060806852388425375,
  0.07787176751638522,
  -0.019502261850232776,
  -0.07252527835849314,
  0.034014593069196235,
  -0.0020579942909394228,
  0.1491782660939198,
  -0.5140236995335575]]

In [238]:
features

['pre_since_opened',
 'pre_since_confirmed',
 'pre_pterm',
 'pre_fterm',
 'pre_till_pclose',
 'pre_till_fclose',
 'pre_loans_credit_limit',
 'pre_loans_credit_cost_rate',
 'pre_util',
 'enc_paym_5',
 'enc_paym_6',
 'enc_paym_7',
 'enc_paym_8',
 'enc_paym_9',
 'enc_paym_10',
 'enc_paym_11',
 'enc_paym_12',
 'enc_paym_13',
 'enc_paym_14',
 'enc_loans_credit_status']

### Temp Conculsion
It will not be possible to raise score, we will have to remove the features from the sample to improve the models.

In [243]:
data_clients_new = data_clients[['pre_since_opened',
 'pre_pterm',
 'pre_fterm',
 'pre_till_pclose',
 'pre_till_fclose',
 'pre_loans_credit_limit',
 'pre_loans_credit_cost_rate',
 'pre_util',
 'enc_paym_5',
 'enc_paym_6',
 'enc_paym_7',
 'enc_paym_8',
 'enc_paym_9',
 'enc_paym_10',
 'enc_paym_11',
 'enc_paym_12',
 'enc_paym_13',
 'enc_paym_14', 'flag']]

In [244]:
knn_sample=data_clients_new.sample(frac = 0.04) 
tree_sample=data_clients_new.sample(frac = 0.05) 
logreg_sample=data_clients_new.sample(frac = 0.025) 
svc_sample=data_clients_new.sample(frac = 0.025) 

## ONCE AGAIN THE SAME

## *KNN*

In [245]:
sampling_strategies = [.5, .1, .8]

accuracy=[]

for i in range(len(sampling_strategies)):
    print(sampling_strategies[i])
    oversample = RandomOverSampler(sampling_strategy = sampling_strategies[i])
    undersample = RandomUnderSampler(sampling_strategy = sampling_strategies[i])
    os= SMOTE(sampling_strategy = sampling_strategies[i])
    balanced_methods = [oversample, undersample, os]

    features= knn_sample.columns.tolist()[:-1]
    target=['flag']
    
    for l in range(len(balanced_methods)):
        X_resampled, y_resampled = balanced_methods[l].fit_resample(knn_sample[features], knn_sample[target])
        X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)    
        
        scaler= StandardScaler()
        scaler.fit(X_train)
        X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
        X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
        knn=KNeighborsClassifier(n_neighbors=3)
        knn.fit(X_train_scaled,y_train)
        pred_test = knn.predict(X_test_scaled)
        
        accuracy.append([sampling_strategies[i], balanced_methods[l],roc_auc_score(y_test, pred_test)])
            

0.5


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


0.1


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


0.8


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


In [246]:
accuracy

[[0.5, RandomOverSampler(sampling_strategy=0.5), 0.9573106470847268],
 [0.5, RandomUnderSampler(sampling_strategy=0.5), 0.5429652769434098],
 [0.5, SMOTE(sampling_strategy=0.5), 0.900960786034776],
 [0.1, RandomOverSampler(sampling_strategy=0.1), 0.7714526420007276],
 [0.1, RandomUnderSampler(sampling_strategy=0.1), 0.5147695112834444],
 [0.1, SMOTE(sampling_strategy=0.1), 0.7393554130896784],
 [0.8, RandomOverSampler(sampling_strategy=0.8), 0.958648809338785],
 [0.8, RandomUnderSampler(sampling_strategy=0.8), 0.566108008441157],
 [0.8, SMOTE(sampling_strategy=0.8), 0.8947940875715779]]

In [247]:
over = RandomOverSampler(sampling_strategy=0.1)

In [248]:
X_resampled, y_resampled = over.fit_resample(knn_sample[features], knn_sample[target])
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)    
        
scaler= StandardScaler()
scaler.fit(X_train)
X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
knn=KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled,y_train)
pred_test = knn.predict(X_test_scaled)

  return self._fit(X, y)


In [249]:
roc_auc_score(y_test, pred_test)

0.7681790902115339

In [251]:
pred_train = knn.predict(X_train_scaled)
pred_test = knn.predict(X_test_scaled)
mse_train = mean_squared_error(y_train, pred_train)
mse_test = mean_squared_error(y_test, pred_test)
print(mse_train, mse_test)

0.03607604263889571 0.08668998310213384


In [252]:
os = SMOTE(sampling_strategy=0.1)
X_resampled, y_resampled = os.fit_resample(knn_sample[features], knn_sample[target])
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)    
        
scaler= StandardScaler()
scaler.fit(X_train)
X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
knn=KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled,y_train)
pred_test = knn.predict(X_test_scaled)

  return self._fit(X, y)


In [253]:
roc_auc_score(y_test, pred_test)

0.7441267067554452

In [254]:
pred_train = knn.predict(X_train_scaled)
pred_test = knn.predict(X_test_scaled)
mse_train = mean_squared_error(y_train, pred_train)
mse_test = mean_squared_error(y_test, pred_test)
print(mse_train, mse_test)

0.03752026330009334 0.08044170236177152


## *DecisionTreeClassifier*

In [255]:
sampling_strategies = [.5, .1, .8]

accuracy=[]

for i in range(len(sampling_strategies)):
    print(sampling_strategies[i])
    oversample = RandomOverSampler(sampling_strategy = sampling_strategies[i])
    undersample = RandomUnderSampler(sampling_strategy = sampling_strategies[i])
    os= SMOTE(sampling_strategy = sampling_strategies[i])
    balanced_methods = [oversample, undersample, os]

    features= knn_sample.columns.tolist()[:-1]
    target=['flag']
    
    for l in range(len(balanced_methods)):
        X_resampled, y_resampled = balanced_methods[l].fit_resample(tree_sample[features], tree_sample[target])
        X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)    
        
        scaler= StandardScaler()
        scaler.fit(X_train)
        X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
        X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
        tree=DecisionTreeClassifier()
        tree.fit(X_train_scaled,y_train)
        pred_test = tree.predict(X_test_scaled)
        
        accuracy.append([sampling_strategies[i], balanced_methods[l],roc_auc_score(y_test, pred_test)])
            

0.5
0.1
0.8


In [256]:
accuracy

[[0.5, RandomOverSampler(sampling_strategy=0.5), 0.9735443985571933],
 [0.5, RandomUnderSampler(sampling_strategy=0.5), 0.5389985456523547],
 [0.5, SMOTE(sampling_strategy=0.5), 0.8773216605013838],
 [0.1, RandomOverSampler(sampling_strategy=0.1), 0.9093499040849921],
 [0.1, RandomUnderSampler(sampling_strategy=0.1), 0.5256958970106972],
 [0.1, SMOTE(sampling_strategy=0.1), 0.6997160028297691],
 [0.8, RandomOverSampler(sampling_strategy=0.8), 0.9745299829771521],
 [0.8, RandomUnderSampler(sampling_strategy=0.8), 0.5272481280727157],
 [0.8, SMOTE(sampling_strategy=0.8), 0.9014392622699843]]

In [258]:
os = SMOTE(sampling_strategy=0.1)
X_resampled, y_resampled = os.fit_resample(tree_sample[features], tree_sample[target])
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)    
        
scaler= StandardScaler()
scaler.fit(X_train)
X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
tree=DecisionTreeClassifier()
tree.fit(X_train_scaled,y_train)
pred_test = tree.predict(X_test_scaled)
roc_auc_score(y_test, pred_test)

0.696149751576857

In [259]:
pred_train = tree.predict(X_train_scaled)
pred_test = tree.predict(X_test_scaled)
mse_train = mean_squared_error(y_train, pred_train)
mse_test = mean_squared_error(y_test, pred_test)
print(mse_train, mse_test)

0.0021597084786228126 0.12181943833636992


## *LogRegression*

In [263]:
sampling_strategies = [.5, .1, .8, .4, .2]

accuracy=[]

for i in range(len(sampling_strategies)):
    print(sampling_strategies[i])
    oversample = RandomOverSampler(sampling_strategy = sampling_strategies[i])
    undersample = RandomUnderSampler(sampling_strategy = sampling_strategies[i])
    os= SMOTE(sampling_strategy = sampling_strategies[i])
    balanced_methods = [oversample, undersample, os]

    features= logreg_sample.columns.tolist()[:-1]
    target=['flag']
    
    for l in range(len(balanced_methods)):
        X_resampled, y_resampled = balanced_methods[l].fit_resample(logreg_sample[features], logreg_sample[target])
        X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)    
        
        logreg=LogisticRegression()
        logreg.fit(X_train,y_train)
        pred_test = logreg.predict(X_test)
        
        accuracy.append([sampling_strategies[i], balanced_methods[l],roc_auc_score(y_test, pred_test)])
            

0.5


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/

0.1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html

0.8


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html

0.4


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html

0.2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html

In [264]:
accuracy

[[0.5, RandomOverSampler(sampling_strategy=0.5), 0.5415099142091683],
 [0.5, RandomUnderSampler(sampling_strategy=0.5), 0.5484079541909449],
 [0.5, SMOTE(sampling_strategy=0.5), 0.5387504433255631],
 [0.1, RandomOverSampler(sampling_strategy=0.1), 0.5],
 [0.1, RandomUnderSampler(sampling_strategy=0.1), 0.5],
 [0.1, SMOTE(sampling_strategy=0.1), 0.5],
 [0.8, RandomOverSampler(sampling_strategy=0.8), 0.5946049989851914],
 [0.8, RandomUnderSampler(sampling_strategy=0.8), 0.5666552377220481],
 [0.8, SMOTE(sampling_strategy=0.8), 0.5917094460832678],
 [0.4, RandomOverSampler(sampling_strategy=0.4), 0.5187274765657623],
 [0.4, RandomUnderSampler(sampling_strategy=0.4), 0.5158677776861095],
 [0.4, SMOTE(sampling_strategy=0.4), 0.5153770182404451],
 [0.2, RandomOverSampler(sampling_strategy=0.2), 0.5],
 [0.2, RandomUnderSampler(sampling_strategy=0.2), 0.500764554520027],
 [0.2, SMOTE(sampling_strategy=0.2), 0.49990112549151167]]

OK, let's forget about logistic regression..

## *SVC*

In [265]:
sampling_strategies = [.5, .1, .8, .4]

accuracy=[]

for i in range(len(sampling_strategies)):
    print(sampling_strategies[i])
    oversample = RandomOverSampler(sampling_strategy = sampling_strategies[i])
    undersample = RandomUnderSampler(sampling_strategy = sampling_strategies[i])
    os= SMOTE(sampling_strategy = sampling_strategies[i])
    balanced_methods = [oversample, undersample, os]

    features= svc_sample.columns.tolist()[:-1]
    target=['flag']
    
    for l in range(len(balanced_methods)):
        X_resampled, y_resampled = balanced_methods[l].fit_resample(svc_sample[features], svc_sample[target])
        X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)    
        
        scaler= StandardScaler()
        scaler.fit(X_train)
        X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
        X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
        svc=LinearSVC(class_weight='balanced')
        svc.fit(X_train_scaled,y_train)
        pred_test = svc.predict(X_test_scaled)
        
        accuracy.append([sampling_strategies[i], balanced_methods[l],roc_auc_score(y_test, pred_test)])
            

0.5


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.1


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.8


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.4


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [266]:
accuracy

[[0.5, RandomOverSampler(sampling_strategy=0.5), 0.6011914331804209],
 [0.5, RandomUnderSampler(sampling_strategy=0.5), 0.6034103187807743],
 [0.5, SMOTE(sampling_strategy=0.5), 0.5986791270939261],
 [0.1, RandomOverSampler(sampling_strategy=0.1), 0.5976462123269074],
 [0.1, RandomUnderSampler(sampling_strategy=0.1), 0.6075368875636215],
 [0.1, SMOTE(sampling_strategy=0.1), 0.5861873472712462],
 [0.8, RandomOverSampler(sampling_strategy=0.8), 0.6034938910550697],
 [0.8, RandomUnderSampler(sampling_strategy=0.8), 0.5879542769786672],
 [0.8, SMOTE(sampling_strategy=0.8), 0.604716568685208],
 [0.4, RandomOverSampler(sampling_strategy=0.4), 0.5927046710669042],
 [0.4, RandomUnderSampler(sampling_strategy=0.4), 0.5909218359071693],
 [0.4, SMOTE(sampling_strategy=0.4), 0.6014156770977166]]

OK, let's forget about SVC too.

## Temp Conclusion
We need to try to train the remaining models so that they do not retrain. Maybe we'll remove more features from the sample.

## Our last hope are KNN and DecisionTreeClassifier

### KNN

In [273]:
knn_sample=data_clients.sample(frac = 0.05) 

In [274]:
sampling_strategies = [.5, .1, .8, .6]

accuracy=[]

for i in range(len(sampling_strategies)):
    print(sampling_strategies[i])
    oversample = RandomOverSampler(sampling_strategy = sampling_strategies[i])
    undersample = RandomUnderSampler(sampling_strategy = sampling_strategies[i])
    os= SMOTE(sampling_strategy = sampling_strategies[i])
    balanced_methods = [oversample, undersample, os]

    features= knn_sample.columns.tolist()[:-1]
    target=['flag']
    
    for l in range(len(balanced_methods)):
        X_resampled, y_resampled = balanced_methods[l].fit_resample(knn_sample[features], knn_sample[target])
        X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)    
        
        scaler= StandardScaler()
        scaler.fit(X_train)
        X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
        X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
        knn=KNeighborsClassifier(n_neighbors=3)
        knn.fit(X_train_scaled,y_train)
        pred_test = knn.predict(X_test_scaled)
        pred_train = knn.predict(X_train_scaled)
        pred_test = knn.predict(X_test_scaled)
        mse_train = mean_squared_error(y_train, pred_train)
        mse_test = mean_squared_error(y_test, pred_test)
        if mse_train >0  and abs(mse_train - mse_test)< 0.1:
            accuracy.append([sampling_strategies[i], balanced_methods[l],roc_auc_score(y_test, pred_test)])
            

0.5


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


0.1


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


0.8


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


0.6


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


In [275]:
accuracy

[[0.5, RandomOverSampler(sampling_strategy=0.5), 0.9526773178591702],
 [0.5, SMOTE(sampling_strategy=0.5), 0.8965606209998953],
 [0.1, RandomOverSampler(sampling_strategy=0.1), 0.7477196711181572],
 [0.1, RandomUnderSampler(sampling_strategy=0.1), 0.5081630757275933],
 [0.1, SMOTE(sampling_strategy=0.1), 0.7360083557910843],
 [0.8, RandomOverSampler(sampling_strategy=0.8), 0.9530183888309823],
 [0.8, SMOTE(sampling_strategy=0.8), 0.8884008174778948],
 [0.6, RandomOverSampler(sampling_strategy=0.6), 0.9526642299125756],
 [0.6, SMOTE(sampling_strategy=0.6), 0.8948808360917464]]

In [276]:
over = RandomOverSampler(sampling_strategy=0.1)

In [277]:
X_resampled, y_resampled = over.fit_resample(knn_sample[features], knn_sample[target])
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)    
        
scaler= StandardScaler()
scaler.fit(X_train)
X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
knn=KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled,y_train)
pred_test = knn.predict(X_test_scaled)

  return self._fit(X, y)


In [278]:
pred_train = knn.predict(X_train_scaled)
pred_test = knn.predict(X_test_scaled)
mse_train = mean_squared_error(y_train, pred_train)
mse_test = mean_squared_error(y_test, pred_test)
print(mse_train, mse_test)

0.03754322540081735 0.08757976800477822


In [279]:
print(roc_auc_score(y_test, pred_test))

0.7697177358775402


## *TreeDecisionClassifier*

In [292]:
data_clients.columns

Index(['pre_since_opened', 'pre_since_confirmed', 'pre_pterm', 'pre_fterm',
       'pre_till_pclose', 'pre_till_fclose', 'pre_loans_credit_limit',
       'pre_loans_credit_cost_rate', 'pre_util', 'enc_paym_5', 'enc_paym_6',
       'enc_paym_7', 'enc_paym_8', 'enc_paym_9', 'enc_paym_10', 'enc_paym_11',
       'enc_paym_12', 'enc_paym_13', 'enc_paym_14', 'enc_loans_credit_status',
       'flag'],
      dtype='object')

I decided to delete features that reflect not the planned deadlines, but the real ones.

In [304]:
data_clients_new = data_clients[[
'pre_fterm','pre_fterm',
     'pre_till_fclose', 
       'pre_loans_credit_cost_rate', 'pre_util', 'enc_paym_5', 'enc_paym_6',
       'enc_paym_7', 'enc_paym_8', 'enc_paym_9', 'enc_paym_10', 'enc_paym_11',
       'enc_paym_12', 'enc_paym_13', 'enc_paym_14', 
       'flag']]

In [305]:
tree_sample=data_clients_new.sample(frac = 0.05) 

In [306]:
sampling_strategies = [.5, .1, .8, .6]

accuracy=[]

for i in range(len(sampling_strategies)):
    print(sampling_strategies[i])
    oversample = RandomOverSampler(sampling_strategy = sampling_strategies[i])
    undersample = RandomUnderSampler(sampling_strategy = sampling_strategies[i])
    os= SMOTE(sampling_strategy = sampling_strategies[i])
    balanced_methods = [oversample, undersample, os]

    features= tree_sample.columns.tolist()[:-1]
    target=['flag']
    
    for l in range(len(balanced_methods)):
        X_resampled, y_resampled = balanced_methods[l].fit_resample(tree_sample[features], tree_sample[target])
        X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)    
        
        scaler= StandardScaler()
        scaler.fit(X_train)
        X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
        X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
        tree=DecisionTreeClassifier()
        tree.fit(X_train_scaled,y_train)
        pred_test = tree.predict(X_test_scaled)
        pred_train = tree.predict(X_train_scaled)
        pred_test = tree.predict(X_test_scaled)
        mse_train = mean_squared_error(y_train, pred_train)
        mse_test = mean_squared_error(y_test, pred_test)
        if mse_train >0  and abs(mse_train - mse_test)< 0.1:
            accuracy.append([sampling_strategies[i], balanced_methods[l],roc_auc_score(y_test, pred_test)])
            

0.5
0.1
0.8
0.6


In [307]:
accuracy

[[0.5, RandomOverSampler(sampling_strategy=0.5), 0.8325813001790318],
 [0.5, SMOTE(sampling_strategy=0.5), 0.7671498935294563],
 [0.1, RandomOverSampler(sampling_strategy=0.1), 0.685906310364775],
 [0.1, RandomUnderSampler(sampling_strategy=0.1), 0.5091889596380933],
 [0.1, SMOTE(sampling_strategy=0.1), 0.5937493604241265],
 [0.8, RandomOverSampler(sampling_strategy=0.8), 0.8555901061775915],
 [0.8, SMOTE(sampling_strategy=0.8), 0.7983978999699082],
 [0.6, RandomOverSampler(sampling_strategy=0.6), 0.8424056234171319],
 [0.6, SMOTE(sampling_strategy=0.6), 0.7835214132150041]]

In [313]:
os = RandomOverSampler(sampling_strategy=0.8)
X_resampled, y_resampled = os.fit_resample(tree_sample[features], tree_sample[target])
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)    
        
scaler= StandardScaler()
scaler.fit(X_train)
X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
tree=DecisionTreeClassifier()
tree.fit(X_train_scaled,y_train)
pred_test = tree.predict(X_test_scaled)

In [314]:
pred_train = tree.predict(X_train_scaled)
pred_test = tree.predict(X_test_scaled)
mse_train = mean_squared_error(y_train, pred_train)
mse_test = mean_squared_error(y_test, pred_test)
print(mse_train, mse_test)

0.13049538746906228 0.14940777258467253


In [315]:
roc_auc_score(y_test, pred_test)

0.8526430604275814

## Important Conclusion
The errors on the test and the traine samples are almost the same! This is what we need!

### Tuning of our best model

In [317]:
tree_sample=data_clients_new.sample(frac = 0.05) 

In [318]:
os = RandomOverSampler(sampling_strategy=0.8)
X_resampled, y_resampled = os.fit_resample(tree_sample[features], tree_sample[target])
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)    
        
scaler= StandardScaler()
scaler.fit(X_train)
X_train_scaled=pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled=pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    

In [322]:
param_grid={'min_samples_leaf' : [2, 4, 10, 15],
            'max_depth' : [2, 5, 15, 20], 
            'max_leaf_nodes':[100, 200, 300, 500, 10000]
    }

In [323]:
tree =RandomizedSearchCV(DecisionTreeClassifier(), param_distributions = param_grid, scoring='roc_auc', cv = 5, n_iter=10)
tree_grid = tree.fit(X_train_scaled,y_train)

In [324]:
tree_grid.best_params_

{'min_samples_leaf': 2, 'max_leaf_nodes': 300, 'max_depth': 20}

In [325]:
tree=DecisionTreeClassifier(min_samples_leaf = 2, max_leaf_nodes= 300, max_depth=20)
tree.fit(X_train_scaled,y_train)
pred_test = tree.predict(X_test_scaled)
roc_auc_score(y_test, pred_test)

0.6250099357284773

Well, apparently it's better not to touch anything.