## Company Bankruptcy Prediction

In [1]:
import numpy as np 
import pandas as pd 

In [43]:
df=pd.read_csv("../data/train.csv", low_memory=False)

In [3]:
train = df.apply(pd.to_numeric, errors='coerce')
train.head()

Unnamed: 0,id,forecasting period,net_profit_over_total_assets,total_liabilities_over_total_assets,working_capital_over_total_assets,current_assets_over_short_term_liabilities,cash_plus_short_term_securities_plus_receivables_minus_short_term_liabilities_over_operating_expenses_minus_depreciation_times_365,retained_earnings_over_total_assets,EBIT_over_total_assets,book_value_of_equity_over_total_liabilities,...,sales_minus_cost_of_products_sold_over_sales,current_assets_minus_inventory_minus_short_term_liabilities_over_sales_minus_gross_profit_minus_depreciation,total_costs_overtotal_sales,long_term_liabilities_over_equity,sales_over_inventory,sales_over_receivables,short_term_liabilities_times_365_over_sales,sales_over_short_term_liabilities,sales_over_fixed_assets,class
0,0,4,0.141939,0.450292,0.021422,1.052987,-28.529084,0.141934,0.172629,0.990337,...,0.079572,0.318286,0.927687,0.106818,12.941797,10.044585,64.7137,5.626894,3.937064,1
1,1,4,0.14756,0.23484,0.708335,4.016018,119.445611,0.45709,0.18607,3.206445,...,0.095218,0.195943,0.907056,-2.9e-05,233.301688,2.747416,36.967435,9.813419,40.567739,0
2,2,1,-0.024542,0.311544,0.067831,1.255247,-15.019923,-0.657003,-0.024537,2.209,...,0.109035,-0.035634,0.929825,0.055331,8.589991,5.164738,117.92404,3.098273,1.239711,0
3,3,2,0.159317,0.777017,0.207431,1.26731,-4.892918,-5e-06,0.159321,0.288177,...,0.102971,0.714528,0.937895,-7.7e-05,9.928221,5.912786,123.75261,2.946008,130.208321,0
4,4,4,0.004404,0.577419,-0.104435,0.786225,-116.911618,0.056561,0.004249,0.692166,...,0.016446,0.011019,0.979425,0.221335,5.099541,6.89013,166.76198,2.189705,1.740353,0


In [4]:
train.columns

Index(['id', 'forecasting period', 'net_profit_over_total_assets',
       'total_liabilities_over_total_assets',
       'working_capital_over_total_assets',
       'current_assets_over_short_term_liabilities',
       'cash_plus_short_term_securities_plus_receivables_minus_short_term_liabilities_over_operating_expenses_minus_depreciation_times_365',
       'retained_earnings_over_total_assets', 'EBIT_over_total_assets',
       'book_value_of_equity_over_total_liabilities',
       'sales_over_total_assets', 'equity_over_total_assets',
       'gross_profit_plus_extraordinary_items_plus_financial_expenses_over_total_assets',
       'gross_profit_over_short_term_liabilities',
       'gross_profit_plus_depreciation_over_sales',
       'gross_profit_plus_interest_over_total_assets',
       'total_liabilities_times_365_over_gross_profit_plus_depreciation',
       'gross_profit_plus_depreciation_over_total_liabilities',
       'total_assets_over_total_liabilities', 'gross_profit_over_total_asse

### Exploratory Analysis for Variables

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25121 entries, 0 to 25120
Data columns (total 67 columns):
 #   Column                                                                                                                              Non-Null Count  Dtype  
---  ------                                                                                                                              --------------  -----  
 0   id                                                                                                                                  25121 non-null  int64  
 1   forecasting period                                                                                                                  25121 non-null  int64  
 2   net_profit_over_total_assets                                                                                                        25118 non-null  float64
 3   total_liabilities_over_total_assets                                              

#### Dependent Variable

In [6]:
sum(train["class"].isna())

0

In [7]:
sum(train["class"].isnull())

0

So clearly there is no missing values in the dependent variable.

In [8]:
train["class"].unique() ## checking for unique values

array([1, 0], dtype=int64)

In [9]:
train.groupby(['class'])[['class']].count()

Unnamed: 0_level_0,class
class,Unnamed: 1_level_1
0,23995
1,1126


In [10]:
round(len(train["class"][train["class"] == 1])/len(train["class"]), 4) ## Percentage of Bankruptcy Cases

0.0448

In [11]:
round(len(train["class"][train["class"] == 1])/len(train["class"][train["class"] == 0]), 4) ## Odds Ratio

0.0469

The total number of Bankruptcy cases is 1126 and it comprises 0.0448 percentage of the whole data.

### Missing value imputation

In [12]:
train.isna().sum().sort_values(ascending=False)

current_assets_minus_inventories_over_long_term_liabilities    11153
sales_n_over_sales_n_minus_1                                    3911
profit_on_operating_activities_over_financial_expenses          1600
sales_over_inventory                                            1282
net_profit_over_inventory                                       1279
                                                               ...  
equity_over_total_assets                                           3
working_capital                                                    1
id                                                                 0
forecasting period                                                 0
class                                                              0
Length: 67, dtype: int64

In [13]:
for i in train:
    train.loc[train.loc[:,i].isnull(),i]=train.loc[:,i].mean()

In [14]:
train.isna().sum().sort_values(ascending=False)

id                                                                     0
EBITDA_profit_on_operating_activities_minus_depreciation_over_sales    0
profit_on_sales_over_total_assets                                      0
total_sales_over_total_assets                                          0
current_assets_minus_inventories_over_long_term_liabilities            0
                                                                      ..
profit_on_operating_activities_over_financial_expenses                 0
working_capital_over_fixed_assets                                      0
logarithm_of_total_assets                                              0
total_liabilities_minus_cash_over_sales                                0
class                                                                  0
Length: 67, dtype: int64

We can see that almost all of the columns has missing values. So before we proceed with the analysis we have to impute these values.

In [15]:
train.describe()

Unnamed: 0,id,forecasting period,net_profit_over_total_assets,total_liabilities_over_total_assets,working_capital_over_total_assets,current_assets_over_short_term_liabilities,cash_plus_short_term_securities_plus_receivables_minus_short_term_liabilities_over_operating_expenses_minus_depreciation_times_365,retained_earnings_over_total_assets,EBIT_over_total_assets,book_value_of_equity_over_total_liabilities,...,sales_minus_cost_of_products_sold_over_sales,current_assets_minus_inventory_minus_short_term_liabilities_over_sales_minus_gross_profit_minus_depreciation,total_costs_overtotal_sales,long_term_liabilities_over_equity,sales_over_inventory,sales_over_receivables,short_term_liabilities_times_365_over_sales,sales_over_short_term_liabilities,sales_over_fixed_assets,class
count,25121.0,25121.0,25121.0,25121.0,25121.0,25121.0,25121.0,25121.0,25121.0,25121.0,...,25121.0,25121.0,25121.0,25121.0,25121.0,25121.0,25121.0,25121.0,25121.0,25121.0
mean,12560.0,2.608574,0.046883,0.611913,0.098553,6.285753,-91.25936,-0.067728,0.147535,15.153162,...,-0.770713,0.098693,4.429418,1.597135,322.5273,14.315634,739.4522,9.835042,77.756289,0.044823
std,7251.952392,1.063099,1.821587,5.933092,5.907783,339.445391,18264.0,6.87271,5.834939,610.876764,...,72.865215,10.38034,382.93932,152.43206,15651.62,120.69805,56187.32,154.435943,2667.820214,0.206919
min,0.0,1.0,-256.889998,-72.162006,-479.959994,-0.044851,-1670200.0,-508.409992,-189.559992,-141.41022,...,-8534.599415,-979.250019,-1.63182,-327.970143,-0.04066151,-12.6554,-2336500.0,-0.367905,-10677.001615,0.0
25%,6280.0,2.0,0.003354,0.271085,0.016667,1.039445,-51.06407,-1e-05,0.005652,0.423195,...,0.008373,0.015153,0.875789,-2e-06,5.797468,4.542071,41.66776,3.105896,2.218706,0.0
50%,12560.0,3.0,0.049838,0.476985,0.191632,1.557344,-1.874916,3e-06,0.060084,1.054305,...,0.052521,0.122504,0.951085,0.005201,10.62085,6.749773,71.16951,5.133577,4.460347,0.0
75%,18840.0,4.0,0.13141,0.693518,0.401742,2.784023,51.05802,0.085011,0.153183,2.606133,...,0.128879,0.293659,0.993312,0.242058,24.69057,10.652195,117.5783,8.767945,10.631476,0.0
max,25120.0,4.0,52.651999,480.960001,22.769001,53433.00044,1034100.0,322.199997,649.230003,53431.999474,...,293.158392,552.639985,59671.999668,23853.000122,2137800.0,12296.00052,7276000.0,23453.999855,294769.993913,1.0


### Train and validation split

In [16]:
X = train.iloc[:, 1:-1]
y = train.iloc[:,  -1]

In [17]:
X.head()

Unnamed: 0,forecasting period,net_profit_over_total_assets,total_liabilities_over_total_assets,working_capital_over_total_assets,current_assets_over_short_term_liabilities,cash_plus_short_term_securities_plus_receivables_minus_short_term_liabilities_over_operating_expenses_minus_depreciation_times_365,retained_earnings_over_total_assets,EBIT_over_total_assets,book_value_of_equity_over_total_liabilities,sales_over_total_assets,...,working_capital,sales_minus_cost_of_products_sold_over_sales,current_assets_minus_inventory_minus_short_term_liabilities_over_sales_minus_gross_profit_minus_depreciation,total_costs_overtotal_sales,long_term_liabilities_over_equity,sales_over_inventory,sales_over_receivables,short_term_liabilities_times_365_over_sales,sales_over_short_term_liabilities,sales_over_fixed_assets
0,4.0,0.141939,0.450292,0.021422,1.052987,-28.529084,0.141934,0.172629,0.990337,1.087356,...,800.542408,0.079572,0.318286,0.927687,0.106818,12.941797,10.044585,64.7137,5.626894,3.937064
1,4.0,0.14756,0.23484,0.708335,4.016018,119.445611,0.45709,0.18607,3.206445,1.093404,...,24093.000493,0.095218,0.195943,0.907056,-2.9e-05,233.301688,2.747416,36.967435,9.813419,40.567739
2,1.0,-0.024542,0.311544,0.067831,1.255247,-15.019923,-0.657003,-0.024537,2.209,0.825578,...,851.1792,0.109035,-0.035634,0.929825,0.055331,8.589991,5.164738,117.92404,3.098273,1.239711
3,2.0,0.159317,0.777017,0.207431,1.26731,-4.892918,-5e-06,0.159321,0.288177,2.283532,...,2577.61484,0.102971,0.714528,0.937895,-7.7e-05,9.928221,5.912786,123.75261,2.946008,130.208321
4,4.0,0.004404,0.577419,-0.104435,0.786225,-116.911618,0.056561,0.004249,0.692166,1.013605,...,-11887.079853,0.016446,0.011019,0.979425,0.221335,5.099541,6.89013,166.76198,2.189705,1.740353


In [18]:
y.head()

0    1.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: class, dtype: float64

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=341)

### Test data preprocessing

In [44]:
df1 = pd.read_csv("../data/test.csv", low_memory=False)
test = df1.apply(pd.to_numeric, errors='coerce')
test.isna().sum().sort_values(ascending=False)

current_assets_minus_inventories_over_long_term_liabilities    5283
sales_n_over_sales_n_minus_1                                   1840
profit_on_operating_activities_over_financial_expenses          773
sales_over_inventory                                            602
net_profit_over_inventory                                       600
                                                               ... 
sales_over_total_assets                                           1
long_term_liabilities_over_equity                                 1
forecasting period                                                0
working_capital                                                   0
id                                                                0
Length: 66, dtype: int64

In [21]:
for i in test:
    test.loc[test.loc[:,i].isnull(),i]=test.loc[:,i].mean()

In [22]:
test.isna().sum().sort_values(ascending=False)

id                                                                            0
EBITDA_profit_on_operating_activities_minus_depreciation_over_total_assets    0
operating_expenses_over_total_liabilities                                     0
profit_on_sales_over_total_assets                                             0
total_sales_over_total_assets                                                 0
                                                                             ..
net_profit_plus_depreciation_over_total_liabilities                           0
profit_on_operating_activities_over_financial_expenses                        0
working_capital_over_fixed_assets                                             0
logarithm_of_total_assets                                                     0
sales_over_fixed_assets                                                       0
Length: 66, dtype: int64

In [23]:
test1 = test.iloc[:, 1:]
test1.head()

Unnamed: 0,forecasting period,net_profit_over_total_assets,total_liabilities_over_total_assets,working_capital_over_total_assets,current_assets_over_short_term_liabilities,cash_plus_short_term_securities_plus_receivables_minus_short_term_liabilities_over_operating_expenses_minus_depreciation_times_365,retained_earnings_over_total_assets,EBIT_over_total_assets,book_value_of_equity_over_total_liabilities,sales_over_total_assets,...,working_capital,sales_minus_cost_of_products_sold_over_sales,current_assets_minus_inventory_minus_short_term_liabilities_over_sales_minus_gross_profit_minus_depreciation,total_costs_overtotal_sales,long_term_liabilities_over_equity,sales_over_inventory,sales_over_receivables,short_term_liabilities_times_365_over_sales,sales_over_short_term_liabilities,sales_over_fixed_assets
0,4.0,0.087054,0.504697,-0.213442,0.527201,-945.320442,2.456644e-07,0.091917,0.982293,1.642851,...,-19987.124701,0.322386,0.175775,0.706656,0.010392,28.872646,12.709311,100.292643,3.644498,2.154215
1,2.0,-0.21039,0.631828,0.228799,1.36995,-63.351273,-0.03844551,-0.258109,0.582886,1.670711,...,18.621606,-0.126736,-0.571445,1.152916,4e-05,3.070083,6.488415,135.027409,2.700548,10.951484
2,1.0,0.755349,0.208157,0.525567,3.525034,95.93687,0.007305173,0.755342,3.804564,2.604779,...,1440.011406,0.337905,0.953913,0.7087,0.000308,34.481702,5.197885,29.178703,12.513042,9.782107
3,2.0,0.090151,0.539555,0.220522,1.409651,-8.77594,-0.3165242,0.090153,0.852674,2.567658,...,503.854614,0.220679,0.195811,0.776341,0.000179,10.17822,5.473611,76.575956,4.771108,10.639495
4,4.0,0.023989,0.002572,0.987142,385.569944,1704.227987,1.438131e-05,0.02951,388.589564,0.210381,...,12841.024909,0.051816,0.024058,0.884155,4e-06,5.369218,14.109097,4.340005,81.967149,20.421013


Now that we have imputed both train and test using KNN we can start the model building. 

### Model Building

#### Logistic Regression

In [24]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [25]:
from sklearn.feature_selection import SelectKBest
k = 12
prep = SelectKBest(k=k)
X_train = prep.fit_transform(X_train, y_train)
X_test = prep.fit_transform(X_test, y_test)

In [28]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(n_jobs = -1, random_state=0, max_iter = 100000, solver = 'saga')
lr.fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)
print(f1_score(y_train, y_train_pred))
print(f1_score(y_test, y_test_pred))

0.004944375772558715
0.01238390092879257


In [None]:
from sklearn.model_selection import GridSearchCV
cv = 3
hyperparameters = {'penalty': [ 'l2','none'], 'C': [0.1, 1, 10, 100]}
regression_grid = GridSearchCV(lr, hyperparameters, scoring='accuracy', cv=cv, verbose=0, n_jobs=-1)
regression_grid.fit(X_train, y_train)

In [68]:
print('Regression Classifier Best Parameters')
print(regression_grid.best_params_)

Regression Classifier Best Parameters
{'C': 0.1, 'penalty': 'l2'}


##### Model evaluation

In [69]:
y_test_pred = regression_grid.best_estimator_.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7217    0]
 [ 319    1]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      7217
           1       1.00      0.00      0.01       320

    accuracy                           0.96      7537
   macro avg       0.98      0.50      0.49      7537
weighted avg       0.96      0.96      0.94      7537



### SVM

In [70]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [71]:
from sklearn.feature_selection import SelectKBest
k = 10
prep = SelectKBest(k=k)
X_train = prep.fit_transform(X_train, y_train)
X_test = prep.fit_transform(X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

clf = LinearSVC(random_state=0, C = 0.01, max_iter = 100000)
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)
print(f1_score(y_train, y_train_pred))
print(f1_score(y_test, y_test_pred)) 

In [73]:
from sklearn.model_selection import GridSearchCV
cv = 5
hyperparameters = {'C': [0.01, 0.1, 1, 10, 100]}
regression_grid = GridSearchCV(LinearSVC(random_state=92, max_iter=10000), hyperparameters, scoring='accuracy', cv=cv, verbose=0, n_jobs=-1)
regression_grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LinearSVC(max_iter=10000, random_state=92),
             n_jobs=-1, param_grid={'C': [0.01, 0.1, 1, 10, 100]},
             scoring='accuracy')

In [76]:
print('Regression Classifier Best Parameters')
print(regression_grid.best_params_)

Regression Classifier Best Parameters
{'C': 0.01}


In [77]:
y_test_pred = regression_grid.best_estimator_.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7216    1]
 [ 318    2]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      7217
           1       0.67      0.01      0.01       320

    accuracy                           0.96      7537
   macro avg       0.81      0.50      0.50      7537
weighted avg       0.95      0.96      0.94      7537



### Decision tree

In [24]:
X = train.iloc[:, 1:-1]

from sklearn.model_selection import train_test_split
y = train[['class']].values.ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [26]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

dt = DecisionTreeClassifier(random_state=92, max_depth=2)
dt.fit(X_train, y_train)
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)
print(f1_score(y_train, y_train_pred))
print(f1_score(y_test, y_test_pred))

0.403770620581304
0.4110429447852761


In [146]:
dt = DecisionTreeClassifier(random_state=0, max_depth=6)
dt.fit(X_train, y_train)
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)
print(f1_score(y_train, y_train_pred))
print(f1_score(y_test, y_test_pred))

0.6046114432109307
0.4890829694323144


In [73]:
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7191   26]
 [ 208  112]]
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98      7217
         1.0       0.81      0.35      0.49       320

    accuracy                           0.97      7537
   macro avg       0.89      0.67      0.74      7537
weighted avg       0.97      0.97      0.96      7537



### ANN

In [18]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn import preprocessing

In [19]:
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100, 50, 25, 2), random_state=0, beta_1 = 0.99) # alpha is for regularization (L2 regularization)
mlp.fit(X_train, y_train)
y_train_pred = mlp.predict(X_train)
y_test_pred = mlp.predict(X_test)
print('Training accuracy:'.format(), f1_score(y_train, y_train_pred))
print('Test accuracy:'.format(), f1_score(y_test, y_test_pred))

Training accuracy: 0.8376830044557607
Test accuracy: 0.43174603174603177


In [85]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100, 50, 25, 2), random_state=0, beta_1 = 0.99) # alpha is for regularization (L2 regularization)
mlp.fit(X_train, y_train)
y_train_pred = mlp.predict(X_train)
y_test_pred = mlp.predict(X_test)
print('Training accuracy:'.format(), f1_score(y_train, y_train_pred))
print('Test accuracy:'.format(), f1_score(y_test, y_test_pred))

Training accuracy: 0.8376830044557607
Test accuracy: 0.43174603174603177


In [87]:
scaler = preprocessing.StandardScaler().fit(test1)
test2 = scaler.transform(test1)
grid_predictions = mlp.predict(test2)
pred=pd.DataFrame(grid_predictions, index = test.id)
pred.value_counts()

0.0    11306
1.0     1068
dtype: int64

In [88]:
import os
os.chdir(r'../data')

pred.to_csv(r"submission_ann.csv")

In [18]:
from sklearn.model_selection import GridSearchCV
cv = 3
hyperparameters = {'alpha': [0.0001, 0.001, 0.01, 0.1]}
mlp_grid = GridSearchCV(mlp, hyperparameters, scoring='accuracy', cv=cv, verbose=0, n_jobs=-1)
mlp_grid.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2),
                                     max_iter=5000, random_state=0),
             n_jobs=-1, param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1]},
             scoring='accuracy')

In [19]:
print('Regression Classifier Best Parameters:')
print(mlp_grid.best_params_)

Regression Classifier Best Parameters:
{'alpha': 0.01}


In [23]:
y_test_pred = mlp_grid.best_estimator_.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7169   48]
 [ 240   80]]
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98      7217
         1.0       0.62      0.25      0.36       320

    accuracy                           0.96      7537
   macro avg       0.80      0.62      0.67      7537
weighted avg       0.95      0.96      0.95      7537



In [188]:
### Ridge classification

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import RidgeClassifier
clf = RidgeClassifier().fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[4792    2]
 [ 230    1]]
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.98      4794
         1.0       0.33      0.00      0.01       231

    accuracy                           0.95      5025
   macro avg       0.64      0.50      0.49      5025
weighted avg       0.93      0.95      0.93      5025



In [184]:
#### SGD

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

clf = make_pipeline(StandardScaler(),SGDClassifier(max_iter=1000, tol=1e-3,class_weight='balanced'))
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[3621 1173]
 [ 109  122]]
              precision    recall  f1-score   support

         0.0       0.97      0.76      0.85      4794
         1.0       0.09      0.53      0.16       231

    accuracy                           0.74      5025
   macro avg       0.53      0.64      0.50      5025
weighted avg       0.93      0.74      0.82      5025



In [186]:
from sklearn.linear_model import PassiveAggressiveClassifier

clf = PassiveAggressiveClassifier(max_iter=1000, random_state=0,tol=1e-3)
clf.fit(X_train, y_train)
PassiveAggressiveClassifier(random_state=0)

y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[4547  247]
 [ 205   26]]
              precision    recall  f1-score   support

         0.0       0.96      0.95      0.95      4794
         1.0       0.10      0.11      0.10       231

    accuracy                           0.91      5025
   macro avg       0.53      0.53      0.53      5025
weighted avg       0.92      0.91      0.91      5025



In [181]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

clf = LinearDiscriminantAnalysis()
clf.fit(X_train, y_train)
LinearDiscriminantAnalysis()
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[4789    5]
 [ 227    4]]
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.98      4794
         1.0       0.44      0.02      0.03       231

    accuracy                           0.95      5025
   macro avg       0.70      0.51      0.50      5025
weighted avg       0.93      0.95      0.93      5025



In [179]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clf = QuadraticDiscriminantAnalysis()
clf.fit(X_train, y_train)
QuadraticDiscriminantAnalysis()
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[ 338 4456]
 [  13  218]]
              precision    recall  f1-score   support

         0.0       0.96      0.07      0.13      4794
         1.0       0.05      0.94      0.09       231

    accuracy                           0.11      5025
   macro avg       0.50      0.51      0.11      5025
weighted avg       0.92      0.11      0.13      5025



In [177]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVC
clf = make_pipeline(StandardScaler(), NuSVC(nu=0.01,class_weight='balanced'))
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[1621 3173]
 [  48  183]]
              precision    recall  f1-score   support

         0.0       0.97      0.34      0.50      4794
         1.0       0.05      0.79      0.10       231

    accuracy                           0.36      5025
   macro avg       0.51      0.57      0.30      5025
weighted avg       0.93      0.36      0.48      5025



In [None]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
clf = make_pipeline(StandardScaler(),LinearSVC(random_state=0, tol=1e-5,max_iter=1000000,class_weight='balanced'))
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

In [27]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
gpc = GaussianProcessClassifier(random_state=0).fit(X_train, y_train)
y_test_pred = gpc.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

In [12]:
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7201   16]
 [ 318    2]]
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98      7217
         1.0       0.11      0.01      0.01       320

    accuracy                           0.96      7537
   macro avg       0.53      0.50      0.49      7537
weighted avg       0.92      0.96      0.94      7537



In [299]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
clf = GaussianNB()
clf.fit(X_train, y_train)
GaussianNB()
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[ 250 6967]
 [  14  306]]
              precision    recall  f1-score   support

         0.0       0.95      0.03      0.07      7217
         1.0       0.04      0.96      0.08       320

    accuracy                           0.07      7537
   macro avg       0.49      0.50      0.07      7537
weighted avg       0.91      0.07      0.07      7537



In [17]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[5764 1453]
 [ 176  144]]
              precision    recall  f1-score   support

         0.0       0.97      0.80      0.88      7217
         1.0       0.09      0.45      0.15       320

    accuracy                           0.78      7537
   macro avg       0.53      0.62      0.51      7537
weighted avg       0.93      0.78      0.85      7537



In [102]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7023  194]
 [ 155  165]]
              precision    recall  f1-score   support

         0.0       0.98      0.97      0.98      7217
         1.0       0.46      0.52      0.49       320

    accuracy                           0.95      7537
   macro avg       0.72      0.74      0.73      7537
weighted avg       0.96      0.95      0.95      7537



In [103]:
grid_predictions = clf.predict(test1)
pred=pd.DataFrame(grid_predictions, index = test.id)
pred.value_counts()

0.0    11757
1.0      617
dtype: int64

In [77]:
import os
os.chdir(r'../data')

pred.to_csv(r"submission_dt.csv")

In [23]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import make_classification
clf = BaggingClassifier(base_estimator=SVC(class_weight='balanced'),n_estimators=10, random_state=0).fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[4715 2502]
 [ 141  179]]
              precision    recall  f1-score   support

         0.0       0.97      0.65      0.78      7217
         1.0       0.07      0.56      0.12       320

    accuracy                           0.65      7537
   macro avg       0.52      0.61      0.45      7537
weighted avg       0.93      0.65      0.75      7537



In [247]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
clf =  RandomForestClassifier(n_estimators=25, random_state=0, bootstrap = False)
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7198   19]
 [ 209  111]]
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98      7217
         1.0       0.85      0.35      0.49       320

    accuracy                           0.97      7537
   macro avg       0.91      0.67      0.74      7537
weighted avg       0.97      0.97      0.96      7537



In [248]:
grid_predictions = clf.predict(test1)
pred=pd.DataFrame(grid_predictions, index = test.id)
pred.value_counts()

0.0    12139
1.0      235
dtype: int64

In [249]:
import os
os.chdir(r'../data')

pred.to_csv(r"submission_bt.csv")

In [28]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import make_classification
clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
ExtraTreesClassifier(random_state=0)
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7199   18]
 [ 306   14]]
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98      7217
         1.0       0.44      0.04      0.08       320

    accuracy                           0.96      7537
   macro avg       0.70      0.52      0.53      7537
weighted avg       0.94      0.96      0.94      7537



In [85]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state=0, learning_rate = 0.2, n_estimators = 3500).fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7196   21]
 [ 148  172]]
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      7217
         1.0       0.89      0.54      0.67       320

    accuracy                           0.98      7537
   macro avg       0.94      0.77      0.83      7537
weighted avg       0.98      0.98      0.97      7537



In [90]:
grid_predictions = clf.predict(test1)
pred=pd.DataFrame(grid_predictions, index = test.id)
pred.value_counts()

0.0    12075
1.0      299
dtype: int64

In [91]:
import os
os.chdir(r'../data')

pred.to_csv(r"submission_gb2.csv")

In [86]:
from sklearn.ensemble import GradientBoostingClassifier

clf1 = GradientBoostingClassifier(random_state=0, learning_rate = 0.2, n_estimators = 3500, min_samples_leaf = 9).fit(X_train, y_train)
y_test_pred = clf1.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7203   14]
 [ 149  171]]
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      7217
         1.0       0.92      0.53      0.68       320

    accuracy                           0.98      7537
   macro avg       0.95      0.77      0.83      7537
weighted avg       0.98      0.98      0.98      7537



In [108]:
from sklearn.ensemble import GradientBoostingClassifier

clf1 = GradientBoostingClassifier(random_state=0, n_estimators = 3500, min_samples_leaf = 8).fit(X_train, y_train)
y_test_pred = clf1.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7203   14]
 [ 152  168]]
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      7217
         1.0       0.92      0.53      0.67       320

    accuracy                           0.98      7537
   macro avg       0.95      0.76      0.83      7537
weighted avg       0.98      0.98      0.98      7537



In [87]:
grid_predictions = clf1.predict(test1)
pred=pd.DataFrame(grid_predictions, index = test.id)
pred.value_counts()

0.0    12085
1.0      289
dtype: int64

In [88]:
import os
os.chdir(r'../data')

pred.to_csv(r"submission_gb1.csv")

In [109]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state=0, n_estimators = 600, min_samples_leaf = 8).fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7213    4]
 [ 155  165]]
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      7217
         1.0       0.98      0.52      0.67       320

    accuracy                           0.98      7537
   macro avg       0.98      0.76      0.83      7537
weighted avg       0.98      0.98      0.98      7537



In [110]:
grid_predictions = clf.predict(test1)
pred=pd.DataFrame(grid_predictions, index = test.id)
pred.value_counts()

0.0    12122
1.0      252
dtype: int64

In [111]:
import os
os.chdir(r'../data')

pred.to_csv(r"submission_gb3.csv")

In [107]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state=0, n_estimators = 600, min_samples_leaf = 8, max_depth = 5).fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7209    8]
 [ 157  163]]
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      7217
         1.0       0.95      0.51      0.66       320

    accuracy                           0.98      7537
   macro avg       0.97      0.75      0.83      7537
weighted avg       0.98      0.98      0.97      7537



In [90]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.datasets import load_iris
clf = HistGradientBoostingClassifier().fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7213    4]
 [ 165  155]]
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      7217
         1.0       0.97      0.48      0.65       320

    accuracy                           0.98      7537
   macro avg       0.98      0.74      0.82      7537
weighted avg       0.98      0.98      0.97      7537



In [28]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.datasets import load_iris
clf = HistGradientBoostingClassifier(random_state=252, l2_regularization = 0.1).fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7212    5]
 [ 157  163]]
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      7217
         1.0       0.97      0.51      0.67       320

    accuracy                           0.98      7537
   macro avg       0.97      0.75      0.83      7537
weighted avg       0.98      0.98      0.98      7537



In [34]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
clf1 = LogisticRegression(n_jobs = -1, random_state=0, max_iter = 1, solver = 'saga')
clf2 = RandomForestClassifier(n_estimators=25, random_state=0, bootstrap = False)
clf3 = GaussianNB()
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')
eclf1 = eclf1.fit(X_train, y_train)

eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],voting='soft')
eclf2 = eclf2.fit(X_train, y_train)

eclf3 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],voting='soft', weights=[2,1,1],flatten_transform=True)
eclf3 = eclf3.fit(X_train, y_train)

In [295]:
y_test_pred = eclf2.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7154   63]
 [ 176  144]]
              precision    recall  f1-score   support

         0.0       0.98      0.99      0.98      7217
         1.0       0.70      0.45      0.55       320

    accuracy                           0.97      7537
   macro avg       0.84      0.72      0.77      7537
weighted avg       0.96      0.97      0.97      7537



In [245]:
grid_predictions = eclf2.predict(test1)
pred=pd.DataFrame(grid_predictions, index = test.id)
pred.value_counts()

0.0    12029
1.0      345
dtype: int64

In [246]:
import os
os.chdir(r'../data')

pred.to_csv(r"submission_vot.csv")

In [301]:
from sklearn.multiclass import OutputCodeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
clf = OutputCodeClassifier(estimator=RandomForestClassifier(n_estimators=25, random_state=0, bootstrap = False)).fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7198   19]
 [ 209  111]]
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98      7217
         1.0       0.85      0.35      0.49       320

    accuracy                           0.97      7537
   macro avg       0.91      0.67      0.74      7537
weighted avg       0.97      0.97      0.96      7537



### XGBoost

In [24]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [22]:
xg = xgb.XGBClassifier(verbosity=0, colsample_bytree = 0.9, use_label_encoder=False, n_estimators = 175, 
                      scale_pos_weight = 75).fit(X_train, y_train)

In [22]:
y_test_pred = xg.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[7180   37]
 [ 111  209]]
              precision    recall  f1-score   support

         0.0       0.98      0.99      0.99      7217
         1.0       0.85      0.65      0.74       320

    accuracy                           0.98      7537
   macro avg       0.92      0.82      0.86      7537
weighted avg       0.98      0.98      0.98      7537



In [366]:
grid_predictions = xg.predict(test1)
pred=pd.DataFrame(grid_predictions, index = test.id)
pred.value_counts()

0    11993
1      381
dtype: int64

In [367]:
import os
os.chdir(r'../data')

pred.to_csv(r"submission_xgb.csv")