In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split # split data into training and validation data, for both features and target
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
import xgboost as XGB
from sklearn.impute import SimpleImputer
# data normalization with sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score 
from sklearn.compose import ColumnTransformer #cross validation
from sklearn.pipeline import Pipeline
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline

# classification Logistic Regression,k-Nearest Neighbors,Decisiontrees,SupportVectorMachine,Naive Bayes
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file below the input directory.
# os.walk ignores directory-listing errors by default, so nothing is printed
# when '/kaggle/input' does not exist.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load data
# Reads the bankruptcy CSV into a DataFrame; the absolute path is Kaggle-specific.
company_bankruptcy = pd.read_csv('/kaggle/input/company-bankruptcy-prediction/data.csv') 

**Exploratory Data Analysis**

In [None]:
# Preview the first rows to check column names and value formats.
company_bankruptcy.head()

In [None]:
# Column dtypes and non-null counts.
company_bankruptcy.info()

In [None]:
# Summary statistics for the numeric columns.
company_bankruptcy.describe()

In [None]:
# Count missing values per column.
company_bankruptcy.isnull().sum()
# Recorded observation: no null values.

In [None]:
# Target: the 'Bankrupt?' label column.
y = company_bankruptcy['Bankrupt?']

# Features: every remaining column. drop() returns a new frame, so the loaded
# data is left untouched and this cell can be re-run safely.
X = company_bankruptcy.drop(columns=['Bankrupt?'])

In [None]:
# Check that the label column is no longer among the features.
X.head()

In [None]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# ratio the same in both splits; random_state makes the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=0,
)

In [None]:
# Learn the per-feature min/max from the training split only and scale it in
# the same step, so validation rows never influence the scaler.
norm = MinMaxScaler()
train_X = norm.fit_transform(train_X)

# Scale the validation split with the training-fitted scaler.
# NOTE: train_X / val_X are overwritten here, so re-running this cell refits
# `norm` on data that has already been scaled.
val_X = norm.transform(val_X)

In [None]:
# Decision tree + SimpleImputer, 3-fold CV (recorded score: 0.2677)
def get_score():
    """Return the mean 3-fold cross-validated F1 of an imputer + decision-tree pipeline.

    Evaluated on the notebook-level ``X`` and ``y``.
    """
    steps = [
        ('preprocessor', SimpleImputer()),
        ('model', DecisionTreeClassifier(random_state=0)),
    ]
    fold_f1 = cross_val_score(Pipeline(steps=steps), X, y, scoring='f1', cv=3)
    return fold_f1.mean()
get_score()

In [None]:
# Decision tree + MinMaxScaler, 45-fold CV.
# Recorded: 0.27440 here vs 0.2677 for the SimpleImputer variant.
# NOTE(review): this run uses cv=45 while get_score uses cv=3, so the gap may
# come from the fold count rather than from the preprocessing step.
def get_scored():
    """Return the mean 45-fold cross-validated F1 of a scaler + decision-tree pipeline.

    Evaluated on the notebook-level ``X`` and ``y``; folds run in parallel.
    """
    pipe = Pipeline(steps=[
        ('preprocessor', MinMaxScaler()),
        ('model', DecisionTreeClassifier(random_state=0)),
    ])
    return cross_val_score(pipe, X, y, scoring='f1', cv=45, n_jobs=-1).mean()
get_scored()

In [None]:
# SVC + SimpleImputer, 45-fold CV (author's note: no difference on this)
def get_svc():
    """Return the mean 45-fold cross-validated F1 of an imputer + SVC pipeline.

    Evaluated on the notebook-level ``X`` and ``y``.
    """
    steps = [
        ('preprocessor', SimpleImputer()),
        ('model', SVC(random_state=0)),
    ]
    fold_f1 = cross_val_score(Pipeline(steps=steps), X, y, scoring='f1', cv=45)
    return fold_f1.mean()
get_svc()

In [None]:
# Gaussian Naive Bayes + SimpleImputer, 3-fold CV
def get_gnb():
    """Return the mean 3-fold cross-validated F1 of an imputer + GaussianNB pipeline.

    Evaluated on the notebook-level ``X`` and ``y``.
    """
    pipe = Pipeline(steps=[
        ('preprocessor', SimpleImputer()),
        ('model', GaussianNB()),
    ])
    return cross_val_score(pipe, X, y, scoring='f1', cv=3).mean()
get_gnb()

In [None]:
# Gaussian Naive Bayes + MinMaxScaler, 200-fold CV on the training split.
# Author's note: scaling also performed better here, and a higher cv gave a
# higher score (0.185219 was noted alongside "50", presumably cv=50).
# NOTE(review): train_X was already min-max scaled in an earlier cell, so the
# pipeline's scaler is applied on top of that. Also confirm the positive class
# has at least 200 rows in train_y; otherwise some folds contain no positives
# and their F1 is not meaningful.
def get_gnbs():
    """Return the mean 200-fold cross-validated F1 of a scaler + GaussianNB pipeline.

    Evaluated on the notebook-level ``train_X`` and ``train_y``; folds run in parallel.
    """
    steps = [
        ('preprocessor', MinMaxScaler()),
        ('model', GaussianNB()),
    ]
    fold_f1 = cross_val_score(
        Pipeline(steps=steps), train_X, train_y,
        scoring='f1', cv=200, n_jobs=-1,
    )
    return fold_f1.mean()
get_gnbs()
# Recorded score: 0.19823376855393035

In [None]:
# Logistic regression + MinMaxScaler, 2-fold CV
def get_log():
    """Return the mean 2-fold cross-validated F1 of a scaler + logistic-regression pipeline.

    Evaluated on the notebook-level ``X`` and ``y``; folds run in parallel.
    """
    pipe = Pipeline(steps=[
        ('preprocessor', MinMaxScaler()),
        ('model', LogisticRegression(solver='liblinear')),
    ])
    return cross_val_score(pipe, X, y, scoring='f1', cv=2, n_jobs=-1).mean()
get_log()
# Recorded score: 0.19976635514018692

In [None]:
# Logistic regression on the hold-out split (recorded F-measure: 0.043).
# The earlier note mentions "l1_ratio = 0.5", which is not set on this model.
# fit() returns the estimator itself, so construction and training are chained.
log_model = LogisticRegression(solver='liblinear').fit(train_X, train_y)
# Predict the validation labels and report their F1.
log_pred = log_model.predict(val_X)
print('F-measure: %.3f' % f1_score(val_y, log_pred))

In [None]:
# Peek at the first few predicted labels from the logistic model.
pd.DataFrame(log_pred).head()

In [None]:
# Gaussian Naive Bayes on the hold-out split (recorded F-measure: 0.063).
# fit() returns the estimator itself, so construction and training are chained.
gnb2_model = GaussianNB().fit(train_X, train_y)

# Predict the validation labels and report their F1.
gnb2_pred = gnb2_model.predict(val_X)
print('F-measure: %.3f' % f1_score(val_y, gnb2_pred))

# Recorded score without encoding: 0.059

In [None]:
# SVC on the hold-out split.
# The 0.043 previously recorded for this cell was computed from log_model's
# predictions (which is why it equalled the logistic-regression score);
# re-run the cell to obtain the SVC's own F-measure.
svc2_model = SVC()
# fit model
svc2_model.fit(train_X, train_y)
# predict on the validation split with the SVC itself, not log_model
svc2_pred = svc2_model.predict(val_X)
print('F-measure: %.3f' % f1_score(val_y, svc2_pred))

In [None]:
# Decision tree on the hold-out split (0.319 was recorded from an unseeded run).
# random_state is pinned so the score is reproducible across re-runs and uses
# the same seed as the cross-validated decision-tree cells.
dtc2_model = DecisionTreeClassifier(random_state=0)
# fit model
dtc2_model.fit(train_X, train_y)
# predict on the validation split
dtc2_pred = dtc2_model.predict(val_X)
print('F-measure: %.3f' % f1_score(val_y, dtc2_pred))

In [None]:
# XGBoost on the hold-out split (recorded F-measures: 0.333, 0.388).
# fit() returns the estimator itself, so construction and training are chained.
xgb_model = XGB.XGBClassifier().fit(train_X, train_y)
# Predict the validation labels and report their F1.
xgb_pred = xgb_model.predict(val_X)
print('F-measure: %.3f' % f1_score(val_y, xgb_pred))

In [None]:
# XGBoost + MinMaxScaler, 2-fold CV on the training split
def get_xgb():
    """Return the mean 2-fold cross-validated F1 of a scaler + XGBoost pipeline.

    Evaluated on the notebook-level ``train_X`` and ``train_y``; folds run in parallel.
    """
    steps = [
        ('preprocessor', MinMaxScaler()),
        ('model', XGB.XGBClassifier()),
    ]
    fold_f1 = cross_val_score(
        Pipeline(steps=steps), train_X, train_y,
        scoring='f1', cv=2, n_jobs=-1,
    )
    return fold_f1.mean()
get_xgb()
# Author's conclusion: XGB performed better than all using cross_val_score.
# NOTE(review): the cross-validation cells use different cv values and different
# inputs (X/y vs train_X/train_y), so the scores are not directly comparable.