In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression     
from sklearn.linear_model import SGDClassifier
import xgboost as xgb
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier         
from sklearn.ensemble import RandomForestClassifier    
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score, confusion_matrix

In [None]:
# Load the semicolon-separated dataset and preview the first rows.
data = pd.read_csv('data.csv', sep=';')
data.head()

In [None]:
# Summary of the frame: column dtypes and non-null counts.
data.info()

In [None]:
# Descriptive statistics of the frame's columns.
data.describe()

In [None]:
data.hist(figsize=(10,10)); # distribution of values for each feature

In [None]:
# Correlation matrix of the features.
# Only numeric/bool columns are passed to corr(): pandas >= 2.0 no longer drops
# non-numeric columns silently and raises instead.
# NOTE(review): order_date / region look non-numeric here — confirm against data.info().
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(data.select_dtypes(include=['number', 'bool']).corr(), square=True, annot=True, ax=ax)
plt.show()

In [None]:
# Occurrences per order_id; the original author noted that duplicates are present.
data['order_id'].value_counts()

In [None]:
# Report how many rows repeat an earlier order_id, then keep only the first occurrence.
# The count is printed explicitly: a bare expression that is not the last line of a
# cell is evaluated but never displayed.
n_duplicates = int(data.order_id.duplicated().sum())
print(f'duplicate order_id rows: {n_duplicates}')
data = data.drop_duplicates(subset='order_id')

### 1. closed_cred_sum - сумма, которую клиент выплатил по ранее взятым и закрытым кредитам

In [None]:
# Summary statistics for closed_cred_sum.
data['closed_cred_sum'].describe()

In [None]:
# Number of missing values in closed_cred_sum (the original author observed none).
int(data['closed_cred_sum'].isna().sum())

In [None]:
# Visual check for outliers: closed_cred_sum plotted against order_id.
sns.scatterplot(data=data, x='order_id', y='closed_cred_sum');

In [None]:
# Drop outliers: keep only rows with closed_cred_sum below 20,000,000.
data = data[data['closed_cred_sum'] < 20000000]

In [None]:
# Standardise closed_cred_sum with StandardScaler.
# fit_transform returns an (n, 1) array, so it is flattened and attached through assign(),
# which builds a new frame instead of writing into a filtered one (avoids SettingWithCopyWarning).
# NOTE(review): the scaler is fit on all rows before the train/test split — confirm this is acceptable.
data = data.assign(closed_cred_sum=StandardScaler().fit_transform(data[['closed_cred_sum']]).ravel())

### 2. closed_creds - количество закрытых кредитов

In [None]:
# Summary statistics for closed_creds.
data['closed_creds'].describe()

In [None]:
# Number of missing values in closed_creds (the original author observed none).
int(data['closed_creds'].isna().sum())

In [None]:
# Visual check for outliers: closed_creds plotted against order_id.
sns.scatterplot(data=data, x='order_id', y='closed_creds');

In [None]:
# Drop outliers: keep only rows with closed_creds below 200.
data = data[data['closed_creds'] < 200]

In [None]:
# Standardise closed_creds with StandardScaler.
# The (n, 1) result is flattened and attached through assign(), which builds a new frame
# rather than writing into a filtered one (avoids SettingWithCopyWarning).
# NOTE(review): the scaler is fit on all rows before the train/test split — confirm this is acceptable.
data = data.assign(closed_creds=StandardScaler().fit_transform(data[['closed_creds']]).ravel())

### 3. active_cred_sum_overdue - суммарная задолженность по активным кредитам

In [None]:
# Summary statistics for active_cred_sum_overdue.
data['active_cred_sum_overdue'].describe()

In [None]:
# Number of missing values in active_cred_sum_overdue (the original author observed none).
int(data['active_cred_sum_overdue'].isna().sum())

In [None]:
# Visual check for outliers: active_cred_sum_overdue plotted against order_id.
sns.scatterplot(data=data, x='order_id', y='active_cred_sum_overdue');

In [None]:
# Drop outliers: keep only rows with active_cred_sum_overdue below 10,000,000.
data = data[data['active_cred_sum_overdue'] < 10000000]

In [None]:
# Standardise active_cred_sum_overdue with StandardScaler.
# The (n, 1) result is flattened and attached through assign(), which builds a new frame
# rather than writing into a filtered one (avoids SettingWithCopyWarning).
# NOTE(review): the scaler is fit on all rows before the train/test split — confirm this is acceptable.
data = data.assign(
    active_cred_sum_overdue=StandardScaler().fit_transform(data[['active_cred_sum_overdue']]).ravel()
)

### 4. active_cred_sum - сумма всех активных кредитов

In [None]:
# Summary statistics for active_cred_sum.
data['active_cred_sum'].describe()

In [None]:
# Visual check for outliers: active_cred_sum plotted against order_id.
sns.scatterplot(data=data, x='order_id', y='active_cred_sum');

In [None]:
# Number of missing values in active_cred_sum (the original author observed none).
data['active_cred_sum'].isna().sum()

In [None]:
# Drop outliers: keep only rows with active_cred_sum below 7,000,000.
data = data[data['active_cred_sum'] < 7000000]

In [None]:
# Standardise active_cred_sum with StandardScaler.
# The (n, 1) result is flattened and attached through assign(), which builds a new frame
# rather than writing into a filtered one (avoids SettingWithCopyWarning).
# NOTE(review): the scaler is fit on all rows before the train/test split — confirm this is acceptable.
data = data.assign(active_cred_sum=StandardScaler().fit_transform(data[['active_cred_sum']]).ravel())

### 5. active_cred_day_overdue - суммарная просрочка в днях по всем активным кредитам

In [None]:
# Summary statistics for active_cred_day_overdue.
data['active_cred_day_overdue'].describe()

In [None]:
# Visual check for outliers: active_cred_day_overdue plotted against order_id.
sns.scatterplot(data=data, x='order_id', y='active_cred_day_overdue');

In [None]:
# Number of missing values in active_cred_day_overdue (the original author observed none).
data['active_cred_day_overdue'].isna().sum()

In [None]:
# Drop outliers: keep only rows with active_cred_day_overdue below 18,000.
data = data[data['active_cred_day_overdue'] < 18000]

In [None]:
# Standardise active_cred_day_overdue with StandardScaler.
# The (n, 1) result is flattened and attached through assign(), which builds a new frame
# rather than writing into a filtered one (avoids SettingWithCopyWarning).
# NOTE(review): the scaler is fit on all rows before the train/test split — confirm this is acceptable.
data = data.assign(
    active_cred_day_overdue=StandardScaler().fit_transform(data[['active_cred_day_overdue']]).ravel()
)

### 6. active_cred_max_overdue - максимальная сумма просроченной задолженности по активным кредитам

In [None]:
# Summary statistics for active_cred_max_overdue.
data['active_cred_max_overdue'].describe()

In [None]:
# Visual check for outliers: active_cred_max_overdue plotted against order_id.
sns.scatterplot(data=data, x='order_id', y='active_cred_max_overdue');

In [None]:
# Number of missing values in active_cred_max_overdue (the original author observed none).
data['active_cred_max_overdue'].isna().sum()

In [None]:
# Drop outliers: keep only rows with active_cred_max_overdue below 4,000,000.
data = data[data['active_cred_max_overdue'] < 4000000]

In [None]:
# Standardise active_cred_max_overdue with StandardScaler.
# The (n, 1) result is flattened and attached through assign(), which builds a new frame
# rather than writing into a filtered one (avoids SettingWithCopyWarning).
# NOTE(review): the scaler is fit on all rows before the train/test split — confirm this is acceptable.
data = data.assign(
    active_cred_max_overdue=StandardScaler().fit_transform(data[['active_cred_max_overdue']]).ravel()
)

### 7. age

In [None]:
# Summary statistics for age.
data['age'].describe()

In [None]:
# age plotted against order_id.
sns.scatterplot(data=data, x='order_id', y='age');

### 8. gender

In [None]:
# Number of rows per gender value.
data['gender'].value_counts()

In [None]:
# Histogram of the gender column.
data['gender'].hist();

### 9. region

In [None]:
# Number of rows per region value.
data['region'].value_counts()

### 10. month_income

In [None]:
# Summary statistics for month_income.
data['month_income'].describe()

In [None]:
# Number of missing values in month_income.
data['month_income'].isna().sum()

In [None]:
# Frequency of each distinct month_income value.
data['month_income'].value_counts()

In [None]:
# Share of rows whose month_income equals zero, reported as a percentage with one decimal.
zero_income_rows = int((data['month_income'] == 0).sum())
income_0 = zero_income_rows / len(data)

share_pct = round(income_0*100,1)
print('Без указания заработной платы : {}%'.format(share_pct))

In [None]:
# Replace zero month_income with the median income.
# The median is taken over non-zero incomes only: including the placeholder zeros
# biases it downwards, and if zeros are the majority it becomes 0, making the replacement a no-op.
# assign() builds a new frame rather than writing into a filtered one.
# NOTE(review): if every income is zero the median is NaN and all values become NaN.
reported_median = data.loc[data.month_income != 0, 'month_income'].median()
data = data.assign(
    month_income=np.where(data.month_income == 0, reported_median, data.month_income)
)

In [None]:
# Visual check for outliers: month_income plotted against order_id.
sns.scatterplot(data=data, x='order_id', y='month_income');

In [None]:
# Drop outliers: keep only rows with month_income below 200,000.
data = data[data['month_income'] < 200000]

In [None]:
# Standardise month_income with StandardScaler.
# The (n, 1) result is flattened and attached through assign(), which builds a new frame
# rather than writing into a filtered one (avoids SettingWithCopyWarning).
# NOTE(review): the scaler is fit on all rows before the train/test split — confirm this is acceptable.
data = data.assign(month_income=StandardScaler().fit_transform(data[['month_income']]).ravel())

### 11. first_loan - сумма, которую клиент хочет получить в кредит (из заявки)

In [None]:
# Summary statistics for first_loan.
data['first_loan'].describe()

In [None]:
# Number of missing values in first_loan.
data['first_loan'].isna().sum()

In [None]:
# Remove every row that contains a missing value in any column.
# Rebinding the name instead of using inplace=True avoids mutating a frame that was
# produced by filtering (SettingWithCopyWarning) and keeps the step chainable.
data = data.dropna()

In [None]:
# first_loan plotted against order_id.
sns.scatterplot(data=data, x='order_id', y='first_loan');

In [None]:
# Standardise first_loan with StandardScaler.
# The (n, 1) result is flattened and attached through assign(), which builds a new frame
# rather than writing into a filtered one (avoids SettingWithCopyWarning).
# NOTE(review): the scaler is fit on all rows before the train/test split — confirm this is acceptable.
data = data.assign(first_loan=StandardScaler().fit_transform(data[['first_loan']]).ravel())

### 12. first_days_quant - срок, на который клиент хочет взять кредит

In [None]:
# Summary statistics for first_days_quant.
data['first_days_quant'].describe()

In [None]:
# first_days_quant plotted against order_id.
sns.scatterplot(data=data, x='order_id', y='first_days_quant');

In [None]:
# Number of missing values in first_days_quant (the original author observed none).
data['first_days_quant'].isna().sum()

In [None]:
# Standardise first_days_quant with StandardScaler.
# The (n, 1) result is flattened and attached through assign(), which builds a new frame
# rather than writing into a filtered one (avoids SettingWithCopyWarning).
# NOTE(review): the scaler is fit on all rows before the train/test split — confirm this is acceptable.
data = data.assign(first_days_quant=StandardScaler().fit_transform(data[['first_days_quant']]).ravel())

### 13. loan_cost_all - одобренная клиенту сумма кредита 

In [None]:
# Summary statistics for loan_cost_all.
data['loan_cost_all'].describe()

In [None]:
# Number of missing values in loan_cost_all (the original author observed none).
data['loan_cost_all'].isna().sum()

In [None]:
# loan_cost_all plotted against order_id.
sns.scatterplot(data=data, x='order_id', y='loan_cost_all');

In [None]:
# Standardise loan_cost_all with StandardScaler.
# The (n, 1) result is flattened and attached through assign(), which builds a new frame
# rather than writing into a filtered one (avoids SettingWithCopyWarning).
# NOTE(review): the scaler is fit on all rows before the train/test split — confirm this is acceptable.
data = data.assign(loan_cost_all=StandardScaler().fit_transform(data[['loan_cost_all']]).ravel())

### 14. order_date - дата подачи заявки на получение кредита

In [None]:
# Report the earliest and latest order_date present in the data.
first_date = data['order_date'].min()
last_date = data['order_date'].max()
print(f'Данные представлены за период с {first_date} по {last_date}')

### 15. cost_all - сумма всех выплат по одобренному кредиту

In [None]:
# Summary statistics for cost_all.
data['cost_all'].describe()

In [None]:
# Number of missing values in cost_all (the original author observed none).
data['cost_all'].isna().sum()

In [None]:
# Visual check for outliers: cost_all plotted against order_id.
sns.scatterplot(data=data, x='order_id', y='cost_all');

In [None]:
# Drop outliers: keep only rows with cost_all below 150,000.
data = data[data['cost_all'] < 150000]

In [None]:
# Standardise cost_all with StandardScaler.
# The (n, 1) result is flattened and attached through assign(), which builds a new frame
# rather than writing into a filtered one (avoids SettingWithCopyWarning).
# NOTE(review): the scaler is fit on all rows before the train/test split — confirm this is acceptable.
data = data.assign(cost_all=StandardScaler().fit_transform(data[['cost_all']]).ravel())

### 16. expert - решение, которое принято по заявке (0 - одобрено, 1 - отклонено)

In [None]:
# Class balance of the target: histogram plus relative frequencies.
# Original author's observation: the "rejected" class has about three times as many
# observations as the "approved" class.
data['expert'].hist()
data['expert'].value_counts(normalize=True)

# Выбор модели

In [None]:
# Feature matrix: every column except the identifiers, the date, the region and the target.
X = data.drop(['order_id', 'client_id', 'order_date', 'region', 'expert'], axis=1)
# Target vector: the decision recorded in the expert column.
y = data['expert']

In [None]:
# Accumulators for the model comparison table (one entry per fitted classifier).
clf_name, clf_score = [], []
# Hold out 33% of the rows for evaluation; the seed is fixed for reproducibility.
# NOTE(review): the split is not stratified although the classes are imbalanced — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

## RandomForestClassifier

In [None]:
# Random forest with 'balanced_subsample' class weights; its hold-out accuracy is
# recorded for the comparison table.
random_forest_clf = RandomForestClassifier(class_weight='balanced_subsample', random_state=42)
random_forest_clf = random_forest_clf.fit(X_train, y_train)
rf_pred = random_forest_clf.predict(X_test)
random_forest_clf_score = random_forest_clf.score(X_test, y_test)
clf_name.append('RandomForestClassifier')
clf_score.append(random_forest_clf_score)
print(f'RandomForestClassifier - {random_forest_clf_score}')

In [None]:
# Pair each feature name with the forest's importance score, most important first.
data_feat = (
    pd.DataFrame({
        'feature': X_test.columns,
        'feature_importances': random_forest_clf.feature_importances_,
    })
    .sort_values(by='feature_importances', ascending=False)
)

In [None]:
# Horizontal bar chart of the random-forest feature importances.
sns.barplot(data=data_feat, x='feature_importances', y='feature')

## SVM

In [None]:
# Support-vector classifier with default settings; hold-out accuracy goes into the comparison table.
svm_clf = svm.SVC().fit(X_train, y_train)
svm_score = svm_clf.score(X_test, y_test)
svm_pred = svm_clf.predict(X_test)
clf_name.append('SVM')
clf_score.append(svm_score)
print(f'SVM - {svm_score}')

## GradientBoostingClassifier

In [None]:
# Gradient boosting with learning rate 0.01; hold-out accuracy goes into the comparison table.
grad_clf = GradientBoostingClassifier(learning_rate=0.01, random_state=42)
grad_clf = grad_clf.fit(X_train, y_train)
grad_predict = grad_clf.predict(X_test)
grad_score = grad_clf.score(X_test, y_test)
clf_name.append('GradientBoostingClassifier')
clf_score.append(grad_score)
print(f'GradientBoostingClassifier - {grad_score}')

## MLPClassifier

In [None]:
# Multi-layer perceptron trained with Adam; hold-out accuracy goes into the comparison table.
# NOTE(review): activation='identity' applies no non-linearity in the hidden layers — confirm this is intended.
mlp_clf = MLPClassifier(
    solver='adam',
    alpha=1e-5,
    activation='identity',
    random_state=42,
)
mlp_clf = mlp_clf.fit(X_train, y_train)
mlp_predict = mlp_clf.predict(X_test)
mlp_clf_score = mlp_clf.score(X_test, y_test)
clf_name.append('MLPClassifier')
clf_score.append(mlp_clf_score)
print(f'MLPClassifier - {mlp_clf_score}')

## Naive Bayes 

In [None]:
# Gaussian naive Bayes with default settings; hold-out accuracy goes into the comparison table.
bayes_clf = GaussianNB().fit(X_train, y_train)
bayes_score = bayes_clf.score(X_test, y_test)
clf_name.append('Naive Bayes')
clf_score.append(bayes_score)
print(f'Naive Bayes - {bayes_score}')

## SGDClassifier

In [None]:
# Linear model trained with stochastic gradient descent; hold-out accuracy goes into the comparison table.
sgd_clf = SGDClassifier(random_state=42).fit(X_train, y_train)
sgd_clf_score = sgd_clf.score(X_test, y_test)
clf_name.append('SGDClassifier')
clf_score.append(sgd_clf_score)
print(f'SGDClassifier - {sgd_clf_score}')

## DecisionTreeClassifier  

In [None]:
# Single decision tree with a fixed seed; hold-out accuracy goes into the comparison table.
tree_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
tree_clf_score = tree_clf.score(X_test, y_test)
clf_name.append('DecisionTreeClassifier')
clf_score.append(tree_clf_score)
print(f'DecisionTreeClassifier - {tree_clf_score}')

## LogisticRegression

In [None]:
# Logistic regression capped at 400 solver iterations; hold-out accuracy goes into the comparison table.
log_reg_clf = LogisticRegression(max_iter=400, random_state=42)
log_reg_clf = log_reg_clf.fit(X_train, y_train)
log_reg_pred = log_reg_clf.predict(X_test)
log_reg_clf_score = log_reg_clf.score(X_test, y_test)
clf_name.append('LogisticRegression')
clf_score.append(log_reg_clf_score)
print(f'LogisticRegression - {log_reg_clf_score}')

## XGBoost 

In [None]:
# XGBoost classifier; predictions are kept for the detailed metrics computed later.
clf_xgb = xgb.XGBClassifier(alpha=0.001)
clf_xgb.fit(X_train,y_train)
xgb_predict = clf_xgb.predict(X_test)
# The score has to be computed here: it was commented out while the append below
# still referenced it, which raised NameError on a fresh kernel.
clf_xgb_score = clf_xgb.score(X_test,y_test)
print(f'XGBoost - {clf_xgb_score}')
clf_name.append('XGBoost')
clf_score.append(clf_xgb_score)

In [None]:
# Comparison table of all fitted classifiers, best hold-out accuracy first.
res_data = pd.DataFrame({'clf': clf_name, 'clf_score': clf_score})
res_data = res_data.sort_values(by='clf_score', ascending=False)
res_data

In [None]:
# Quality metrics for the XGBoost predictions on the hold-out set.
# NOTE(review): precision/recall/f1 use scikit-learn's default positive label (1) — confirm
# that this is the class of interest.
confusion_matrix_ = confusion_matrix(y_test, xgb_predict)
f1_score_ = f1_score(y_test, xgb_predict)
recall = recall_score(y_test, xgb_predict)
precision = precision_score(y_test, xgb_predict)
accuracy = accuracy_score(y_test, xgb_predict)

for label, value in (
    ('accuracy', accuracy),
    ('precision', precision),
    ('recall', recall),
    ('f1_score', f1_score_),
):
    print(f'{label} - {value}')
print(confusion_matrix_)