### Credit Scoring 

In [None]:
# Mount Google Drive inside the Colab runtime; the CSV files read later in this
# notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 1 - Read the data and inspect it

In [None]:
import os
import numpy as np
from matplotlib import pyplot

import pandas as pd 

# tells matplotlib to embed plots within the notebook
%matplotlib inline

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Load the training set from the mounted Drive folder (requires the Drive mount cell to have run).
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data_train.csv')

In [None]:
df.head()

Unnamed: 0,label,Age,Language,Sex,Marital,Has_Credit,Field,Month_of_birth,Day_of_birth,Region,Number_of_credits,Linked_cards,INPS_mln_sum,INPS_yes_no,Score_level,Score_class,Score_point,Changed_phone_number
0,0,34,1,2,6,2,13,12,1,12,1,0,0.0,0,0,0,-,1
1,0,38,1,1,5,1,10,7,1,13,1,2,0.0,0,0,0,-,1
2,0,35,1,2,4,2,9,8,1,13,4,1,1.0,1,0,0,-,1
3,0,27,1,1,5,2,13,7,1,12,1,2,1.0,0,0,0,-,1
4,0,32,1,2,4,2,10,7,1,13,3,1,2.0,1,0,0,-,1


In [None]:
df.tail()

Unnamed: 0,label,Age,Language,Sex,Marital,Has_Credit,Field,Month_of_birth,Day_of_birth,Region,Number_of_credits,Linked_cards,INPS_mln_sum,INPS_yes_no,Score_level,Score_class,Score_point,Changed_phone_number
8702,1,38,1,2,4,1,16,12,2,9,1,1,3.0,1,0,0,0,0
8703,1,32,1,2,4,1,4,1,2,12,1,1,1.5,1,0,0,0,0
8704,1,24,1,1,7,1,5,10,1,4,1,1,0.0,0,0,0,0,0
8705,1,31,1,2,4,1,1,6,2,7,1,1,0.0,0,3,6,237,0
8706,1,40,1,2,4,1,5,4,1,2,1,1,0.0,0,3,6,263,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8707 entries, 0 to 8706
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   label                 8707 non-null   int64  
 1   Age                   8707 non-null   int64  
 2   Language              8707 non-null   int64  
 3   Sex                   8707 non-null   int64  
 4   Marital               8707 non-null   int64  
 5   Has_Credit            8707 non-null   int64  
 6   Field                 8707 non-null   int64  
 7   Month_of_birth        8707 non-null   int64  
 8   Day_of_birth          8707 non-null   int64  
 9   Region                8707 non-null   int64  
 10  Number_of_credits     8707 non-null   int64  
 11  Linked_cards          8707 non-null   int64  
 12  INPS_mln_sum          8707 non-null   float64
 13  INPS_yes_no           8707 non-null   int64  
 14  Score_level           8707 non-null   int64  
 15  Score_class          

### Count the null values in each column


In [None]:
df.isnull().sum()

label                   0
Age                     0
Language                0
Sex                     0
Marital                 0
Has_Credit              0
Field                   0
Month_of_birth          0
Day_of_birth            0
Region                  0
Number_of_credits       0
Linked_cards            0
INPS_mln_sum            0
INPS_yes_no             0
Score_level             0
Score_class             0
Score_point             0
Changed_phone_number    0
dtype: int64

In [None]:
df.shape

(8707, 18)

2 - Data cleaning and preparation

In [None]:
# Work on an independent copy: a plain `df2 = df` would only create a second
# name for the same object, so any in-place change to df2 would also change the
# raw frame `df`.
df2 = df.copy()
df2.head()

Unnamed: 0,label,Age,Language,Sex,Marital,Has_Credit,Field,Month_of_birth,Day_of_birth,Region,Number_of_credits,Linked_cards,INPS_mln_sum,INPS_yes_no,Score_level,Score_class,Score_point,Changed_phone_number
0,0,34,1,2,6,2,13,12,1,12,1,0,0.0,0,0,0,-,1
1,0,38,1,1,5,1,10,7,1,13,1,2,0.0,0,0,0,-,1
2,0,35,1,2,4,2,9,8,1,13,4,1,1.0,1,0,0,-,1
3,0,27,1,1,5,2,13,7,1,12,1,2,1.0,0,0,0,-,1
4,0,32,1,2,4,2,10,7,1,13,3,1,2.0,1,0,0,-,1


2.2. Some columns use "-" as a placeholder for missing data; we replace it with NaN

In [None]:
# Turn every "-" placeholder into a proper missing value (NaN); `replace`
# returns a new frame, which is bound back to df2.
df2 = df2.replace(to_replace='-', value=np.nan)
df2.shape

(8707, 18)

2.3. Convert every column (including columns stored with the "object" dtype) to float64

In [None]:
columns = df2.columns

# Cast the whole frame to float64 in a single call instead of column by column.
df2 = df2.astype('float64')

2.4. Analysis of the missing (NaN) values in each column

In [None]:
def missing_values_table(mydf):
    """Summarise the missing values of a DataFrame, column by column.

    Parameters
    ----------
    mydf : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``mydf`` that contains at least one missing
        value (the index holds the column names). The first column is the
        number of missing values, the second
        ('% What is the percentage of the full column') is the share of rows
        they represent, rounded to one decimal. Rows are sorted by that
        percentage in descending order. The table is empty when nothing is
        missing, including when ``mydf`` has no rows.
    """
    pct_label = '% What is the percentage of the full column'

    # Count the nulls once and reuse the result for both output columns.
    mis_val = mydf.isnull().sum()

    n_rows = len(mydf)
    if n_rows:
        mis_val_percent = 100 * mis_val / n_rows
    else:
        # A frame without rows has nothing missing; dividing by zero here would
        # produce NaN percentages instead.
        mis_val_percent = mis_val.astype('float64')

    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)

    # NOTE: the first label contains invisible zero-width characters between
    # "Values " and "are"; it is kept unchanged for backward compatibility.
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Values ​​are missing', 1: pct_label})

    # Keep only columns that really have missing values. Filtering on the count
    # (rather than on "percentage != 0") cannot be fooled by NaN percentages.
    has_missing = mis_val_table_ren_columns.iloc[:, 0] > 0

    return (
        mis_val_table_ren_columns[has_missing]
        .sort_values(pct_label, ascending=False)
        .round(1)
    )

In [None]:
missing_values_table(df2)

2.5. Remove columns with too many missing (null) values (this text says more than 30%; note that the code below uses a cutoff of 6%)

In [None]:
# Build the missing-value summary and collect the names of the columns whose
# missing percentage is above the cutoff.
# NOTE(review): the cutoff used here is 6 (percent), while the section heading
# mentions 30% - confirm which value is intended.
missing_df = missing_values_table(df2)
missing_columns = (
    missing_df
    .loc[missing_df['% What is the percentage of the full column'] > 6]
    .index
    .tolist()
)
print('We remove% d columns.' % len(missing_columns))

In [None]:
# Drop the columns selected above; `missing_columns` is already a list.
df2 = df2.drop(columns=missing_columns)

In [None]:
df2.shape

# 3.2. We can copy the data to a new DataFrame so as not to lose it.

In [None]:
# Take a real copy: `df3 = df2` would only bind a second name to the same
# object, so later changes made through either name would affect both.
df3 = df2.copy()

4 - Analyze the graphical appearance of the data (EDA)

In [None]:
import seaborn as sns

# Show how many rows fall into each value of `label`.
# `sns.distplot` is deprecated in seaborn (since 0.11) and slated for removal;
# a count plot is also the better fit for a discrete label column.
sns.countplot(x=df3['label'])

4.2. Correlation view

In [1]:
# Draw the pairwise correlation matrix of df3 as a heatmap on a large canvas.
# Requires the import cells above (pyplot, seaborn) to have been run first.
corr_fig, corr_ax = pyplot.subplots(figsize=(20, 15))
sns.heatmap(df3.corr(), ax=corr_ax)

NameError: ignored

In [None]:
# Create correlation matrix
df3.corr()

5 - Train the classification model (SVC)

In [None]:
# Target vector: the `label` column of the cleaned frame.
y = df3['label']

In [None]:
y.head()

In [None]:
# Feature matrix: every column except the target.
# `drop` returns a new frame here, so df3 itself still contains `label`.
x = df3.drop(columns=['label'])

In [None]:
# Balance the training classes by randomly over-sampling the under-represented
# class(es). A fixed random_state makes the resampled set - and therefore the
# fitted model - reproducible across runs.
ros = RandomOverSampler(random_state=42)
# Resample the features and the target together.
x_ros, y_ros = ros.fit_resample(x, y)
# Class distribution after resampling.
print(Counter(y_ros))

## 5.3. Create a Model ( SVC Model)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.svm import SVC

In [None]:
# Support-vector classifier created with scikit-learn's default hyperparameters.
model = SVC()

In [None]:
# Fit on the over-sampled training data (x_ros, y_ros).
model.fit(x_ros,y_ros)

### 6.3. Calculate the probability for 2020 and see the clear difference.

In [None]:
# Load the hold-out set and split it into target and features.
# NOTE(review): unlike the training frame, this frame does not go through the
# "-" -> NaN replacement, the float64 cast, or the computed column drop; only
# `Score_point` is removed by name. Confirm that the remaining columns match
# the ones the model was fitted on.
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data_test.csv')
y_test = data['label']
data = data.drop(columns=['Score_point'])
X_test = data.drop(columns=['label'])

In [None]:
# Over-sample the test set only to inspect its balanced class distribution.
# The result is stored under separate names so that the training resample
# (x_ros, y_ros) is not overwritten: re-running the fit cell afterwards would
# otherwise train the model on test data. The predictions below are made on the
# original X_test, not on this resampled copy.
ros_test = RandomOverSampler(random_state=42)
x_test_ros, y_test_ros = ros_test.fit_resample(X_test, y_test)
# Class distribution after resampling.
print(Counter(y_test_ros))

In [None]:
# Predict a label for every row of the (non-resampled) test features.
predictions = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Fraction of test rows whose predicted label equals the true label.
print(accuracy_score(y_test,predictions))

In [None]:
# Mean squared difference between the predicted and the true labels.
print("Mean squared error: %.2f" % np.mean((predictions - y_test) ** 2))
# For a classifier such as SVC, `score` returns the mean accuracy on the given
# data (not an explained-variance / R^2 score); 1 is a perfect prediction.
print('Accuracy: %.2f' % model.score(X_test, y_test))