In [None]:
"""
When a bank receives a loan application, 
based on the applicant’s profile the bank has to make a decision 
regarding whether to go ahead with the loan approval or not. 
Two types of risks are associated with the bank’s decision.

If the applicant is a good credit risk, i.e. is likely to repay the loan, 
then not approving the loan to the person results in a loss of business to the bank.

If the applicant is a bad credit risk, i.e. is not likely to repay the loan, 
then approving the loan to the person results in a financial loss to the bank.

The predictors that may potentially have any influence on Creditability:

Account Balance: 
    No account (1), 
    None (No balance) (2), 
    Below 200 DM (3),
    200 DM or Above (4)

Payment Status: 
    0 : Delayed
    1 : Other Credits, 
    2 : Paid Up,
    3 : No Problem with Current Credits,
    4 : Previous Credits Paid

Savings/Stock Value: 
    None, 
    Below 100 DM, 
    [100, 1000] DM, 
    Above 1000 DM

Employment Length (years): 
    Below 1 (including unemployed), 
    [1, 4), 
    [4, 7), 
    Above 7

Sex/Marital Status: 
    Male Divorced/Single, 
    Male Married/Widowed, 
    Female 

No of Credits at this bank: 1, More than 1

Guarantor: None, Yes

Concurrent Credits: Other Banks or Dept Stores, None

ForeignWorker variable may be dropped from the study

Purpose of Credit: New car, Used car, Home Related

"""

In [5]:
# Load the German credit data into a DataFrame with pandas.
import pandas as pd

# The CSV is semicolon-separated (as noted alongside the file), not comma-separated.
GERMAN_CSV_PATH = '/kaggle/input/datasets/rishabkumarkggl/german-csv/german.csv'
dataset = pd.read_csv(GERMAN_CSV_PATH, delimiter=';')

# Preview the first rows as the cell output.
dataset.head()

Unnamed: 0,Creditability,Account_Balance,Duration_of_Credit_monthly,Payment_Status_of_Previous_Credit,Purpose,Credit_Amount,Value_Savings_Stocks,Length_of_current_employment,Instalment_per_cent,Sex_Marital_Status,...,Duration_in_Current_address,Most_valuable_available_asset,Age_years,Concurrent_Credits,Type_of_apartment,No_of_Credits_at_this_Bank,Occupation,No_of_dependents,Telephone,Foreign_Worker
0,1,1,18,4,2,1049,1,2,4,2,...,4,2,21,3,1,1,3,1,1,1
1,1,1,9,4,0,2799,1,3,2,3,...,2,1,36,3,1,2,3,2,1,1
2,1,2,12,2,9,841,2,4,2,2,...,4,1,23,3,1,1,2,1,1,1
3,1,1,12,4,0,2122,1,3,3,3,...,2,1,39,3,1,2,2,2,1,2
4,1,1,12,4,0,2171,1,3,4,3,...,4,2,38,1,2,2,2,1,1,2


In [6]:
# Print a per-column summary (non-null count and dtype) of the loaded frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   Creditability                      1000 non-null   int64
 1   Account_Balance                    1000 non-null   int64
 2   Duration_of_Credit_monthly         1000 non-null   int64
 3   Payment_Status_of_Previous_Credit  1000 non-null   int64
 4   Purpose                            1000 non-null   int64
 5   Credit_Amount                      1000 non-null   int64
 6   Value_Savings_Stocks               1000 non-null   int64
 7   Length_of_current_employment       1000 non-null   int64
 8   Instalment_per_cent                1000 non-null   int64
 9   Sex_Marital_Status                 1000 non-null   int64
 10  Guarantors                         1000 non-null   int64
 11  Duration_in_Current_address        1000 non-null   int64
 12  Most_valuable_availab

In [7]:
# Decode the numeric 'Account_Balance' codes into readable labels:
#   1 -> 'No Account'
#   2 -> 'None'
#   3 -> 'Below 200 DM'
#   4 -> '200 DM or Above'
account_balance_labels = {
    1: 'No Account',
    2: 'None',
    3: 'Below 200 DM',
    4: '200 DM or Above',
}

# The nested {column: {old: new}} form restricts the replacement to that column.
dataset = dataset.replace(to_replace={'Account_Balance': account_balance_labels})

In [36]:
import plotly.express as px

# Frequency table of account-balance categories: one row per category.
account_bal_counts = (
    dataset['Account_Balance']
    .value_counts()
    .rename_axis('Account_Balance_Category')
    .reset_index(name='Frequency')
)

# Bar chart of how many applicants fall into each category.
px.bar(
    data_frame=account_bal_counts,
    x='Account_Balance_Category',
    y='Frequency',
)

In [35]:
# Decode 'Payment_Status_of_Previous_Credit' codes into readable labels:
#   0 -> 'Delayed'
#   1 -> 'Other Credits'
#   2 -> 'Paid Up'
#   3 -> 'No Problem with Current Credits'
#   4 -> 'Previous Credits Paid'
payment_status_labels = {
    0: 'Delayed',
    1: 'Other Credits',
    2: 'Paid Up',
    3: 'No Problem with Current Credits',
    4: 'Previous Credits Paid',
}

# Only the named column is touched; other columns keep their numeric codes.
dataset = dataset.replace(
    to_replace={'Payment_Status_of_Previous_Credit': payment_status_labels}
)

In [38]:
# Frequency table of previous-credit payment status (one row per status).
prev_cred_payment_satus_count = (
    dataset['Payment_Status_of_Previous_Credit']
    .value_counts()
    .rename_axis('Payment_Status_of_Previous_Credit')
    .reset_index(name='Frequency')
)

# Bar chart of applicants per payment status.
px.bar(
    data_frame=prev_cred_payment_satus_count,
    x='Payment_Status_of_Previous_Credit',
    y='Frequency',
)

In [39]:
# Decode 'Value_Savings_Stocks' codes into savings-band labels.
savings_labels = {
    1: 'None',
    2: 'Below 100 DM',
    3: '[100, 500)',
    4: '[500, 1000)',
    5: 'Above 1000',
}

# Column-scoped replacement: only 'Value_Savings_Stocks' is rewritten.
dataset = dataset.replace(to_replace={'Value_Savings_Stocks': savings_labels})

In [40]:
# Frequency table of savings/stock value bands.
value_saving_stocks_count = (
    dataset['Value_Savings_Stocks']
    .value_counts()
    .rename_axis('Value_Savings_Stocks')
    .reset_index(name='Frequency')
)

# Bar chart of applicants per savings band.
px.bar(
    data_frame=value_saving_stocks_count,
    x='Value_Savings_Stocks',
    y='Frequency',
)


In [41]:
# Decode 'Length_of_current_employment' codes into tenure-band labels.
employment_length_labels = {
    1: 'Unemployed',
    2: '<1 Year',
    3: '[1, 4)',
    4: '[4, 7)',
    5: 'Above 7',
}

# Column-scoped replacement: only 'Length_of_current_employment' is rewritten.
dataset = dataset.replace(
    to_replace={'Length_of_current_employment': employment_length_labels}
)

In [44]:

# Frequency table of current-employment length bands.
current_employment_count = (
    dataset['Length_of_current_employment']
    .value_counts()
    .rename_axis('Length_of_current_employment')
    .reset_index(name='Frequency')
)

# Bar chart of applicants per employment-length band.
px.bar(
    data_frame=current_employment_count,
    x='Length_of_current_employment',
    y='Frequency',
)

In [45]:
# Decode 'Instalment_per_cent' codes into percentage-band labels.
instalment_labels = {
    1: 'Above 35%',
    2: '(25%, 35%)',
    3: '[20%, 25%)',
    4: 'Below 20%',
}

# Column-scoped replacement: only 'Instalment_per_cent' is rewritten.
dataset = dataset.replace(to_replace={'Instalment_per_cent': instalment_labels})

In [48]:
# Frequency table of instalment-percentage bands.
instalment_count = (
    dataset['Instalment_per_cent']
    .value_counts()
    .rename_axis('Instalment_per_cent')
    .reset_index(name='Frequency')
)

# Pie chart: one slice per band, sized by its frequency.
px.pie(
    data_frame=instalment_count,
    values='Frequency',
    names='Instalment_per_cent',
)


In [49]:
# Decode 'Occupation' codes into job-category labels.
occupation_labels = {
    1: 'Unemployed, unskilled',
    2: 'Unskilled Permanent Resident',
    3: 'Skilled',
    4: 'Executive',
}

# Column-scoped replacement: only 'Occupation' is rewritten.
dataset = dataset.replace(to_replace={'Occupation': occupation_labels})

In [50]:
# Frequency table of occupation categories.
occupation_count = (
    dataset['Occupation']
    .value_counts()
    .rename_axis('Occupation')
    .reset_index(name='Frequency')
)

# Pie chart: one slice per occupation, sized by its frequency.
px.pie(
    data_frame=occupation_count,
    names='Occupation',
    values='Frequency',
)


In [51]:
# Decode 'Sex_Marital_Status' codes into readable labels.
sex_marital_labels = {
    1: 'Male, Divorced',
    2: 'Male, Single',
    3: 'Male, Married/Widowed',
    4: 'Female',
}

# Column-scoped replacement: only 'Sex_Marital_Status' is rewritten.
dataset = dataset.replace(to_replace={'Sex_Marital_Status': sex_marital_labels})

In [52]:
# Frequency table of sex / marital-status categories.
sex_marital_count = (
    dataset['Sex_Marital_Status']
    .value_counts()
    .rename_axis('Sex_Marital_Status')
    .reset_index(name='Frequency')
)

# Pie chart: one slice per category, sized by its frequency.
px.pie(
    data_frame=sex_marital_count,
    names='Sex_Marital_Status',
    values='Frequency',
)


In [53]:
# Decode 'Duration_in_Current_address' codes into residence-duration bands.
address_duration_labels = {
    1: '<1 Year',
    2: '[1, 4)',
    3: '[4, 7)',
    4: 'Above 7',
}

# Column-scoped replacement: only 'Duration_in_Current_address' is rewritten.
dataset = dataset.replace(
    to_replace={'Duration_in_Current_address': address_duration_labels}
)

In [54]:
# Frequency table of time-at-current-address bands.
current_add_duration_count = (
    dataset['Duration_in_Current_address']
    .value_counts()
    .rename_axis('Duration_in_Current_address')
    .reset_index(name='Frequency')
)

# Bar chart of applicants per residence-duration band.
px.bar(
    data_frame=current_add_duration_count,
    x='Duration_in_Current_address',
    y='Frequency',
)


In [55]:
# Decode 'Type_of_apartment' codes into housing-type labels.
apartment_labels = {
    1: 'Free',
    2: 'Rented',
    3: 'Owned',
}

# Column-scoped replacement: only 'Type_of_apartment' is rewritten.
dataset = dataset.replace(to_replace={'Type_of_apartment': apartment_labels})

In [56]:
# Frequency table of apartment types.
appartment_type_count = (
    dataset['Type_of_apartment']
    .value_counts()
    .rename_axis('Type_of_apartment')
    .reset_index(name='Frequency')
)

# Pie chart: one slice per apartment type, sized by its frequency.
px.pie(
    data_frame=appartment_type_count,
    names='Type_of_apartment',
    values='Frequency',
)

In [58]:
# Decode 'Most_valuable_available_asset' codes into asset-type labels.
asset_labels = {
    1: 'None',
    2: 'Car',
    3: 'Life Insurance',
    4: 'Real Estate',
}

# Column-scoped replacement: only 'Most_valuable_available_asset' is rewritten.
dataset = dataset.replace(
    to_replace={'Most_valuable_available_asset': asset_labels}
)

In [59]:

# Frequency table of most-valuable-asset categories.
most_valuable_asset_count = (
    dataset['Most_valuable_available_asset']
    .value_counts()
    .rename_axis('Most_valuable_available_asset')
    .reset_index(name='Frequency')
)

# Pie chart: one slice per asset type, sized by its frequency.
px.pie(
    data_frame=most_valuable_asset_count,
    values='Frequency',
    names='Most_valuable_available_asset',
)

In [60]:
# Decode 'No_of_Credits_at_this_Bank' codes into count-band labels.
# Code 1 becomes the string '1', so the column ends up holding labels only.
credits_at_bank_labels = {
    1: '1',
    2: '2 or 3',
    3: '4 or 5',
    4: 'Above 6',
}

# Column-scoped replacement: only 'No_of_Credits_at_this_Bank' is rewritten.
dataset = dataset.replace(
    to_replace={'No_of_Credits_at_this_Bank': credits_at_bank_labels}
)

In [61]:
# Frequency table of number-of-credits bands.
total_credits_count = (
    dataset['No_of_Credits_at_this_Bank']
    .value_counts()
    .rename_axis('No_of_Credits_at_this_Bank')
    .reset_index(name='Frequency')
)

# Bar chart of applicants per number-of-credits band.
px.bar(
    data_frame=total_credits_count,
    x='No_of_Credits_at_this_Bank',
    y='Frequency',
)

In [62]:
# Decode 'Guarantors' codes into readable labels.
guarantor_labels = {
    1: 'None',
    2: 'Co-applicant',
    3: 'Guarantor',
}

# Column-scoped replacement: only 'Guarantors' is rewritten.
dataset = dataset.replace(to_replace={'Guarantors': guarantor_labels})

In [65]:

# Frequency table of guarantor categories (value_counts sorts most-frequent first).
guarantors_count = (
    dataset['Guarantors']
    .value_counts()
    .rename_axis('Guarantors')
    .reset_index(name='Frequency')
)
#px.bar(guarantors_count, x='Guarantors',y='Frequency')

# Skip row 0 (the most frequent category) so the remaining categories are
# visible in the pie. NOTE(review): presumably row 0 is 'None' -- confirm.
px.pie(
    data_frame=guarantors_count.iloc[1:],
    values='Frequency',
    names='Guarantors',
)

In [67]:
# Decode 'Concurrent_Credits' codes into readable labels.
concurrent_credit_labels = {
    1: 'Other Banks',
    2: 'Dept. Store',
    3: 'None',
}

# Column-scoped replacement: only 'Concurrent_Credits' is rewritten.
dataset = dataset.replace(
    to_replace={'Concurrent_Credits': concurrent_credit_labels}
)

In [69]:

# Frequency table of concurrent-credit categories (most-frequent first).
concur_credits_count = (
    dataset['Concurrent_Credits']
    .value_counts()
    .rename_axis('Concurrent_Credits')
    .reset_index(name='Frequency')
)
#px.bar(concur_credits_count, x='Concurrent_Credits',y='Frequency')

# Skip row 0 (the most frequent category) so the remaining categories are
# visible in the pie. NOTE(review): presumably row 0 is 'None' -- confirm.
px.pie(
    data_frame=concur_credits_count.iloc[1:],
    values='Frequency',
    names='Concurrent_Credits',
)

In [48]:
# Decode 'No_of_dependents' codes into readable labels.
# NOTE(review): code 1 -> '3 or More' and code 2 -> 'Less than 3' follows the
# mapping used here; confirm the direction against the dataset's codebook.
dependents_labels = {
    1: '3 or More',
    2: 'Less than 3',
}

# Column-scoped replacement: only 'No_of_dependents' is rewritten.
dataset = dataset.replace(to_replace={'No_of_dependents': dependents_labels})

3 or More      845
Less than 3    155
Name: No_of_dependents, dtype: int64

In [71]:

# Frequency table of dependents categories: one row per category with its count.
dependents_count = (
    dataset['No_of_dependents']
    .value_counts()
    .rename_axis('No_of_dependents')
    .reset_index(name='Frequency')
)

# Slice sizes must come from the numeric 'Frequency' column and slice labels
# from the category column. The original call had `values` and `names`
# swapped, which pointed `values` at the string labels.
px.pie(dependents_count, values='Frequency', names='No_of_dependents')


In [56]:
# Decode 'Telephone' codes into 'No' / 'Yes'.
telephone_labels = {
    1: 'No',
    2: 'Yes',
}

# Column-scoped replacement: only 'Telephone' is rewritten.
dataset = dataset.replace(to_replace={'Telephone': telephone_labels})

No     596
Yes    404
Name: Telephone, dtype: int64

In [72]:

# Frequency table of telephone ownership.
telephone_count = (
    dataset['Telephone']
    .value_counts()
    .rename_axis('Telephone')
    .reset_index(name='Frequency')
)

# Pie chart: one slice per answer, sized by its frequency.
px.pie(
    data_frame=telephone_count,
    names='Telephone',
    values='Frequency',
)


In [74]:
# Decode 'Foreign_Worker' codes into 'No' / 'Yes'.
# NOTE(review): confirm the 1 -> 'No', 2 -> 'Yes' direction against the codebook.
foreign_worker_labels = {
    1: 'No',
    2: 'Yes',
}

# Column-scoped replacement: only 'Foreign_Worker' is rewritten.
dataset = dataset.replace(to_replace={'Foreign_Worker': foreign_worker_labels})

In [75]:


# Frequency table of foreign-worker status.
Foreign_Worker_count = (
    dataset['Foreign_Worker']
    .value_counts()
    .rename_axis('Foreign_Worker')
    .reset_index(name='Frequency')
)

# Pie chart: one slice per answer, sized by its frequency.
px.pie(
    data_frame=Foreign_Worker_count,
    names='Foreign_Worker',
    values='Frequency',
)

In [77]:
# Decode 'Purpose' codes into loan-purpose labels.
# Code 7 has no entry here, so any row carrying 7 keeps its numeric value.
purpose_labels = {
    0: 'Other',
    1: 'New Car',
    2: 'Used Car',
    3: 'Furniture',
    4: 'Radio/TV',
    5: 'Appliances',
    6: 'Repair',
    8: 'Vacation',
    9: 'Retraining',
    10: 'Business',
}

# Column-scoped replacement: only 'Purpose' is rewritten.
dataset = dataset.replace(to_replace={'Purpose': purpose_labels})

In [78]:


# Frequency table of loan purposes.
Purpose_count = (
    dataset['Purpose']
    .value_counts()
    .rename_axis('Purpose')
    .reset_index(name='Frequency')
)

# Bar chart of applicants per loan purpose.
px.bar(
    data_frame=Purpose_count,
    x='Purpose',
    y='Frequency',
)

In [61]:
# dataset.to_csv('dataset_for_eda.csv')