| Column           | Description                               | Type        | Notes                               |
| ---------------- | ----------------------------------------- | ----------- | ----------------------------------- |
| Status\_Checking | Status of checking account                | Categorical | Values: A11, A12, A13, A14          |
| Duration         | Duration of credit (months)               | Numeric     | Range: 4–72                         |
| CreditAmount     | Loan amount                               | Numeric     | Range: 250–18424                    |
| Purpose          | Loan purpose                              | Categorical | Values: car, radio, education, etc. |
| Age              | Age in years                              | Numeric     | Range: 19–75                        |
| ...              | ...                                       | ...         | ...                                 |
| Default          | Target variable (1=default, 0=no default) | Binary      | Target                              |


In [35]:


import pandas as pd

# Load the raw German credit file into `df`.
# Fields are separated by runs of whitespace and there is no header row, so
# pandas assigns integer column labels.
# `sep=r"\s+"` is the documented equivalent of the deprecated
# `delim_whitespace=True`, which emits a FutureWarning on recent pandas.
df = pd.read_csv(
    "../data/raw/german.data",
    sep=r"\s+",     # one or more whitespace characters between fields
    header=None,    # first line is data, not column names
)

print(df.shape)
print("Head")
print(df.head())
print("\nInfo")
# info() writes its report to stdout itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
df.info()
print("\nDescribe")
print(df.describe())


(1000, 21)
Head
    0   1    2    3     4    5    6   7    8     9   ...    11  12    13  \
0  A11   6  A34  A43  1169  A65  A75   4  A93  A101  ...  A121  67  A143   
1  A12  48  A32  A43  5951  A61  A73   2  A92  A101  ...  A121  22  A143   
2  A14  12  A34  A46  2096  A61  A74   2  A93  A101  ...  A121  49  A143   
3  A11  42  A32  A42  7882  A61  A74   2  A93  A103  ...  A122  45  A143   
4  A11  24  A33  A40  4870  A61  A73   3  A93  A101  ...  A124  53  A143   

     14 15    16 17    18    19 20  
0  A152  2  A173  1  A192  A201  1  
1  A152  1  A173  1  A191  A201  2  
2  A152  1  A172  2  A191  A201  1  
3  A153  1  A173  2  A191  A201  1  
4  A153  2  A173  2  A191  A201  2  

[5 rows x 21 columns]

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       1000 non-null   object
 1   1       1000 non-null   int64 
 2   2       1000 non-null

  df = pd.read_csv("../data/raw/german.data",


In [36]:
# Descriptive names for the 21 positional columns, in file order.
# The trailing comment on each entry is the integer label it replaces.
columns = [
    'Status_of_existing_checking_account',                       # 0
    'Duration_in_month',                                         # 1
    'Credit_history',                                            # 2
    'Purpose',                                                   # 3
    'Credit_amount',                                             # 4
    'Savings_account_bonds',                                     # 5
    'Present_employment_since',                                  # 6
    'Installment_rate_in_percentage_of_disposable_income',       # 7
    'Personal_status_and_sex',                                   # 8
    'Other_debtors_guarantors',                                  # 9
    'Present_residence_since',                                   # 10
    'Property',                                                  # 11
    'Age_in_years',                                              # 12
    'Other_installment_plans',                                   # 13
    'Housing',                                                   # 14
    'Number_of_existing_credits_at_this_bank',                   # 15
    'Job',                                                       # 16
    'Number_of_people_being_liable_to_provide_maintenance_for',  # 17
    'Telephone',                                                 # 18
    'Foreign_worker',                                            # 19
    'Target',                                                    # 20 (label column)
]

# Swap the integer column labels for the names above.
df.columns = columns


In [37]:
# Human-readable labels for the Purpose codes, following the attribute-4 code
# list in the UCI Statlog (German Credit Data) documentation. Note that the
# list runs A40..A49 and then A410; A47 is documented as "vacation".
purpose_map = {
    'A40': 'Car (new)',
    'A41': 'Car (used)',
    'A42': 'Furniture',
    'A43': 'Radio/TV',
    'A44': 'Appliances',
    'A45': 'Repairs',
    'A46': 'Education',
    'A47': 'Vacation',
    'A48': 'Retraining',
    'A49': 'Business',
    'A410': 'Other',
}

# replace() only touches values that are keys of the dict, so re-running this
# cell on already-labelled data is a no-op; map() would turn every value that
# is not a key (including labels from a previous run) into NaN.
df['Purpose'] = df['Purpose'].replace(purpose_map)

# Fail loudly if any raw code was left untranslated.
assert df['Purpose'].isin(list(purpose_map.values())).all(), "unmapped Purpose code"


In [38]:
# Show the DataFrame's current column labels.
print(df.columns)


Index(['Status_of_existing_checking_account', 'Duration_in_month',
       'Credit_history', 'Purpose', 'Credit_amount', 'Savings_account_bonds',
       'Present_employment_since',
       'Installment_rate_in_percentage_of_disposable_income',
       'Personal_status_and_sex', 'Other_debtors_guarantors',
       'Present_residence_since', 'Property', 'Age_in_years',
       'Other_installment_plans', 'Housing',
       'Number_of_existing_credits_at_this_bank', 'Job',
       'Number_of_people_being_liable_to_provide_maintenance_for', 'Telephone',
       'Foreign_worker', 'Target'],
      dtype='object')


In [39]:

# Count how many rows carry each distinct Target value.
df['Target'].value_counts()


Target
1    700
2    300
Name: count, dtype: int64

In [40]:
# Recode the target from the raw 1/2 coding to 0/1: 1 -> 0 and 2 -> 1.
# (NOTE(review): the UCI documentation describes 1 as "good" and 2 as "bad",
# so 1 after recoding should mean a bad/default loan -- confirm.)
#
# A 2 can only be present while the column still holds the raw coding, so the
# guard makes this cell safe to re-run. Without it, a second execution would
# apply {1: 0} to already-recoded data and collapse every label to 0.
if df["Target"].eq(2).any():
    df["Target"] = df["Target"].replace({1: 0, 2: 1})

# quick check
assert set(df["Target"].unique()) <= {0, 1}, "Target must be binary 0/1 after recoding"
print(df["Target"].value_counts())


Target
0    700
1    300
Name: count, dtype: int64
