In [1]:
import pandas as pd

# Adjust the file name if needed
df = pd.read_csv("german_credit_data.csv")

# See the shape and preview
print(df.shape)
df.head()


(1000, 10)


Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,0,67,male,2,own,,little,1169,6,radio/TV
1,1,22,female,2,own,little,moderate,5951,48,radio/TV
2,2,49,male,1,own,little,,2096,12,education
3,3,45,male,2,free,little,little,7882,42,furniture/equipment
4,4,53,male,2,free,little,little,4870,24,car


In [3]:
# Drop the "Unnamed: 0" column (just row index)
df.drop(columns=['Unnamed: 0'], inplace=True)

# Check for nulls
print("Missing values:\n", df.isnull().sum())

# Fill missing values (basic imputation)
df['Saving accounts'].fillna('none', inplace=True)
df['Checking account'].fillna('none', inplace=True)

# Encode categorical columns
from sklearn.preprocessing import LabelEncoder

cat_cols = df.select_dtypes(include='object').columns
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

df.head()


Missing values:
 Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
dtype: int64


Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,67,1,2,1,2,0,1169,6,5
1,22,0,2,1,0,1,5951,48,5
2,49,1,1,1,0,2,2096,12,3
3,45,1,2,0,0,0,7882,42,4
4,53,1,2,0,0,0,4870,24,1


In [5]:
# Simulate a target column 'Risk'
# Let's say high credit amount + long duration = bad risk
df['Risk'] = ((df['Credit amount'] > 5000) & (df['Duration'] > 24)).astype(int)

# 0 = Good Risk, 1 = Bad Risk

# Now split the dataset
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

X = df.drop('Risk', axis=1)
y = df['Risk']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and Evaluate
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[176   0]
 [  0  24]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       176
           1       1.00      1.00      1.00        24

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [7]:
#through XG Boost

from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Initialize XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Fit the model
xgb_model.fit(X_train, y_train)

# Predict
xgb_pred = xgb_model.predict(X_test)

# Evaluate
print(confusion_matrix(y_test, xgb_pred))
print(classification_report(y_test, xgb_pred))


[[176   0]
 [  0  24]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       176
           1       1.00      1.00      1.00        24

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [9]:
# Create a dataframe with predictions and original values
results_df = X_test.copy()
results_df['Actual Risk'] = y_test
results_df['Predicted Risk (XGB)'] = xgb_pred

# Reset index to clean up
results_df.reset_index(drop=True, inplace=True)

# Optionally, map risk values for readability (0 = Good, 1 = Bad)
results_df['Actual Risk'] = results_df['Actual Risk'].map({0: 'Good', 1: 'Bad'})
results_df['Predicted Risk (XGB)'] = results_df['Predicted Risk (XGB)'].map({0: 'Good', 1: 'Bad'})

# Export to CSV for Tableau
results_df.to_csv("xgb_credit_risk_results.csv", index=False)
print("✅ CSV file exported: xgb_credit_risk_results.csv")


✅ CSV file exported: xgb_credit_risk_results.csv


In [15]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import joblib

# Load the encoded CSV file
df_encoded = pd.read_csv('xgb_credit_risk_results.csv')

# Load the original dataset used before encoding
df_original = pd.read_csv('german_credit_data.csv')  # Replace with your original filename

# Drop index column if present
if 'Unnamed: 0' in df_original.columns:
    df_original.drop(columns=['Unnamed: 0'], inplace=True)

# Identify categorical columns
cat_cols = df_original.select_dtypes(include='object').columns

# Step 1: Fit label encoders on the original data
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    le.fit(df_original[col])
    label_encoders[col] = le

# Step 2: Reverse encode categorical columns
for col in cat_cols:
    if col in df_encoded.columns:
        le = label_encoders[col]
        try:
            df_encoded[col] = le.inverse_transform(df_encoded[col])
        except:
            pass  # If already text, skip

# Step 3: Map Predicted Risk (XGB) back to 'good' and 'bad'
if 'Predicted Risk (XGB)' in df_encoded.columns:
    df_encoded['Predicted Risk (XGB)'] = df_encoded['Predicted Risk (XGB)'].map({0: 'good', 1: 'bad'})

# Step 4: Export the human-readable CSV
df_encoded.to_csv('xgb_credit_risk_results_human_readable.csv', index=False)
print("✅ Exported as 'xgb_credit_risk_results_human_readable.csv'")



✅ Exported as 'xgb_credit_risk_results_human_readable.csv'


In [17]:
import pandas as pd

# Adjust the file name if needed
dfd = pd.read_csv("xgb_credit_risk_results_human_readable.csv")

# See the shape and preview
print(dfd.shape)
dfd.head()

(200, 11)


Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Actual Risk,Predicted Risk (XGB)
0,24,female,2,own,little,little,3190,18,radio/TV,Good,
1,35,male,1,own,moderate,little,4380,18,car,Good,
2,32,male,2,own,moderate,little,2325,24,car,Good,
3,23,male,2,rent,little,,1297,12,radio/TV,Good,
4,35,male,3,own,little,rich,7253,33,car,Bad,
