In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Load the dataset
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Fiverr/sachinnad123/german_credit_data.csv")

data.head()

Unnamed: 0,Checking account,Duration,Credit history,Purpose,Credit amount,Savings account/bonds,Employment,Installment rate,Personal status,Other debtors / guarantors,...,Property,Age,Other installment plans,Housing,credits,Job,people liable,Telephone,foreign worker,credit risk status
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Checking account            1000 non-null   object
 1   Duration                    1000 non-null   int64 
 2   Credit history              1000 non-null   object
 3   Purpose                     1000 non-null   object
 4   Credit amount               1000 non-null   int64 
 5   Savings account/bonds       1000 non-null   object
 6   Employment                  1000 non-null   object
 7   Installment rate            1000 non-null   int64 
 8   Personal status             1000 non-null   object
 9   Other debtors / guarantors  1000 non-null   object
 10  Present residence since     1000 non-null   int64 
 11  Property                    1000 non-null   object
 12  Age                         1000 non-null   int64 
 13  Other installment plans     1000 non-null   objec

# Requirement R1: Data Pre-processing

In [14]:
# Check for missing values
missing_values = data.isnull().sum()
print("Missing values:\n", missing_values)

# Impute missing values with mean/mode as appropriate
data.fillna(data.mean(), inplace=True)  # Replace missing numeric values with mean

# min-max normalization on numeric features
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns
scaler = MinMaxScaler()
data[numeric_features] = scaler.fit_transform(data[numeric_features])

Missing values:
 Checking account              0
Duration                      0
Credit history                0
Purpose                       0
Credit amount                 0
Savings account/bonds         0
Employment                    0
Installment rate              0
Personal status               0
Other debtors / guarantors    0
Present residence since       0
Property                      0
Age                           0
Other installment plans       0
Housing                       0
credits                       0
Job                           0
people  liable                0
Telephone                     0
foreign worker                0
credit risk status            0
dtype: int64


  data.fillna(data.mean(), inplace=True)  # Replace missing numeric values with mean


In [15]:
import pandas as pd
from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import LabelEncoder

# Separate features and target
X = data.drop(columns=['credit risk status'])
y = data['credit risk status']

# Encode categorical variables
label_encoders = {}
for column in X.columns:
    if X[column].dtype == 'object':
        label_encoders[column] = LabelEncoder()
        X[column] = label_encoders[column].fit_transform(X[column])

# Identify categorical columns for SMOTENC
categorical_columns = [i for i, col in enumerate(X.columns) if X[col].dtype == 'int64']

# Apply SMOTENC
smote_nc = SMOTENC(categorical_features=categorical_columns, random_state=42)
X_resampled, y_resampled = smote_nc.fit_resample(X, y)

# Convert back to DataFrame
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)

# Decode categorical variables
for column in X_resampled_df.columns:
    if column in label_encoders:
        X_resampled_df[column] = label_encoders[column].inverse_transform(X_resampled_df[column])

# Check the shape of the resampled data
print("Shape of resampled data:", X_resampled_df.shape)

# Check the class distribution
print("Class distribution after SMOTENC:")
print(y_resampled.value_counts())


Shape of resampled data: (1400, 20)
Class distribution after SMOTENC:
0.0    700
1.0    700
Name: credit risk status, dtype: int64


In [16]:

# one-hot encoding on categorical features
# data = pd.get_dummies(data, drop_first=True)  # drop_first=True to avoid dummy variable trap

# Split data into features and target variable
# X = data.drop(columns=['credit risk status'])
# y = data['credit risk status']

# Split data into train/validation and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
data

Unnamed: 0,Duration,Credit amount,Installment rate,Present residence since,Age,credits,people liable,credit risk status,Checking account_A12,Checking account_A13,...,Property_A124,Other installment plans_A142,Other installment plans_A143,Housing_A152,Housing_A153,Job_A172,Job_A173,Job_A174,Telephone_A192,foreign worker_A202
0,0.029412,0.050567,1.000000,1.000000,0.857143,0.333333,0.0,0.0,0,0,...,0,0,1,1,0,0,1,0,1,0
1,0.647059,0.313690,0.333333,0.333333,0.053571,0.000000,0.0,1.0,1,0,...,0,0,1,1,0,0,1,0,0,0
2,0.117647,0.101574,0.333333,0.666667,0.535714,0.000000,1.0,0.0,0,0,...,0,0,1,1,0,1,0,0,0,0
3,0.558824,0.419941,0.333333,1.000000,0.464286,0.000000,1.0,0.0,0,0,...,0,0,1,0,1,0,1,0,0,0
4,0.294118,0.254209,0.666667,1.000000,0.607143,0.333333,1.0,1.0,0,0,...,1,0,1,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.117647,0.081765,0.666667,1.000000,0.214286,0.000000,0.0,0.0,0,0,...,0,0,1,1,0,1,0,0,0,0
996,0.382353,0.198470,1.000000,1.000000,0.375000,0.000000,0.0,0.0,0,0,...,0,0,1,1,0,0,0,1,1,0
997,0.117647,0.030483,1.000000,1.000000,0.339286,0.000000,0.0,0.0,0,0,...,0,0,1,1,0,0,1,0,0,0
998,0.602941,0.087763,1.000000,1.000000,0.071429,0.000000,0.0,1.0,0,0,...,1,0,1,0,1,0,1,0,1,0


In [5]:
data['credit risk status'].value_counts()

0.0    700
1.0    300
Name: credit risk status, dtype: int64

# Requirement R2:  Linear Perceptron Model

In [6]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import f1_score

# Train linear perceptron model
perceptron_model = Perceptron()
perceptron_model.fit(X_train_val, y_train_val)

# Predict on test set
y_pred = perceptron_model.predict(X_test)

# Calculate F1 score for each class
f1_scores = f1_score(y_test, y_pred, average=None)
print("F1 Score for each class:", f1_scores)


F1 Score for each class: [0.79850746 0.59090909]


# Apply SMOTENC

In [18]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Assuming X_resampled_df is the resampled DataFrame from the SMOTENC step

# Convert categorical variables to numerical format
label_encoders = {}
for column in X_resampled_df.columns:
    if X_resampled_df[column].dtype == 'object':
        label_encoders[column] = LabelEncoder()
        X_resampled_df[column] = label_encoders[column].fit_transform(X_resampled_df[column])

# Split the resampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled_df, y_resampled, test_size=0.2, random_state=42)

# Train linear perceptron model
perceptron_model = Perceptron()
perceptron_model.fit(X_train, y_train)

# Predict on test set
y_pred = perceptron_model.predict(X_test)

# Calculate F1 score for each class
f1_scores = f1_score(y_test, y_pred, average=None)
print("F1 Score for each class after SMOTENC and Perceptron model training:")
print(f1_scores)


F1 Score for each class after SMOTENC and Perceptron model training:
[0.6640625  0.71710526]


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 49 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Duration                         1000 non-null   float64
 1   Credit amount                    1000 non-null   float64
 2   Installment rate                 1000 non-null   float64
 3   Present residence since          1000 non-null   float64
 4   Age                              1000 non-null   float64
 5   credits                          1000 non-null   float64
 6   people  liable                   1000 non-null   float64
 7   credit risk status               1000 non-null   float64
 8   Checking account_A12             1000 non-null   uint8  
 9   Checking account_A13             1000 non-null   uint8  
 10  Checking account_A14             1000 non-null   uint8  
 11  Credit history_A31               1000 non-null   uint8  
 12  Credit history_A32   

# Requirement R3: Apply SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_val, y_train_val)

# Train linear perceptron model on oversampled dataset
perceptron_model_resampled = Perceptron()
perceptron_model_resampled.fit(X_train_resampled, y_train_resampled)

# Predict on test set
y_pred_resampled = perceptron_model_resampled.predict(X_test)

# Calculate F1 score for each class
f1_scores_resampled = f1_score(y_test, y_pred_resampled, average=None)
print("F1 Score for each class after SMOTE:", f1_scores_resampled)


F1 Score for each class after SMOTE: [0.83443709 0.48979592]


# Requirement R4: Compare F Values

F1 Score for original: [0.79850746 0.59090909]

F1 Score for after SMOTE: [0.83443709 0.48979592]

F1 Score has for one class and decrease for aother class when oversampling

# Requirement R5: Identify Least Significant Predictors

In [None]:
# Identify least significant predictors
coefficients = perceptron_model_resampled.coef_
feature_importance = pd.Series(coefficients[0], index=X_train_resampled.columns)
least_significant_predictors = feature_importance.abs().nsmallest(2).index

# Drop least significant predictors
X_train_resampled.drop(columns=least_significant_predictors, inplace=True)
X_test.drop(columns=least_significant_predictors, inplace=True)

# Retrain linear perceptron model
perceptron_model_updated = Perceptron()
perceptron_model_updated.fit(X_train_resampled, y_train_resampled)

# Predict on test set
y_pred_updated = perceptron_model_updated.predict(X_test)

# Calculate F1 score for each class
f1_scores_updated = f1_score(y_test, y_pred_updated, average=None)
print("F1 Score for each class after dropping least significant predictors:", f1_scores_updated)


F1 Score for each class after dropping least significant predictors: [0.80442804 0.58914729]


# Requirement R6: Identify Most Significant Predictors

In [None]:
# Identify most significant predictors
most_significant_predictors = feature_importance.abs().nlargest(2).index

# Display the most significant predictors
print("Most significant predictors:", most_significant_predictors)


Most significant predictors: Index(['Credit amount', 'Checking account_A14'], dtype='object')


These two have the highest feature importance value

# Requirement R7: Tune SMOTE Parameters

In [None]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

# Define pipeline
pipeline = Pipeline([
    ('sampling', SMOTE()),
    ('perceptron', Perceptron())
])

# Define parameters for grid search
parameters = {
    'sampling__sampling_strategy': ['auto', 'minority', 'not minority'],
    'sampling__k_neighbors': [3, 5, 7]
}

# Perform grid search
grid_search = GridSearchCV(pipeline, parameters, cv=3, scoring='f1_macro')
grid_search.fit(X_train_val, y_train_val)

# Display optimal combination of parameters
print("Optimal combination of (sampling strategy, k_neighbors):", grid_search.best_params_)

# Generate new perceptron model with optimal parameters
perceptron_model_tuned = grid_search.best_estimator_
perceptron_model_tuned.fit(X_train_resampled, y_train_resampled)

# Predict on test set
y_pred_tuned = perceptron_model_tuned.predict(X_test)

# Calculate F1 score for each class
f1_scores_tuned = f1_score(y_test, y_pred_tuned, average=None)
print("F1 Score for each class after tuning:", f1_scores_tuned)


Optimal combination of (sampling strategy, k_neighbors): {'sampling__k_neighbors': 3, 'sampling__sampling_strategy': 'auto'}
F1 Score for each class after tuning: [0.80442804 0.58914729]
