#### MIS 545 
##### German Credit History

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

#### Loading the Dataset and Initial Check

In [4]:
# Location of the raw German Credit CSV (absolute path on the author's machine)
raw_csv_path = r'C:\Users\ual-laptop\Desktop\MIS\MIS 545\FinalProject\GermanCredit.csv'
credit_data = pd.read_csv(raw_csv_path)

# Preview the top rows of the loaded frame
credit_data.head()

Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
0,... < 100 DM,6,critical account/other credits existing,domestic appliances,1169,unknown/no savings account,... >= 7 years,4,male : single,none,...,real estate,67,none,own,2,skilled employee/official,1,yes,yes,1
1,0 <= ... < 200 DM,48,existing credits paid back duly till now,domestic appliances,5951,... < 100 DM,1 <= ... < 4 years,2,female : divorced/separated/married,none,...,real estate,22,none,own,1,skilled employee/official,1,no,yes,0
2,no checking account,12,critical account/other credits existing,retraining,2096,... < 100 DM,4 <= ... < 7 years,2,male : single,none,...,real estate,49,none,own,1,unskilled - resident,2,no,yes,1
3,... < 100 DM,42,existing credits paid back duly till now,radio/television,7882,... < 100 DM,4 <= ... < 7 years,2,male : single,guarantor,...,building society savings agreement/life insurance,45,none,for free,1,skilled employee/official,2,no,yes,1
4,... < 100 DM,24,delay in paying off in the past,car (new),4870,... < 100 DM,1 <= ... < 4 years,3,male : single,none,...,unknown/no property,53,none,for free,2,skilled employee/official,2,no,yes,0


#### Checking for Missing Values and Data Types

In [5]:
# Per-column count of null entries
missing_values = credit_data.isna().sum()

# Per-column pandas dtype
data_types = credit_data.dtypes

# Print both summaries, null counts first
for summary in (missing_values, data_types):
    print(summary)

status                     0
duration                   0
credit_history             0
purpose                    0
amount                     0
savings                    0
employment_duration        0
installment_rate           0
personal_status_sex        0
other_debtors              0
present_residence          0
property                   0
age                        0
other_installment_plans    0
housing                    0
number_credits             0
job                        0
people_liable              0
telephone                  0
foreign_worker             0
credit_risk                0
dtype: int64
status                     object
duration                    int64
credit_history             object
purpose                    object
amount                      int64
savings                    object
employment_duration        object
installment_rate            int64
personal_status_sex        object
other_debtors              object
present_residence           int64
prop

#### One-Hot Encoding of Categorical Variables

In [28]:
# Selecting categorical (object-dtype) columns for one-hot encoding
categorical_columns = credit_data.select_dtypes(include=['object']).columns

# Request a dense array from the encoder. scikit-learn renamed the `sparse`
# keyword to `sparse_output` in 1.2 and removed the old name in 1.4, so try the
# current spelling first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
encoded_data = encoder.fit_transform(credit_data[categorical_columns])

# Creating a DataFrame from the encoded data. Reusing credit_data's index keeps
# the column-wise concat below row-aligned even if the frame is not 0..n-1.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=credit_data.index,
)

# Dropping the original categorical columns and concatenating the new encoded columns
credit_data_cleaned = credit_data.drop(categorical_columns, axis=1)
credit_data_cleaned = pd.concat([credit_data_cleaned, encoded_df], axis=1)

# Guard against silent row misalignment introduced by the concat
assert len(credit_data_cleaned) == len(credit_data)

print(encoded_df.columns)

Index(['status_... < 100 DM',
       'status_... >= 200 DM / salary for at least 1 year',
       'status_0 <= ... < 200 DM', 'status_no checking account',
       'credit_history_all credits at this bank paid back duly',
       'credit_history_critical account/other credits existing',
       'credit_history_delay in paying off in the past',
       'credit_history_existing credits paid back duly till now',
       'credit_history_no credits taken/all credits paid back duly',
       'purpose_business', 'purpose_car (new)', 'purpose_car (used)',
       'purpose_domestic appliances', 'purpose_education',
       'purpose_furniture/equipment', 'purpose_others',
       'purpose_radio/television', 'purpose_repairs', 'purpose_retraining',
       'savings_... < 100 DM', 'savings_... >= 1000 DM',
       'savings_100 <= ... < 500 DM', 'savings_500 <= ... < 1000 DM',
       'savings_unknown/no savings account',
       'employment_duration_... < 1 year',
       'employment_duration_... >= 7 years',
  



#### Handling Outliers

In [7]:
# Function to detect and handle outliers using IQR
def handle_outliers(df, column, factor=1.5):
    """Cap the values of one column to the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 0.25 and 0.75 quantiles of ``df[column]``. Values outside
    the fences are replaced by the nearest fence; values inside are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified IN PLACE: ``df[column]`` is
        overwritten with the capped values.
    column : str
        Name of the numeric column to cap.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences. Must be
        non-negative so the lower fence never exceeds the upper one.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Handling outliers by capping: low side first, then high side
    capped = np.where(values < lower_bound, lower_bound, values)
    capped = np.where(capped > upper_bound, upper_bound, capped)
    df[column] = capped
    return df

# Numeric features that receive IQR capping
numerical_columns = [
    'duration',
    'amount',
    'age',
]

# Cap each listed column in turn, rebinding the frame to the returned result
for column in numerical_columns:
    credit_data_cleaned = handle_outliers(df=credit_data_cleaned, column=column)


In [8]:
# Write the cleaned frame to disk; the row index is left out of the file
cleaned_csv_path = r'C:\Users\ual-laptop\Desktop\MIS\MIS 545\FinalProject\Cleaned_GermanCredit.csv'
credit_data_cleaned.to_csv(cleaned_csv_path, index=False)

##### Data Preparation

In [9]:
# Read the cleaned CSV back in
cleaned_csv_path = r'C:\Users\ual-laptop\Desktop\MIS\MIS 545\FinalProject\Cleaned_GermanCredit.csv'  # Replace with your file path
cleaned_data = pd.read_csv(cleaned_csv_path)

# 'credit_risk' is the target; every other column is used as a feature
target_column = 'credit_risk'
y = cleaned_data[target_column]  # Target variable
X = cleaned_data.drop(columns=[target_column])  # Features

# Hold out 20% of the rows for testing, with a fixed seed for the shuffle
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split


#### Logistic Regression

In [10]:
# Fit logistic regression on the training split (iteration cap raised to 1000)
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the held-out rows and score them with plain accuracy
log_reg_pred = log_reg.predict(X_test)
log_reg_accuracy = accuracy_score(y_true=y_test, y_pred=log_reg_pred)

#### K-Nearest Neighbors (KNN)

In [11]:
# KNN is distance-based, so large-valued columns such as 'amount' would
# otherwise swamp the one-hot 0/1 columns. Standardise inside a pipeline so the
# scaler is fitted on the training split only and reused for the test split.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier())
knn.fit(X_train, y_train)

# Predict the held-out rows and score them with plain accuracy
knn_pred = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_pred)

#### Decision Tree

In [12]:
# Seed the tree so repeated runs give the same model; 42 matches the seed
# already used for train_test_split in this notebook.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

# Predict the held-out rows and score them with plain accuracy
decision_tree_pred = decision_tree.predict(X_test)
decision_tree_accuracy = accuracy_score(y_test, decision_tree_pred)

#### Linear Regression (Simple, with one feature)

##### Selecting a single feature for simple linear regression

In [13]:
# Feature used for the one-variable model
single_feature = ['amount']  # Replace 'amount' with your chosen feature
X_linear = X_train[single_feature]
X_test_linear = X_test[single_feature]

# Fit on the training split, then score the test predictions with MSE
lin_reg = LinearRegression().fit(X_linear, y_train)
lin_reg_pred = lin_reg.predict(X_test_linear)
lin_reg_mse = mean_squared_error(y_true=y_test, y_pred=lin_reg_pred)

#### Multiple Regression (with all features)

In [14]:
# Ordinary least squares on every feature column
multi_reg = LinearRegression().fit(X_train, y_train)

# Mean squared error of the test-split predictions
multi_reg_pred = multi_reg.predict(X_test)
multi_reg_mse = mean_squared_error(y_true=y_test, y_pred=multi_reg_pred)

#### Naive Bayes

In [15]:
# Gaussian Naive Bayes with default settings, fitted on the training split
nb = GaussianNB().fit(X_train, y_train)

# Predict the held-out rows and score them with plain accuracy
nb_pred = nb.predict(X_test)
nb_accuracy = accuracy_score(y_true=y_test, y_pred=nb_pred)

#### Random Forest

In [17]:
# Seed the forest so its random sampling is repeatable between runs; 42
# matches the seed already used for train_test_split in this notebook.
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

# Predict the held-out rows and score them with plain accuracy
rf_pred = random_forest.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

In [22]:
# Neural network: two ReLU hidden layers (128 and 64 units) feeding a single
# sigmoid output unit for the binary credit_risk target
n_features = X_train.shape[1]

nn_model = Sequential()
nn_model.add(Dense(128, activation='relu', input_shape=(n_features,)))
nn_model.add(Dense(64, activation='relu'))
nn_model.add(Dense(1, activation='sigmoid'))

# Adam optimiser with binary cross-entropy loss; accuracy is tracked as a metric
nn_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 10 epochs in mini-batches of 32 rows
nn_model.fit(X_train, y_train, epochs=10, batch_size=32)

# evaluate() returns [loss, accuracy] for this compile configuration
nn_loss, nn_accuracy = nn_model.evaluate(X_test, y_test)




Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
