# Importing Libraries and Datasets

### Importing the libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score, accuracy_score
from imblearn.over_sampling import SMOTENC
from collections import Counter
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline as ImPipeline
from imblearn.over_sampling import SMOTENC
from sklearn.linear_model import Perceptron

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Importing the datasets

In [3]:
# assigning column names from dataset details
column_names = [
    'CheckingAccountStatus', 'Duration', 'CreditHistory', 'Purpose', 'CreditAmount',
    'SavingsAccountBonds', 'PresentEmploymentSince', 'InstallmentRatePercentage',
    'PersonalStatusSex', 'OtherDebtorsGuarantors', 'PresentResidenceSince',
    'Property', 'Age', 'OtherInstallmentPlans', 'Housing', 'NumberOfExistingCreditsAtBank',
    'Job', 'NumberOfPeopleLiable', 'Telephone', 'ForeignWorker', 'LoanApproval'
]

#identifying categorical columns for hot encoding
categorical_columns = [
    'CheckingAccountStatus', 'CreditHistory', 'Purpose', 'SavingsAccountBonds',
    'PresentEmploymentSince', 'PersonalStatusSex', 'OtherDebtorsGuarantors',
    'Property', 'OtherInstallmentPlans', 'Housing', 'Job', 'Telephone', 'ForeignWorker'
]

dataset = pd.read_csv('german_credit_data.csv', header = None, names = column_names) #column names

#X = dataset.iloc[:, :-1].values #all rows, EXCEPT last column for independent variable
#y = dataset.iloc[:, -1].values #all rows, and just last column for depent variable vector 

In [4]:
print(dataset.head())

  CheckingAccountStatus  Duration CreditHistory Purpose  CreditAmount  \
0                   A11         6           A34     A43          1169   
1                   A12        48           A32     A43          5951   
2                   A14        12           A34     A46          2096   
3                   A11        42           A32     A42          7882   
4                   A11        24           A33     A40          4870   

  SavingsAccountBonds PresentEmploymentSince  InstallmentRatePercentage  \
0                 A65                    A75                          4   
1                 A61                    A73                          2   
2                 A61                    A74                          2   
3                 A61                    A74                          2   
4                 A61                    A73                          3   

  PersonalStatusSex OtherDebtorsGuarantors  ...  Property Age  \
0               A93                   A101  .

### Encoding categorical data

In [5]:
# encoding categorical data

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), categorical_columns)], remainder='passthrough')

X = ct.fit_transform(dataset.drop(['LoanApproval'], axis=1))
y = dataset['LoanApproval'].values
print(X[1])

[0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00
 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00
 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 1.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00
 0.000e+00 1.000e+00 0.000e+00 1.000e+00 0.000e+00 4.800e+01 5.951e+03
 2.000e+00 2.000e+00 2.200e+01 1.000e+00 1.000e+00]


### Feature Scaling

In [6]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Requirement 1

## Splitting the dataset into the Training, Test, and Validation sets 

In [7]:
#60/20/20 training split. train/validation/test
from sklearn.model_selection import train_test_split

#splitting twice for 60/20/20 split
X_leftover, X_test, y_leftover, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) #800 leftover 200 test
X_train, X_valid, y_train, y_valid = train_test_split(X_leftover, y_leftover, test_size = 0.25, random_state = 0) #25% valid, 75% test, 200, 600

In [8]:
#checking lengths just in case
print(len(X_train))
print(len(y_train))
print(len(X_valid))
print(len(y_valid))
print(len(X_test))
print(len(y_test))


600
600
200
200
200
200


## Taking Care of Misssing Data (R1.2)

### Checking for Missing Data

In [9]:
# checkl if there is any missing data
missing_values = dataset.isnull().sum()
print(missing_values)
# none was found!

CheckingAccountStatus            0
Duration                         0
CreditHistory                    0
Purpose                          0
CreditAmount                     0
SavingsAccountBonds              0
PresentEmploymentSince           0
InstallmentRatePercentage        0
PersonalStatusSex                0
OtherDebtorsGuarantors           0
PresentResidenceSince            0
Property                         0
Age                              0
OtherInstallmentPlans            0
Housing                          0
NumberOfExistingCreditsAtBank    0
Job                              0
NumberOfPeopleLiable             0
Telephone                        0
ForeignWorker                    0
LoanApproval                     0
dtype: int64


### Taking Care of Missing Data (none)

In [10]:
# none was found

# Requirement 2



### Generating Perceptron Model and Deploying on Test Data

In [11]:
# Create and train the Perceptron model on scaled data
perceptron = Perceptron(max_iter=1000, tol=1e-3, random_state=0)

perceptron.fit(X_train, y_train)
y_pred_valid = perceptron.predict(X_valid)

#printing out initial validation scores
print(f"Initial Validation F1 Score: {f1_score(y_valid, y_pred_valid, average='weighted')}")
f1_scores_per_class = f1_score(y_valid, y_pred_valid, average=None)
print("Inital Validation F1 Scores for Each Class:", f1_scores_per_class)


Initial Validation F1 Score: 0.7749197860962568
Inital Validation F1 Scores for Each Class: [0.82575758 0.66176471]


# Requirement 3

## Applying SMOTENC to Balance the Dataset

In [12]:
# finding indicies
categorical_feature_indices = [column_names.index(col) for col in categorical_columns]
print(categorical_feature_indices)

# balancing the dataset
smotenc = SMOTENC(categorical_features=categorical_feature_indices, random_state=42, sampling_strategy=1, k_neighbors=3)
X_train_resampled, y_train_resampled = smotenc.fit_resample(X_train, y_train)

#printing the amount for each class before and after balancing
print('original: ', Counter(y_train))
print('oversampled: ', Counter(y_train_resampled))

[0, 2, 3, 5, 6, 8, 9, 11, 13, 14, 16, 18, 19]
original:  Counter({1: 420, 2: 180})
oversampled:  Counter({1: 420, 2: 420})


## Generating NEW Linear Perceptron on the Oversampled Dataset

### Making New Perceptron Model and Displaying Results

In [13]:
# generate new linear perceptron model on oversampled data set
perceptron.fit(X_train_resampled, y_train_resampled)
y_pred_test = perceptron.predict(X_test)

#printing scores for each class post smotenc
print(f"Post-SMOTENC Test F1 Score: {f1_score(y_test, y_pred_test, average='weighted')}")
f1_scores_per_class = f1_score(y_test, y_pred_test, average=None)
print("Post-SMOTENC Test F1 Scores for Each Class:", f1_scores_per_class)

Post-SMOTENC Test F1 Score: 0.6648871527777778
Post-SMOTENC Test F1 Scores for Each Class: [0.7265625  0.51388889]


# Requirement 5

### My approach to finding the least significant features

At first: I decided to mimic the approach of the cortnory heart diease we discussed in class. I will use the pandas library to find the highly correlated features, and remove the least signifiant ones. As we saw in the heart diease example, just because a feature impacts the target variable, doesn't necessarily mean it is the most significant. This is due to the correlation between features. I could not find correlation, I might have done it wrong, but removal of features one by one did no change the F1 scores of the model. 

### Idenfitying the Least Significant Predictors

In [14]:

# slsect 20 for n features, then 20. Which ever dont show up are least sig predictors
selector = RFE(perceptron, n_features_to_select=18, step=1)
selector = selector.fit(X_train_resampled, y_train_resampled)

# transform the training and test sets to keep only the selected features
X_train_selected = selector.transform(X_train_resampled)
X_test_selected = selector.transform(X_test)

# new perceptron
perceptron.fit(X_train_selected, y_train_resampled)
y_pred_test = perceptron.predict(X_test_selected)

# print f1 scores here
print(f"F1 Score with Selected Features: {f1_score(y_test, y_pred_test, average='weighted')}")

# to see which features were selected
print(f"Mask of Selected Features: {selector.support_}")
print(f"Ranking of Features: {selector.ranking_}")

# to get the indices of selected features, if needed
selected_features_indices = [i for i, x in enumerate(selector.support_) if x]
print(f"Indices of Selected Features: {selected_features_indices}")


F1 Score with Selected Features: 0.6375311225023538
Mask of Selected Features: [False False  True  True  True False False False False False  True False
 False  True False False False  True  True  True  True  True  True  True
 False False False False False False False False False False False  True
 False False False False False False False False False  True False False
 False False False False False  True  True  True  True False False False
 False]
Ranking of Features: [37  3  1  1  1 40 41 35  5 34  1 17 19  1  6 26 31  1  1  1  1  1  1  1
 10 42 30  4 11 18 32 25 29 20 24  1  9 22 21 13 39 12 38 43  2  1 44 27
 23 15 36  8 28  1  1  1  1  7 14 16 33]
Indices of Selected Features: [2, 3, 4, 10, 13, 17, 18, 19, 20, 21, 22, 23, 35, 45, 53, 54, 55, 56]


### Describing why the two features are the least significant. For example, in the cortnory heart diesease, the obesity and smoking(?) were not significant because they were so highly correlated with the family history. What relations can you find here?

## Dropping Two Least Significant Features

### Dropping the features and displaying F values

In [19]:
# indices 42 qand 40

# drop the two least significant features: indices 40 and 42 (adjust for 0-based indexing if necessary)
X_train_resampled_dropped = np.delete(X_train_resampled, [40, 42], axis=1)
X_test_dropped = np.delete(X_test, [40, 42], axis=1)

# new perceptron
perceptron = Perceptron(max_iter=1000, tol=1e-3, random_state=0)
perceptron.fit(X_train_resampled_dropped, y_train_resampled)
y_pred = perceptron.predict(X_test_dropped)

#print f1 scores here
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1 Score after dropping least significant features: {f1:.4f}")
f1_scores_per_class = f1_score(y_test, y_pred, average=None)
print("Test F1 Scores for Each Class After Dropping:", f1_scores_per_class)

F1 Score after dropping least significant features: 0.6800
Test F1 Scores for Each Class After Dropping: [0.77464789 0.44827586]


: 

# Requirement 6

### Identify the Two Most Significant Features

In [16]:
# using coefficient to find least significant predictors 
importance = np.abs(perceptron.coef_[0])

# get indices of the two features with the highest coefficients
top2_indices = np.argsort(importance)[-2:]

print(f"Indices of the two most significant predictors: {top2_indices}")
print(f"Coefficients of the two most significant predictors: {importance[top2_indices]}")


Indices of the two most significant predictors: [ 2 53]
Coefficients of the two most significant predictors: [12.        15.0123859]


### Describe the method, together with your reasoning as to why the 2 features are the most significant.

enter your reasoning here noe

# Requirement 7

## Tunning the Oversampling Ratio and n-values

### Using Grid Search on the Validation Set to find optimal tuning

In [17]:
print(Counter(y_train))

Counter({1: 420, 2: 180})


In [18]:
# setting up the pipline
pipeline = ImPipeline([
    ('smotenc', SMOTENC(categorical_features=categorical_feature_indices, random_state=42)),
    ('perceptron', Perceptron(max_iter=1000, tol=1e-3, random_state=0))
])

# setting neighbors for the search to try
param_grid = {
    'smotenc__sampling_strategy': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.8, 1.0],  # Oversampling ratios to try
    'smotenc__k_neighbors': [1, 2, 3, 4, 5, 6, 7, 9, 10]  # Number of nearest neighbors
}

# dropping the valid least significant  features via indicies
X_train_dropped = np.delete(X_train, [40, 42], axis=1)
X_valid_dropped = np.delete(X_valid, [40, 42], axis=1)

# actually doing the grid search
grid_search = GridSearchCV(pipeline, param_grid, scoring='f1_weighted', cv=5)
grid_search.fit(X_train_dropped, y_train)

print("Best parameters found: ", grid_search.best_params_)

# using best estimator to predict on validation set
best_estimator = grid_search.best_estimator_
y_pred_valid = best_estimator.predict(X_valid_dropped)

# printing f1 scores
accuracy_valid = accuracy_score(y_valid, y_pred_valid)
f1_valid = f1_score(y_valid, y_pred_valid, average='weighted')
print(f"Validation F1 Score with GridSearch Optimized SMOTENC: {f1_valid:.4f}")

# priting f1 scores fopr both classes
f1_scores_per_class_valid = f1_score(y_valid, y_pred_valid, average=None)
print("Validation F1 Scores for Each Class with GridSearch Optimized SMOTENC:", f1_scores_per_class_valid)

Best parameters found:  {'smotenc__k_neighbors': 10, 'smotenc__sampling_strategy': 0.8}
Validation F1 Score with GridSearch Optimized SMOTENC: 0.7479
Validation F1 Scores for Each Class with GridSearch Optimized SMOTENC: [0.796875   0.63888889]


180 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\noetr\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\noetr\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\noetr\AppData