> # Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
results_list = []


> # Random Forest

### *Model no.1*

In [3]:


#  Data loading
train_data = pd.read_csv(r'd:\DaneshKar\Project_Eden\Clean_Loan.csv')
test_data = pd.read_csv(r'd:\DaneshKar\Project_Eden\Clean_Test.csv')

#  Data preprocessing (handling missing values, scaling features)
train_data = train_data.dropna()  # حذف مقادیر گمشده برای ساده‌سازی
X = train_data.drop('Loan_Status', axis=1)  # انتخاب ویژگی‌ها
y = train_data['Loan_Status']  # انتخاب هدف (loan_status)

#  Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#  Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

#  Model - Random Forest
model = RandomForestClassifier(random_state=42, class_weight='balanced')  # استفاده از class_weight برای توازن داده‌ها
model.fit(X_train, y_train)

#  Predictions
y_pred = model.predict(X_test)

#  Evaluation
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("F1 Score: ", f1)
print("Confusion Matrix:\n", conf_matrix)

# بررسی تعداد false positives (FP) و false negatives (FN)
fp = conf_matrix[0, 1]  # تعداد false positive
fn = conf_matrix[1, 0]  # تعداد false negative
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")


F1 Score:  0.8426966292134831
Confusion Matrix:
 [[17 19]
 [ 9 75]]
False Positives (FP): 19
False Negatives (FN): 9


### *Model No.2 : Rf with threshold for decreasing FalsePositive*

In [4]:
import numpy as np

#  Getting predicted probabilities instead of discrete labels
y_prob = model.predict_proba(X_test)[:, 1]  # Probability for the positive class (loan_status=1)

#  Define new threshold (e.g. 0.7)
threshold = 0.7
y_pred_new = (y_prob >= threshold).astype(int)  # Applying the new threshold

#  Evaluation with new threshold
f1_new = f1_score(y_test, y_pred_new)
conf_matrix_new = confusion_matrix(y_test, y_pred_new)

print("New F1 Score: ", f1_new)
print("New Confusion Matrix:\n", conf_matrix_new)

# Checking False Positives and False Negatives again
fp_new = conf_matrix_new[0, 1]  # False Positives (FP)
fn_new = conf_matrix_new[1, 0]  # False Negatives (FN)
print(f"False Positives (FP) with new threshold: {fp_new}")
print(f"False Negatives (FN) with new threshold: {fn_new}")


New F1 Score:  0.7975460122699386
New Confusion Matrix:
 [[22 14]
 [19 65]]
False Positives (FP) with new threshold: 14
False Negatives (FN) with new threshold: 19


### *Model No.3 : Rf with balance data*

In [5]:


# Initialize SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Apply SMOTE to balance the data
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Train the model again with the balanced data
model_smote = RandomForestClassifier(random_state=42)
model_smote.fit(X_train_smote, y_train_smote)

#  Predictions
y_pred_smote = model_smote.predict(X_test)

# Evaluation with SMOTE balance
f1_smote = f1_score(y_test, y_pred_smote)
conf_matrix_smote = confusion_matrix(y_test, y_pred_smote)

print("F1 Score with SMOTE: ", f1_smote)
print("Confusion Matrix with SMOTE:\n", conf_matrix_smote)

# بررسی تعداد false positives (FP) و false negatives (FN)
fp_smote = conf_matrix_smote[0, 1]
fn_smote = conf_matrix_smote[1, 0]
print(f"False Positives (FP) with SMOTE: {fp_smote}")
print(f"False Negatives (FN) with SMOTE: {fn_smote}")



F1 Score with SMOTE:  0.8372093023255814
Confusion Matrix with SMOTE:
 [[20 16]
 [12 72]]
False Positives (FP) with SMOTE: 16
False Negatives (FN) with SMOTE: 12


### *Model No.4 : Rf with threshold and balance data*

In [6]:
#  Getting predicted probabilities instead of discrete labels
y_prob_new = model_smote.predict_proba(X_test)[:, 1]  # Probability for the positive class (loan_status=1)

#  Define new threshold (e.g. 0.6)
threshold = 0.6
y_pred_smote_new = (y_prob_new >= threshold).astype(int)  # Applying the new threshold

#  Evaluation with new threshold
f1_smote_new = f1_score(y_test, y_pred_smote_new)
conf_matrix_smote_new = confusion_matrix(y_test, y_pred_smote_new)

print("New F1 Score: ", f1_smote_new)
print("New Confusion Matrix:\n", conf_matrix_smote_new)

# Checking False Positives and False Negatives again
fp_new = conf_matrix_smote_new[0, 1]  # False Positives (FP)
fn_new = conf_matrix_smote_new[1, 0]  # False Negatives (FN)
print(f"False Positives (FP) with new threshold: {fp_new}")
print(f"False Negatives (FN) with new threshold: {fn_new}")


New F1 Score:  0.8
New Confusion Matrix:
 [[21 15]
 [18 66]]
False Positives (FP) with new threshold: 15
False Negatives (FN) with new threshold: 18


### *calculate error percentage*  

- Rf Model No.1

In [7]:
accuracy = accuracy_score(y_test, y_pred)
error_percentage1 = 100 * (1 - accuracy)
print("Percentage of Error:", error_percentage1, "%")

Percentage of Error: 23.33333333333333 %


- Rf Model No.2

In [8]:
accuracy = accuracy_score(y_test, y_pred_new)
error_percentage2 = 100 * (1 - accuracy)
print("Percentage of Error:", error_percentage2, "%")

Percentage of Error: 27.500000000000004 %


- Rf Model No.3

In [9]:
accuracy = accuracy_score(y_test, y_pred_smote)
error_percentage3 = 100 * (1 - accuracy)
print("Percentage of Error:", error_percentage3, "%")

Percentage of Error: 23.33333333333333 %


- Rf Model No.4

In [10]:
accuracy = accuracy_score(y_test, y_pred_smote_new)
error_percentage4 = 100 * (1 - accuracy)
print("Percentage of Error:", error_percentage4, "%")

Percentage of Error: 27.500000000000004 %


# RandomForest Model Evaluation and Selection

Based on the provided criteria (**F1 Score**, **Percentage of Error**, and minimizing **False Positives (FP)**), here's an analysis of the four models:

## Summary of Metrics

| **Model**  | **F1 Score** | **FP** | **FN** | **Error %**  |
|------------|--------------|---------|---------|--------------|
| **Model 1** | 0.8427       | 19      | 9       | 23.33        |
| **Model 2** | 0.7975       | 14      | 19      | 27.50        |
| **Model 3** | 0.8372       | 16      | 12      | 23.33        |
| **Model 4** | 0.8000       | 15      | 18      | 27.50        |

---

## Analysis

### Model 1
- **F1 Score**: The highest (0.8427), indicating an excellent balance between Precision and Recall.
- **Error %**: Good (23.33%).
- **FP**: The highest among all models (19), which is unfavorable for minimizing false positives.

### Model 2
- **FP**: The lowest (14), aligning with the goal of minimizing false positives.
- **F1 Score**: The lowest (0.7975), showing weaker Precision and Recall balance compared to other models.
- **Error %**: The highest (27.50%), which is a disadvantage.

### Model 3
- **F1 Score**: Near the top (0.8372, slightly below Model 1) and significantly better than Models 2 and 4.
- **Error %**: Matches Model 1 (23.33%) and is better than Models 2 and 4.
- **FP**: Lower than Model 1 (16 vs. 19) but slightly higher than Model 2 and Model 4.

### Model 4
- **FP**: Moderate (15), lower than Models 1 and 3 but slightly higher than Model 2.
- **F1 Score**: Better than Model 2 (0.8000 vs. 0.7975) but falls behind Models 1 and 3.
- **Error %**: Same as Model 2 (27.50%), which is higher than Models 1 and 3.

---

## Best Model Selection

### Recommendation:
- **Model 3** remains the best option based on:
  - A strong balance between **F1 Score** (0.8372) and **Error %** (23.33%).
  - Moderately low **FP** (16), which improves upon Model 1's performance while maintaining a strong overall metric balance.

### Alternative:
- If minimizing **FP** is the highest priority, **Model 2** can be considered due to its lowest **FP** (14). However, the drop in **F1 Score** and higher **Error %** make it less favorable for a balanced solution.

---



In [11]:
results_list.append({
    'Random Forest Error Rate (%)': error_percentage3,
    'Random Forest f1-score (%)': f1_smote,
    'Random Forest FP (%)' : "16"
})    

> # XGBoost

### *Model No.1*

In [12]:
import xgboost as xgb
from sklearn.metrics import f1_score, confusion_matrix

# Calculate the number of samples per class
unique_classes, class_counts = np.unique(y_train, return_counts=True)

# Train an XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42, scale_pos_weight=class_counts[0] / class_counts[1])
xgb_model.fit(X_train, y_train)

#  Predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluation
f1_xgb = f1_score(y_test, y_pred_xgb)
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)

print("F1 Score with XGBoost: ", f1_xgb)
print("Confusion Matrix with XGBoost:\n", conf_matrix_xgb)

# False Positives and False Negatives
fp_xgb = conf_matrix_xgb[0, 1]
fn_xgb = conf_matrix_xgb[1, 0]
print(f"False Positives (FP) with XGBoost: {fp_xgb}")
print(f"False Negatives (FN) with XGBoost: {fn_xgb}")


F1 Score with XGBoost:  0.8235294117647058
Confusion Matrix with XGBoost:
 [[20 16]
 [14 70]]
False Positives (FP) with XGBoost: 16
False Negatives (FN) with XGBoost: 14


In [13]:
accuracy = accuracy_score(y_test, y_pred_xgb)
error_percentage_xgb1 = 100 * (1 - accuracy)
print("Percentage of Error:", error_percentage_xgb1, "%")

Percentage of Error: 25.0 %


### *Model No.2 : XGBoost with balance data*

In [14]:
import xgboost as xgb
from sklearn.metrics import f1_score, confusion_matrix

# Train an XGBoost model
xgb_model2 = xgb.XGBClassifier(random_state=42, eval_metric='auc')
xgb_model2.fit(X_train_smote, y_train_smote)

# Predictions
y_pred_xgb2 = xgb_model2.predict(X_test)

# Evaluation
f1_xgb_balance = f1_score(y_test, y_pred_xgb2)
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb2)

print("F1 Score with XGBoost: ", f1_xgb_balance)
print("Confusion Matrix with XGBoost:\n", conf_matrix_xgb)

# False Positives and False Negatives
fp_xgb = conf_matrix_xgb[0, 1]
fn_xgb = conf_matrix_xgb[1, 0]
print(f"False Positives (FP) with XGBoost: {fp_xgb}")
print(f"False Negatives (FN) with XGBoost: {fn_xgb}")


F1 Score with XGBoost:  0.7951807228915663
Confusion Matrix with XGBoost:
 [[20 16]
 [18 66]]
False Positives (FP) with XGBoost: 16
False Negatives (FN) with XGBoost: 18


In [15]:
accuracy = accuracy_score(y_test, y_pred_xgb2)
error_percentage_xgb2 = 100 * (1 - accuracy)
print("Percentage of Error:", error_percentage_xgb2, "%")

Percentage of Error: 28.333333333333332 %


# XGBOOST Model Evaluation and Selection

Based on the provided criteria (**F1 Score**, **Percentage of Error**, and minimizing **False Positives (FP)**), here is an analysis of the two models.

---

## Summary of Metrics

| **Model**  | **F1 Score** | **FP** | **FN** | **Error %** |
|------------|--------------|---------|---------|-------------|
| **Model 1** | 0.8235       | 16      | 14      | 25.0        |
| **Model 2** | 0.7952       | 16      | 18      | 28.33       |

---

## Analysis

### **Model 1**:
1. **F1 Score**:
   - The highest (0.8235), indicating a better balance between Precision and Recall.
2. **Error %**:
   - Lower (25.0%) compared to Model 2, indicating higher accuracy.
3. **FP**:
   - Equal to Model 2 (16).
4. **FN**:
   - Lower (14 vs. 18), reflecting better Recall performance.

---

### **Model 2**:
1. **F1 Score**:
   - Lower than Model 1 (0.7952 vs. 0.8235), showing weaker Precision and Recall balance.
2. **Error %**:
   - Higher than Model 1 (28.33% vs. 25.0%).
3. **FP**:
   - Equal to Model 1 (16).
4. **FN**:
   - Higher (18 vs. 14), indicating a drop in Recall.

---

## Conclusion and Recommendation:
- **Model 1** is the best choice because:
  - It has the highest **F1 Score** (0.8235 vs. 0.7952).
  - It has a lower **Error Percentage** (25.0% vs. 28.33%).
  - It has fewer **False Negatives (FN)**, resulting in better Recall.

Since the goal is to **minimize False Positives (FP)**, and both models have the same FP value (16), **Model 1** is the recommended choice due to its overall better performance.

---

### Note:
If further optimization for **reducing FP** is required, the decision threshold for **Model 1** can be adjusted to improve this metric further.


In [16]:
results_list.append({
    'XGBOOST Error Rate (%)': error_percentage_xgb1,
    'XGBOOST f1-score (%)': f1_xgb,
    'XGBOOST FP (%)' : "16"
})    

> # LogesticRegression

### *Model No.1*

In [17]:
from sklearn.linear_model import LogisticRegression
log_reg_model = LogisticRegression(random_state=42, class_weight='balanced')
log_reg_model.fit(X_train, y_train)

# Predictions
y_pred_log_reg = log_reg_model.predict(X_test)  # پیش‌بینی نتایج

# Evaluation
f1_log_reg = f1_score(y_test, y_pred_log_reg)  # محاسبه F1 Score
conf_matrix_log_reg = confusion_matrix(y_test, y_pred_log_reg)  # محاسبه Confusion Matrix

print("F1 Score (Logistic Regression): ", f1_log_reg)  # نمایش F1 Score
print("Confusion Matrix (Logistic Regression):\n", conf_matrix_log_reg)  # نمایش ماتریس گیجی

# بررسی تعداد false positives (FP) و false negatives (FN)
fp_log_reg = conf_matrix_log_reg[0, 1]  # تعداد false positive
fn_log_reg = conf_matrix_log_reg[1, 0]  # تعداد false negative
print(f"False Positives (FP) (Logistic Regression): {fp_log_reg}")
print(f"False Negatives (FN) (Logistic Regression): {fn_log_reg}")

F1 Score (Logistic Regression):  0.861878453038674
Confusion Matrix (Logistic Regression):
 [[17 19]
 [ 6 78]]
False Positives (FP) (Logistic Regression): 19
False Negatives (FN) (Logistic Regression): 6


In [18]:
accuracy = accuracy_score(y_test, y_pred_log_reg)
error_percentage_log_reg = 100 * (1 - accuracy)
print("Percentage of Error:", error_percentage_log_reg, "%")

Percentage of Error: 20.833333333333336 %


### *Model No.2 : LogesticRegression with balance data*

In [19]:


log_reg_model2 = LogisticRegression(random_state=42, class_weight='balanced')
log_reg_model2.fit(X_train_smote, y_train_smote)

# Predictions
y_pred_log_reg = log_reg_model2.predict(X_test)  # پیش‌بینی نتایج

# Evaluation
f1_log_reg2 = f1_score(y_test, y_pred_log_reg)  # محاسبه F1 Score
conf_matrix_log_reg = confusion_matrix(y_test, y_pred_log_reg)  # محاسبه Confusion Matrix

print("F1 Score (Logistic Regression): ", f1_log_reg2)  # نمایش F1 Score
print("Confusion Matrix (Logistic Regression):\n", conf_matrix_log_reg)  # نمایش ماتریس گیجی

# بررسی تعداد false positives (FP) و false negatives (FN)
fp_log_reg = conf_matrix_log_reg[0, 1]  # تعداد false positive
fn_log_reg = conf_matrix_log_reg[1, 0]  # تعداد false negative
print(f"False Positives (FP) (Logistic Regression): {fp_log_reg}")
print(f"False Negatives (FN) (Logistic Regression): {fn_log_reg}")

F1 Score (Logistic Regression):  0.8461538461538461
Confusion Matrix (Logistic Regression):
 [[15 21]
 [ 7 77]]
False Positives (FP) (Logistic Regression): 21
False Negatives (FN) (Logistic Regression): 7


In [20]:
accuracy = accuracy_score(y_test, y_pred_log_reg)
error_percentage_log_reg2 = 100 * (1 - accuracy)
print("Percentage of Error:", error_percentage_log_reg2, "%")

Percentage of Error: 23.33333333333333 %


# LogesticRegression Model Evaluation and Selection

Based on the provided criteria (**F1 Score**, **Percentage of Error**, and minimizing **False Positives (FP)**), here is the analysis of the two models.

---

## Summary of Metrics

| **Model**  | **F1 Score** | **FP** | **FN** | **Error %** |
|------------|--------------|---------|---------|-------------|
| **Model 1** | 0.8619       | 19      | 6       | 20.83       |
| **Model 2** | 0.8462       | 21      | 7       | 23.33       |

---

## Analysis

### **Model 1**:
1. **F1 Score**:
   - The highest (0.8619), indicating better overall performance.
2. **Error %**:
   - Lower (20.83%) compared to Model 2, indicating better accuracy.
3. **FP**:
   - Slightly better than Model 2 (19 vs. 21).
4. **FN**:
   - Lower (6 vs. 7), leading to better Recall.

---

### **Model 2**:
1. **F1 Score**:
   - Lower than Model 1 (0.8462 vs. 0.8619), reflecting weaker Precision and Recall balance.
2. **Error %**:
   - Higher than Model 1 (23.33% vs. 20.83%).
3. **FP**:
   - Higher than Model 1 (21 vs. 19).
4. **FN**:
   - Slightly higher (7 vs. 6), resulting in slightly worse Recall.

---

## Conclusion and Recommendation:
- **Model 1** is the best choice because:
  - It has the highest **F1 Score** (0.8619 vs. 0.8462).
  - It has a lower **Error Percentage** (20.83% vs. 23.33%).
  - It has fewer **False Positives (FP)** (19 vs. 21).
  - It has fewer **False Negatives (FN)** (6 vs. 7).

Model 1 performs better overall, especially in minimizing both False Positives and False Negatives.

---

### Note:
If additional tuning for reducing **False Positives (FP)** is required, adjusting the model’s decision threshold may yield further improvements.


In [21]:
results_list.append({
    'Log_reg Error Rate (%)': error_percentage_log_reg,
    'Log_reg f1_score (%)': f1_log_reg,
    'Log_reg FP (%)' : "19"
})    

> # KNN

### *Model No.1*

In [22]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=27)  # مقداردهی اولیه KNN
knn_model.fit(X_train, y_train)

# Predictions
y_pred_knn = knn_model.predict(X_test)  # پیش‌بینی نتایج

# Evaluation
f1_knn = f1_score(y_test, y_pred_knn)  # محاسبه F1 Score
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn)  # محاسبه Confusion Matrix

print("F1 Score (KNN): ", f1_knn)  # نمایش F1 Score
print("Confusion Matrix (KNN):\n", conf_matrix_knn)  # نمایش ماتریس گیجی

# بررسی تعداد false positives (FP) و false negatives (FN)
fp_knn = conf_matrix_knn[0, 1]  # تعداد false positive
fn_knn = conf_matrix_knn[1, 0]  # تعداد false negative
print(f"False Positives (FP) (KNN): {fp_knn}")
print(f"False Negatives (FN) (KNN): {fn_knn}")

F1 Score (KNN):  0.8723404255319149
Confusion Matrix (KNN):
 [[14 22]
 [ 2 82]]
False Positives (FP) (KNN): 22
False Negatives (FN) (KNN): 2


In [23]:
accuracy = accuracy_score(y_test, y_pred_knn)
error_percentage_knn = 100 * (1 - accuracy)
print("Percentage of Error:", error_percentage_knn, "%")

Percentage of Error: 19.999999999999996 %


### *Model No.2 : KNN with balance data*

In [24]:
knn_model2 = KNeighborsClassifier(n_neighbors=16)  # مقداردهی اولیه KNN
knn_model2.fit(X_train_smote, y_train_smote)

# Predictions
y_pred_knn2 = knn_model2.predict(X_test)  # پیش‌بینی نتایج

# Evaluation
f1_knn2 = f1_score(y_test, y_pred_knn2)  # محاسبه F1 Score
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn2)  # محاسبه Confusion Matrix

print("F1 Score (KNN): ", f1_knn2)  # نمایش F1 Score
print("Confusion Matrix (KNN):\n", conf_matrix_knn)  # نمایش ماتریس گیجی

# بررسی تعداد false positives (FP) و false negatives (FN)
fp_knn = conf_matrix_knn[0, 1]  # تعداد false positive
fn_knn = conf_matrix_knn[1, 0]  # تعداد false negative
print(f"False Positives (FP) (KNN): {fp_knn}")
print(f"False Negatives (FN) (KNN): {fn_knn}")

F1 Score (KNN):  0.7636363636363637
Confusion Matrix (KNN):
 [[18 18]
 [21 63]]
False Positives (FP) (KNN): 18
False Negatives (FN) (KNN): 21


In [25]:
accuracy = accuracy_score(y_test, y_pred_knn2)
error_percentage_knn2 = 100 * (1 - accuracy)
print("Percentage of Error:", error_percentage_knn2, "%")

Percentage of Error: 32.49999999999999 %


### *Model No.3 : KNN with CV for best params* **params=best neighbors**

In [26]:
from sklearn.model_selection import train_test_split, GridSearchCV
knn_model3 = KNeighborsClassifier()

# Hyperparameter tuning using GridSearchCV
param_grid = {'n_neighbors': range(1, 21)}  # جستجو برای تعداد همسایه از 1 تا 20
grid_search = GridSearchCV(estimator=knn_model3, param_grid=param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

# Best parameter (k)
best_k = grid_search.best_params_['n_neighbors']
print(f"Best number of neighbors (k): {best_k}")

# Predictions using the best model
best_knn_model = grid_search.best_estimator_  # مدل با بهترین k
y_pred_knn3 = best_knn_model.predict(X_test)  # پیش‌بینی نتایج

# Evaluation
f1_knn3 = f1_score(y_test, y_pred_knn3)  # محاسبه F1 Score
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn3)  # محاسبه Confusion Matrix

print("F1 Score (KNN with best k): ", f1_knn3)  # نمایش F1 Score
print("Confusion Matrix (KNN with best k):\n", conf_matrix_knn)  # نمایش ماتریس گیجی

# بررسی تعداد false positives (FP) و false negatives (FN)
fp_knn = conf_matrix_knn[0, 1]  # تعداد false positive
fn_knn = conf_matrix_knn[1, 0]  # تعداد false negative
print(f"False Positives (FP) (KNN with best k): {fp_knn}")
print(f"False Negatives (FN) (KNN with best k): {fn_knn}")

Best number of neighbors (k): 16
F1 Score (KNN with best k):  0.8663101604278075
Confusion Matrix (KNN with best k):
 [[14 22]
 [ 3 81]]
False Positives (FP) (KNN with best k): 22
False Negatives (FN) (KNN with best k): 3


In [27]:
accuracy = accuracy_score(y_test, y_pred_knn3)
error_percentage_knn3 = 100 * (1 - accuracy)
print("Percentage of Error:", error_percentage_knn3, "%")

Percentage of Error: 20.833333333333336 %


# KNN Model Evaluation and Selection

Based on the provided criteria (**F1 Score**, **Percentage of Error**, and minimizing **False Positives (FP)**), here is the analysis of the three models.

---

## Summary of Metrics

| **Model**  | **F1 Score** | **FP** | **FN** | **Error %** |
|------------|--------------|---------|---------|-------------|
| **Model 1** | 0.8723       | 22      | 2       | 20.00       |
| **Model 2** | 0.7636       | 18      | 21      | 32.50       |
| **Model 3** | 0.8663       | 22      | 3       | 20.83       |

---

## Analysis

### **Model 1**:
1. **F1 Score**:
   - The highest (0.8723), indicating the best overall performance in balancing Precision and Recall.
2. **Error %**:
   - The lowest (20.00%), reflecting better accuracy compared to Model 2 and Model 3.
3. **FP**:
   - Slightly higher (22 vs. 18), compared to Model 2 but equal to Model 3.
4. **FN**:
   - The lowest (2 vs. 21 in Model 2, 3 in Model 3), indicating better Recall.

---

### **Model 2**:
1. **F1 Score**:
   - The lowest (0.7636), indicating a weaker balance between Precision and Recall.
2. **Error %**:
   - The highest (32.50%), meaning lower overall accuracy compared to Models 1 and 3.
3. **FP**:
   - The lowest (18 vs. 22 in Models 1 and 3), which may be preferred when aiming to minimize False Positives.
4. **FN**:
   - The highest (21 vs. 2 in Model 1 and 3), leading to much poorer Recall.

---

### **Model 3**:
1. **F1 Score**:
   - Slightly lower than Model 1 (0.8663 vs. 0.8723), but still high and a balanced performance between Precision and Recall.
2. **Error %**:
   - Slightly higher (20.83% vs. 20.00% in Model 1), but still a strong performance.
3. **FP**:
   - Equal to Model 1 (22).
4. **FN**:
   - Slightly higher (3 vs. 2 in Model 1), but still much better than Model 2.

---

## Conclusion and Recommendation:
- **Model 1** is the best choice because:
  - It has the highest **F1 Score** (0.8723), indicating the best balance between Precision and Recall.
  - It has the lowest **Error Percentage** (20.00%), reflecting the highest accuracy.
  - Although it has slightly more **False Positives (FP)** compared to Model 2, it has significantly fewer **False Negatives (FN)**, making it preferable overall.

---

### Note:
If further optimization for minimizing **False Positives (FP)** is required, you may experiment with the decision threshold or hyperparameter tuning. However, based on overall performance, **Model 1** is the best option.


In [28]:
results_list.append({
    'Log_reg Error Rate (%)': error_percentage_log_reg,
    'Log_reg f1_score (%)': f1_log_reg,
    'Log_reg FP (%)' : "22"
})    

> # ANN

In [29]:
# Libraries

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import keras
keras.utils.set_random_seed(42)
# Define the ANN model
model_ann = Sequential()
model_ann.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model_ann.add(Dropout(0.3))
model_ann.add(Dense(32, activation='relu'))
model_ann.add(Dropout(0.3))
model_ann.add(Dense(16, activation='relu'))
model_ann.add(Dropout(0.2))
model_ann.add(Dense(8, activation='relu'))
model_ann.add(Dropout(0.2))
model_ann.add(Dense(1, activation='sigmoid'))

model_ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Early Stopping برای جلوگیری از Overfitting
early_stopping = EarlyStopping(monitor='accuracy', patience=10, restore_best_weights=True)

# ReduceLROnPlateau برای کاهش نرخ یادگیری به صورت پویا
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.00001)

#  Train the model
history = model_ann.fit(X_train, y_train, epochs=200, batch_size=32, validation_split=0.2,callbacks=[early_stopping, lr_scheduler], verbose=1)

# Predictions and threshold adjustment
y_pred_ann = (model_ann.predict(X_test) > 0.5).astype('int32')

#  Evaluation
f1_ann = f1_score(y_test, y_pred_ann)
conf_matrix_ann = confusion_matrix(y_test, y_pred_ann)

print("F1 Score (ANN):", f1_ann)
print("Confusion Matrix (ANN):\n", conf_matrix_ann)

fp_ann = conf_matrix_ann[0, 1]
fn_ann = conf_matrix_ann[1, 0]
print(f"False Positives (FP) (ANN): {fp_ann}")
print(f"False Negatives (FN) (ANN): {fn_ann}")


Epoch 1/200


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.6077 - loss: 0.6876 - val_accuracy: 0.6354 - val_loss: 0.6913 - learning_rate: 0.0010
Epoch 2/200
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7047 - loss: 0.6761 - val_accuracy: 0.6146 - val_loss: 0.6849 - learning_rate: 0.0010
Epoch 3/200
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7019 - loss: 0.6599 - val_accuracy: 0.6146 - val_loss: 0.6777 - learning_rate: 0.0010
Epoch 4/200
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6951 - loss: 0.6400 - val_accuracy: 0.6146 - val_loss: 0.6692 - learning_rate: 0.0010
Epoch 5/200
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7240 - loss: 0.6293 - val_accuracy: 0.6146 - val_loss: 0.6598 - learning_rate: 0.0010
Epoch 6/200
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step -

In [30]:
print("F1 Score (ANN):", f1_ann)

F1 Score (ANN): 0.8756756756756757


In [31]:
accuracy = accuracy_score(y_test, y_pred_ann)
error_percentage_ann = 100 * (1 - accuracy)
print("Percentage of Error:", error_percentage_ann, "%")

Percentage of Error: 19.166666666666664 %


In [214]:
results_list.append({
    'Ann Error Rate (%)': error_percentage_ann,
    'Ann f1_score  (%)': f1_ann,
    'Ann FP (%)' : "20"
})    

In [215]:
# Combine results into a DataFrame.


results_list

[{'Random Forest Error Rate (%)': 23.33333333333333,
  'Random Forest f1-score (%)': 0.8372093023255814,
  'Random Forest FP (%)': '16'},
 {'XGBOOST Error Rate (%)': 25.0,
  'XGBOOST f1-score (%)': 0.8235294117647058,
  'XGBOOST FP (%)': '16'},
 {'Log_reg Error Rate (%)': 20.833333333333336,
  'Log_reg f1_score (%)': 0.861878453038674,
  'Log_reg FP (%)': '19'},
 {'Log_reg Error Rate (%)': 20.833333333333336,
  'Log_reg f1_score (%)': 0.861878453038674,
  'Log_reg FP (%)': '22'},
 {'Ann Error Rate (%)': 19.166666666666664,
  'Ann f1_score  (%)': 0.8756756756756757,
  'Ann FP (%)': '20'}]

# Overall Model Evaluation and Recommendation

### Summary of Model Metrics:

| **Model**      | **Error Rate (%)** | **F1 Score (%)** | **False Positives (FP)** |
|----------------|--------------------|------------------|--------------------------|
| **Random Forest** | 23.33             | 83.72            | 16                       |
| **XGBoost**      | 25.00             | 82.35            | 16                       |
| **Logistic Regression (1)**  | 20.83             | 86.19            | 19                       |
| **Logistic Regression (2)**  | 20.83             | 86.19            | 22                       |
| **ANN**           | 19.17             | 87.57            | 20                       |

---

## Analysis and Conclusion:

### Key Insights:
1. **Best F1 Score:**
   - **ANN** has the highest F1 score at **87.57%**, indicating excellent balance between Precision and Recall.

2. **Best Error Rate:**
   - **ANN** also has the lowest **error rate** at **19.17%**, showing superior accuracy compared to all other models.

3. **False Positives:**
   - **XGBoost** and **Random Forest** have the **lowest False Positives (FP)** at **16**, which is preferable in scenarios where minimizing false alarms is critical.
   - However, they come with a **higher error rate** and slightly lower F1 scores compared to **ANN**.

4. **Logistic Regression Performance:**
   - The two Logistic Regression models perform similarly, with an F1 score of **86.19%** and an error rate of **20.83%**, but they have relatively higher False Positives compared to the other models, especially with **22 FP** in the second logistic regression model.

---

## Final Recommendation:
- **ANN** is the best overall model based on both **F1 Score** and **Error Rate**. Although its False Positives are slightly higher (**20 FP**) compared to **XGBoost** and **Random Forest**, its high accuracy and F1 score make it the strongest candidate.
- If minimizing **False Positives (FP)** is the absolute priority, **Random Forest** and **XGBoost** are preferable due to their lowest FP values (**16**).
- However, **ANN** provides the best tradeoff between accuracy, F1 score, and error rate, making it the recommended model if you can tolerate slightly higher False Positives.


> # Output as CSV
     Predicted based on best of the best

In [221]:
# پیش‌بینی‌های احتمال از مدل ANN
y_test_pred_prob = model_ann.predict(X_test_scaled)

# تبدیل پیش‌بینی‌ها به صفر و یک با استفاده از آستانه 0.5
y_test_pred_binary = (y_test_pred_prob > 0.5).astype(int)

# اضافه کردن پیش‌بینی‌های باینری به داده‌های تست
test_data['Loan_Status_pred'] = y_test_pred_binary  # پیش‌بینی‌های باینری در ستون جدید

# ذخیره داده‌های به‌روزرسانی شده در یک فایل جدید
test_data.to_csv(r'd:\DaneshKar\Project_Eden\Clean_Test_Updated_best.csv', index=False)



[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
   Dependents  Education  Credit_History  Property_Area  Gender_Male  \
0           0          1             1.0              2            1   
1           1          1             1.0              2            1   
2           2          1             1.0              2            1   
3           2          1             1.0              2            1   
4           0          0             1.0              2            1   

   Married_Yes  Self_Employed_Yes  Log_ApplicantIncome  Log_CoapplicantIncome  \
0            1                  0             8.651899               0.000000   
1            1                  0             8.031710               7.313887   
2            1                  0             8.517393               7.496097   
3            1                  0             7.758333               7.842671   
4            0                  0             8.094684               0.000000   

   Log

In [222]:
test_data.head(20)

Unnamed: 0,Dependents,Education,Credit_History,Property_Area,Gender_Male,Married_Yes,Self_Employed_Yes,Log_ApplicantIncome,Log_CoapplicantIncome,Log_LoanAmount,BoxCox_Loan_Amount_Term,Loan_Status_pred
0,0,1,1.0,2,1,1,0,8.651899,0.0,4.70953,17572780.0,1
1,1,1,1.0,2,1,1,0,8.03171,7.313887,4.844187,17572780.0,1
2,2,1,1.0,2,1,1,0,8.517393,7.496097,5.342334,17572780.0,1
3,2,1,1.0,2,1,1,0,7.758333,7.842671,4.615121,17572780.0,1
4,0,0,1.0,2,1,0,0,8.094684,0.0,4.369448,17572780.0,1
5,0,0,1.0,2,1,1,1,7.680637,8.138273,5.030438,17572780.0,1
6,1,0,1.0,1,0,0,0,7.708411,0.0,4.094345,17572780.0,1
7,2,0,0.0,0,1,1,0,8.264106,0.0,4.997212,17572780.0,0
8,2,1,1.0,2,1,1,0,9.520322,0.0,5.638355,5185310.0,1
9,0,0,1.0,1,1,0,0,7.783641,7.783641,4.820282,17572780.0,1
