# Library Importation

In [1]:
#Importing necessary libraries
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import os,sys
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Data Importation

In [5]:
# Load the loan-approval dataset from a CSV file in the current working directory.
# NOTE(review): later cells strip spaces from the column names, which suggests the
# CSV header is space-padded — confirm against the file.
ICA = pd.read_csv("loan_approval_dataset.csv")

# Initial Data Analysis

In [None]:
# Preview the first 10 rows of the dataset.
ICA.head(n=10)

In [None]:
# Display the shape of the dataset as (number of rows, number of columns).
ICA.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
ICA.describe()

In [None]:
# Count fully duplicated rows (every repeat after the first occurrence is counted).
ICA.duplicated().sum()

In [None]:
# Print the set of distinct values of every column, each under a banner naming the column.
for column_name in ICA.columns:
    distinct_values = set(ICA[column_name].tolist())
    print("*******************************************************************", column_name, "************************************************************************")
    print()
    print(distinct_values)
    print()

In [None]:
# List the column names exactly as loaded, before any clean-up.
ICA.columns

In [None]:
# Remove every space character from the column names (leading, trailing and internal).
ICA.columns = ICA.columns.str.replace(' ', '')

In [None]:
# Re-check the column names after the spaces have been removed.
ICA.columns

In [None]:
# Count how many rows fall into each class of the target variable 'loan_status'.
ICA["loan_status"].value_counts()

In [None]:
# 'loan_id' is only a row identifier, so it is dropped from the working frame.
ICA = ICA.drop(columns=['loan_id'])

In [None]:
# Show each column's dtype and non-null count, plus the frame's memory usage.
ICA.info()

In [None]:
# Numeric columns to profile. The list is declared in this cell because the
# cell runs before the visualisation cells that also define it; without it a
# fresh top-to-bottom run fails with NameError on `numerical_columns`.
numerical_columns = ['no_of_dependents', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
                      'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value']

# Calculate the interquartile range (Q3 - Q1) for each numerical column
Q1 = ICA[numerical_columns].quantile(0.25)
Q3 = ICA[numerical_columns].quantile(0.75)
IQR = Q3 - Q1

# Print the calculated IQR for each column
print("Interquartile Range (IQR):")
print(IQR)

# Outlier fences: values more than 1.5 * IQR below Q1 or above Q3
lower_threshold = Q1 - 1.5 * IQR
upper_threshold = Q3 + 1.5 * IQR

# Print the threshold values
print("\nLower Threshold for Outliers:")
print(lower_threshold)
print("\nUpper Threshold for Outliers:")
print(upper_threshold)


# Data Visualisation

In [None]:
# Grid of pairwise scatter plots (with per-column distributions on the diagonal)
# for the numeric columns of the dataset.
sns.pairplot(ICA)
plt.show()

In [None]:
# Bar chart of the number of rows in each loan_status class.
temp = ICA["loan_status"].value_counts()
temp_df = pd.DataFrame(dict(loan_status=temp.index, values=temp.values))
sns.barplot(data=temp_df, x='loan_status', y='values')
plt.show()

In [None]:
# Trim leading/trailing whitespace from every column name.
ICA.columns = list(map(str.strip, ICA.columns))

# Feature frame without the target column.
independent_attributes = ICA.drop(columns=['loan_status'])

# Features shown in the pair plot (adjust as needed).
subset_features = ['no_of_dependents', 'income_annum', 'loan_amount', 'cibil_score']

# Selected features side by side with the target, which drives the colouring.
subset_data = pd.concat(
    [independent_attributes[subset_features], ICA['loan_status']],
    axis=1,
)

# Pair plot with one colour and one marker per loan_status class.
sns.pairplot(data=subset_data, hue='loan_status', markers=['o', 's'], palette='viridis')
plt.suptitle('Pair Plot of Selected Independent Attributes')
plt.show()


In [None]:
# One box plot per selected feature, split by loan_status class.
for feature in subset_features:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(data=ICA, x='loan_status', y=feature, ax=ax)
    ax.set_title(f'Box Plot of {feature} by Loan Status')
    plt.show()

In [None]:
# Distribution of each numerical column, drawn to judge how close each one is
# to a normal distribution.
numerical_columns = [
    'no_of_dependents', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
    'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
]

# One histogram (30 bins) with a KDE overlay per column.
for column in numerical_columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(ICA[column], kde=True, bins=30, color='darkgreen', ax=ax)
    ax.set(title=f'Distribution of {column}', xlabel=column, ylabel='Frequency')
    plt.show()

In [None]:
numerical_columns = [
    'no_of_dependents', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
    'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
]

# One horizontal box plot per numerical column to make outliers visible.
for column in numerical_columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(x=ICA[column], color='darkgreen', ax=ax)
    ax.set(title=f'Box Plot of {column}', xlabel=column)
    plt.show()


In [None]:
numerical_columns = [
    'no_of_dependents', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
    'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
]

# All numerical features in a single grid of 20-bin histograms.
numeric_frame = ICA[numerical_columns]
numeric_frame.hist(bins=20, figsize=(15, 10))
plt.suptitle('Histograms of Numerical Features')
plt.show()


# Data Cleaning

In [None]:
# Replace the text categories with integer codes. A single encoder is re-fitted
# for each column in turn, so after this cell it holds the classes of
# 'loan_status' (the last column encoded).
label_encoder = LabelEncoder()
for categorical_column in ('self_employed', 'education', 'loan_status'):
    ICA[categorical_column] = label_encoder.fit_transform(ICA[categorical_column])

In [None]:
# Preview the first 10 rows after the categorical columns were integer-encoded.
ICA.head(n=10)

# Normalisation

In [None]:
# Every column except the target 'loan_status' is rescaled with min-max scaling.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed in a later cell — confirm this is acceptable for the evaluation.
numerical_features = ICA.columns[ICA.columns != 'loan_status'].tolist()

# Fit the scaler and overwrite the feature columns with their scaled values.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(ICA[numerical_features])
ICA[numerical_features] = scaled_values

In [None]:
# Preview the first 10 rows after min-max normalisation of the feature columns.
ICA.head(n=10)

# Correlation Analysis


In [None]:
# Feature columns: everything except the target 'loan_status'.
numerical_features = [col for col in ICA.columns if col != 'loan_status']
numerical_data = ICA[numerical_features]

# Pairwise correlation between the features.
corr_all = numerical_data.corr()

# Annotated heatmap on a fixed [-1, 1] colour scale.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr_all, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
plt.show()


In [None]:
# Print the feature correlation matrix as text.
print(corr_all)

In [None]:
# Report feature pairs whose absolute correlation exceeds the threshold.
threshold = 0.7

# Walk the upper triangle of the matrix only, so each pair is reported once
# and the diagonal (self-correlation) is skipped.
feature_names = corr_all.columns
n_features = len(feature_names)
highly_correlated_pairs = [
    (feature_names[i], feature_names[j])
    for i in range(n_features)
    for j in range(i + 1, n_features)
    if abs(corr_all.iloc[i, j]) > threshold
]

if highly_correlated_pairs:
    print("Highly correlated feature pairs:")
    for pair in highly_correlated_pairs:
        print(pair)
else:
    print("No highly correlated features found.")

# Data Splitting

In [None]:
# y is the target column; X holds every other column as features.
y = ICA['loan_status']
X = ICA.drop(columns=['loan_status'])

# 80% training / 20% testing, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of each part.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# LOGISTIC REGRESSION


In [None]:
from sklearn.preprocessing import StandardScaler

# Target column and a separate copy of the feature columns.
y = ICA['loan_status']
x = ICA.drop(columns=['loan_status'])

# Only these columns are standardised; the encoded categorical columns
# ('education', 'self_employed') are not in the list and stay as they are.
numerical_columns = [
    'no_of_dependents', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
    'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value',
    'bank_asset_value',
]

# Standardise the listed columns (zero mean, unit variance) in the feature copy.
scaler = StandardScaler()
standardised_values = scaler.fit_transform(x[numerical_columns])
x[numerical_columns] = standardised_values

# Show the first rows of the scaled features and of the target.
print("Scaled Feature Variables (x):")
print(x.head())

print("\nTarget Variable (y):")
print(y.head())

In [None]:
# 80/20 train/test split of the standardised features `x`; the fixed seed keeps it reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Hold out 20% of the standardised data for testing (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit logistic regression on the training part and predict the held-out part.
logistic_reg = LogisticRegression(random_state=42)
logistic_reg.fit(x_train, y_train)
y_pred = logistic_reg.predict(x_test)

# Accuracy plus support-weighted precision, recall and F1.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print("Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Per-class breakdown.
classification_rep = classification_report(y_test, y_pred)
print("\nClassification Report:\n", classification_rep)

# Confusion matrix heatmap, with the model's class labels on both axes.
conf_matrix = confusion_matrix(y_test, y_pred)
class_labels = logistic_reg.classes_

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='viridis',
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()



In [None]:
from sklearn.metrics import auc, roc_curve
import matplotlib.pyplot as plt

# Predicted probability of class 1 for every test row.
y_prob = logistic_reg.predict_proba(x_test)[:, 1]

# ROC curve points and the area under the curve.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# ROC curve against the diagonal of a random classifier.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Guess')
ax.set(
    xlabel='False Positive Rate (FPR)',
    ylabel='True Positive Rate (TPR) / Recall / Sensitivity',
    title='Receiver Operating Characteristic (ROC) Curve',
)
ax.legend(loc='lower right')
plt.show()


In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of logistic regression on the features X.
model = LogisticRegression(random_state=42)
accuracies = cross_val_score(model, X, y, cv=5, scoring='accuracy')

# Accuracy of each fold.
print("Cross-validated Accuracy for each fold:", accuracies)

# Centre and spread across the folds.
mean_accuracy = accuracies.mean()
std_accuracy = accuracies.std()
print("Mean Cross-validated Accuracy:", mean_accuracy)
print("Standard Deviation of Cross-validated Accuracy:", std_accuracy)

# Worst and best fold.
min_accuracy = accuracies.min()
max_accuracy = accuracies.max()
print("Minimum Cross-validated Accuracy:", min_accuracy)
print("Maximum Cross-validated Accuracy:", max_accuracy)



In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of logistic regression on the features X.
model = LogisticRegression(random_state=42)
accuracies = cross_val_score(model, X, y, cv=5, scoring='accuracy')

# Centre and spread across the folds.
mean_accuracy = accuracies.mean()
std_accuracy = accuracies.std()

# Histogram of the fold accuracies with the mean marked by a dashed line.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(accuracies, bins=10, kde=True, color='skyblue', ax=ax)
ax.axvline(x=mean_accuracy, color='red', linestyle='--',
           label=f'Mean Accuracy: {mean_accuracy:.3f}\nStd Deviation: {std_accuracy:.3f}')
ax.set(title='Distribution of Cross-validated Accuracies', xlabel='Accuracy', ylabel='Frequency')
ax.legend()
plt.show()


# DECISION TREE

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Fit a decision tree on the training part and predict the held-out part.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(x_train, y_train)
y_pred = decision_tree.predict(x_test)

# Accuracy plus support-weighted precision, recall and F1.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Per-class breakdown and raw confusion counts.
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Confusion matrix heatmap (swap 'viridis' for any preferred colormap).
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='viridis', ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()

# Headline metrics, one per line.
print("Decision Tree Classifier Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Full classification report.
print("Classification Report:\n", classification_rep)



In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

# Fit a decision tree that selects splits with the entropy criterion.
decision_tree_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)
decision_tree_entropy.fit(X_train, y_train)

# plot_tree expects class_names in ascending order of the encoded labels.
# 'loan_status' was encoded with LabelEncoder, which numbers classes in sorted
# order, so (assuming the raw labels are 'Approved'/'Rejected', possibly
# space-padded) 0 is Approved and 1 is Rejected. Listing them the other way
# round labels every node with the opposite outcome.
class_names = ['Approved', 'Rejected']

# Pass the feature names as a plain list: some scikit-learn releases reject a
# pandas Index for this parameter.
feature_names = list(X_train.columns)

# Plot the fitted tree.
plt.figure(figsize=(15, 10))
plot_tree(decision_tree_entropy, feature_names=feature_names, class_names=class_names, filled=True, rounded=True)
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# 5-fold cross-validated accuracy of a decision tree on the features X.
# The scores are computed once and reused for both the printed summary and the
# histogram; with a fixed random_state and cv=5 a second run would reproduce
# the same numbers while fitting five more trees.
decision_tree = DecisionTreeClassifier(random_state=42)
accuracies_decision_tree = cross_val_score(decision_tree, X, y, cv=5, scoring='accuracy')

# Print the accuracy for each fold
print("Cross-validated Accuracy for each fold:", accuracies_decision_tree)

# Calculate mean and standard deviation
mean_accuracy_decision_tree = np.mean(accuracies_decision_tree)
std_accuracy_decision_tree = np.std(accuracies_decision_tree)

# Print mean and standard deviation
print("Mean Cross-validated Accuracy (Decision Tree):", mean_accuracy_decision_tree)
print("Standard Deviation of Cross-validated Accuracy (Decision Tree):", std_accuracy_decision_tree)

# Worst and best fold
min_accuracy_decision_tree = np.min(accuracies_decision_tree)
max_accuracy_decision_tree = np.max(accuracies_decision_tree)

print("Minimum Cross-validated Accuracy (Decision Tree):", min_accuracy_decision_tree)
print("Maximum Cross-validated Accuracy (Decision Tree):", max_accuracy_decision_tree)

# Plot the distribution of the fold accuracies using a histogram
plt.figure(figsize=(8, 6))
sns.histplot(accuracies_decision_tree, bins=10, kde=True, color='skyblue')
plt.title('Distribution of Cross-validated Accuracies (Decision Tree)')
plt.xlabel('Accuracy')
plt.ylabel('Frequency')
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of logistic regression on the features X.
model = LogisticRegression(random_state=42)
accuracies = cross_val_score(model, X, y, cv=5, scoring='accuracy')

# Centre and spread across the folds.
mean_accuracy = accuracies.mean()
std_accuracy = accuracies.std()

# Density-scaled step histogram of the fold accuracies, mean marked in red.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(accuracies, bins=10, kde=True, color='skyblue', element='step', stat='density', ax=ax)
ax.axvline(x=mean_accuracy, color='red', linestyle='--',
           label=f'Mean Accuracy: {mean_accuracy:.3f}\nStd Deviation: {std_accuracy:.3f}')
ax.set(
    title='Distribution of Cross-validated Accuracies (Logistic Regression)',
    xlabel='Accuracy',
    ylabel='Density',
)
ax.legend()
plt.show()


In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns

# Fit a support vector classifier on the training part and predict the test part.
svm_classifier = SVC(random_state=42)
svm_classifier.fit(x_train, y_train)
y_pred = svm_classifier.predict(x_test)

# Accuracy plus support-weighted precision, recall and F1.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Per-class breakdown and raw confusion counts.
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Confusion matrix heatmap (swap 'viridis' for any preferred colormap).
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='viridis', ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()

# Headline metrics, one per line.
print("SVM Classifier Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Full classification report.
print("Classification Report:\n", classification_rep)



In [None]:
from sklearn.metrics import auc, roc_curve
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# Re-fit the support vector classifier on the training part.
svm_classifier = SVC(random_state=42)
svm_classifier.fit(x_train, y_train)

# decision_function returns a decision score per test row, not a probability;
# roc_curve only needs scores that rank the rows, so they are used directly.
y_prob = svm_classifier.decision_function(x_test)

# ROC curve points and the area under the curve.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# ROC curve against the diagonal of a random classifier.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Guess')
ax.set(
    xlabel='False Positive Rate (FPR)',
    ylabel='True Positive Rate (TPR) / Recall / Sensitivity',
    title='Receiver Operating Characteristic (ROC) Curve',
)
ax.legend(loc='lower right')
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# 5-fold cross-validated accuracy of a support vector classifier on the features X.
svm_classifier = SVC(random_state=42)
accuracies_svm = cross_val_score(svm_classifier, X, y, cv=5, scoring='accuracy')

# Centre and spread across the folds.
mean_accuracy_svm = accuracies_svm.mean()
std_accuracy_svm = accuracies_svm.std()

# Density-scaled step histogram of the fold accuracies, mean marked in red.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(accuracies_svm, bins=10, kde=True, color='skyblue', element='step', stat='density', ax=ax)
ax.axvline(x=mean_accuracy_svm, color='red', linestyle='--',
           label=f'Mean Accuracy: {mean_accuracy_svm:.3f}\nStd Deviation: {std_accuracy_svm:.3f}')
ax.set(
    title='Distribution of Cross-validated Accuracies (SVM)',
    xlabel='Accuracy',
    ylabel='Density',
)
ax.legend()
plt.show()


In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Fit a 5-nearest-neighbours classifier on the training part and predict the test part.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(x_train, y_train)
y_pred = knn_classifier.predict(x_test)

# Accuracy plus support-weighted precision, recall and F1.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Per-class breakdown and raw confusion counts.
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Confusion matrix heatmap (swap 'viridis' for any preferred colormap).
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='viridis', ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()

# Headline metrics, one per line.
print("K-Nearest Neighbors Classifier Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Full classification report.
print("Classification Report:\n", classification_rep)


In [None]:
from sklearn.metrics import auc, roc_curve
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# Re-fit the 5-nearest-neighbours classifier on the training part.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(x_train, y_train)

# Predicted probability of class 1 for every test row.
y_prob = knn_classifier.predict_proba(x_test)[:, 1]

# ROC curve points and the area under the curve.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# ROC curve against the diagonal of a random classifier.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Guess')
ax.set(
    xlabel='False Positive Rate (FPR)',
    ylabel='True Positive Rate (TPR) / Recall / Sensitivity',
    title='Receiver Operating Characteristic (ROC) Curve',
)
ax.legend(loc='lower right')
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# 5-fold cross-validated accuracy of a 5-nearest-neighbours classifier on the features X.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
accuracies_knn = cross_val_score(knn_classifier, X, y, cv=5, scoring='accuracy')

# Centre and spread across the folds.
mean_accuracy_knn = accuracies_knn.mean()
std_accuracy_knn = accuracies_knn.std()

# Density-scaled step histogram of the fold accuracies, mean marked in red.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(accuracies_knn, bins=10, kde=True, color='skyblue', element='step', stat='density', ax=ax)
ax.axvline(x=mean_accuracy_knn, color='red', linestyle='--',
           label=f'Mean Accuracy: {mean_accuracy_knn:.3f}\nStd Deviation: {std_accuracy_knn:.3f}')
ax.set(
    title='Distribution of Cross-validated Accuracies (K-Nearest Neighbors)',
    xlabel='Accuracy',
    ylabel='Density',
)
ax.legend()
plt.show()


In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Fit an XGBoost classifier on the training part and predict the test part.
xgb_classifier = XGBClassifier(random_state=42)
xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

# Accuracy plus support-weighted precision, recall and F1.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Per-class breakdown and raw confusion counts.
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Confusion matrix heatmap using the 'viridis' colormap.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='viridis', ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()

# Headline metrics, one per line.
print("XGBoost Classifier Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Full classification report.
print("Classification Report:\n", classification_rep)


In [None]:
from sklearn.metrics import auc, roc_curve
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

# Re-fit the XGBoost classifier on the training part.
xgb_classifier = XGBClassifier(random_state=42)
xgb_classifier.fit(x_train, y_train)

# Predicted probability of class 1 for every test row.
y_prob = xgb_classifier.predict_proba(x_test)[:, 1]

# ROC curve points and the area under the curve.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# ROC curve against the diagonal of a random classifier.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Guess')
ax.set(
    xlabel='False Positive Rate (FPR)',
    ylabel='True Positive Rate (TPR) / Recall / Sensitivity',
    title='Receiver Operating Characteristic (ROC) Curve',
)
ax.legend(loc='lower right')
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# 5-fold cross-validated accuracy of an XGBoost classifier on the features X.
xgb_classifier = XGBClassifier(random_state=42)
accuracies_xgb = cross_val_score(xgb_classifier, X, y, cv=5, scoring='accuracy')

# Centre and spread across the folds.
mean_accuracy_xgb = accuracies_xgb.mean()
std_accuracy_xgb = accuracies_xgb.std()

# Density-scaled step histogram of the fold accuracies, mean marked in red.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(accuracies_xgb, bins=10, kde=True, color='skyblue', element='step', stat='density', ax=ax)
ax.axvline(x=mean_accuracy_xgb, color='red', linestyle='--',
           label=f'Mean Accuracy: {mean_accuracy_xgb:.3f}\nStd Deviation: {std_accuracy_xgb:.3f}')
ax.set(
    title='Distribution of Cross-validated Accuracies (XGBoost)',
    xlabel='Accuracy',
    ylabel='Density',
)
ax.legend()
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
import seaborn as sns

# 80/20 split of the features X with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit Gaussian Naive Bayes on the training part and predict the test part.
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)

# Raw confusion counts.
conf_matrix = confusion_matrix(y_test, y_pred)

# Confusion matrix heatmap.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='viridis', ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix (Naive Bayes)')
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

# 5-fold cross-validated accuracy of Gaussian Naive Bayes on the features X.
nb_classifier = GaussianNB()
accuracies_nb = cross_val_score(nb_classifier, X, y, cv=5, scoring='accuracy')

# Centre and spread across the folds.
mean_accuracy_nb = accuracies_nb.mean()
std_accuracy_nb = accuracies_nb.std()

# Density-scaled step histogram of the fold accuracies, mean marked in red.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(accuracies_nb, bins=10, kde=True, color='skyblue', element='step', stat='density', ax=ax)
ax.axvline(x=mean_accuracy_nb, color='red', linestyle='--',
           label=f'Mean Accuracy: {mean_accuracy_nb:.3f}\nStd Deviation: {std_accuracy_nb:.3f}')
ax.set(
    title='Distribution of Cross-validated Accuracies (Naive Bayes)',
    xlabel='Accuracy',
    ylabel='Density',
)
ax.legend()
plt.show()


In [None]:
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt

# 80/20 split of the features X with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit Gaussian Naive Bayes on the training part.
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

# Predicted probability of class 1 for every test row.
y_prob = nb_classifier.predict_proba(X_test)[:, 1]

# ROC curve points and the area under the curve.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# ROC curve against the (unlabelled) diagonal of a random classifier.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {roc_auc:.2f}')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set(
    xlabel='False Positive Rate (FPR)',
    ylabel='True Positive Rate (TPR)',
    title='ROC Curve (Naive Bayes)',
)
ax.legend(loc='lower right')
plt.show()
