## Phase 1

In [1]:
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import numpy as np

import seaborn as sns
from scipy.stats import skew

## Data
https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction

In [2]:
# Relative locations of the two Kaggle CSV exports read by the next cell.
application_file, credit_file = (
    r'data/application_record.csv',
    r'data/credit_record.csv',
)

In [3]:
# Read both raw tables, then print each one's (rows, columns) shape.
df_application, df_credit = (
    pd.read_csv(csv_path) for csv_path in (application_file, credit_file)
)
for raw_frame in (df_application, df_credit):
    print(raw_frame.shape)

(438557, 18)
(1048575, 3)


In [4]:
# Show column names, non-null counts and dtypes of the application table.
df_application.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438557 non-null  int64  
 1   CODE_GENDER          438557 non-null  object 
 2   FLAG_OWN_CAR         438557 non-null  object 
 3   FLAG_OWN_REALTY      438557 non-null  object 
 4   CNT_CHILDREN         438557 non-null  int64  
 5   AMT_INCOME_TOTAL     438557 non-null  float64
 6   NAME_INCOME_TYPE     438557 non-null  object 
 7   NAME_EDUCATION_TYPE  438557 non-null  object 
 8   NAME_FAMILY_STATUS   438557 non-null  object 
 9   NAME_HOUSING_TYPE    438557 non-null  object 
 10  DAYS_BIRTH           438557 non-null  int64  
 11  DAYS_EMPLOYED        438557 non-null  int64  
 12  FLAG_MOBIL           438557 non-null  int64  
 13  FLAG_WORK_PHONE      438557 non-null  int64  
 14  FLAG_PHONE           438557 non-null  int64  
 15  FLAG_EMAIL       

In [5]:
# Show column names, non-null counts and dtypes of the credit-record table.
df_credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   ID              1048575 non-null  int64 
 1   MONTHS_BALANCE  1048575 non-null  int64 
 2   STATUS          1048575 non-null  object
dtypes: int64(2), object(1)
memory usage: 24.0+ MB


### 1.Removing duplicates

In [6]:
# Drop fully duplicated rows from both tables and renumber the rows 0..n-1
# (drop=True discards the old index instead of materialising an 'index' column).
df_application = df_application.drop_duplicates().reset_index(drop=True)
df_credit = df_credit.drop_duplicates().reset_index(drop=True)

### 2.Adding new columns

In [7]:
# DAYS_BIRTH is divided by -365.25 and truncated to give AGE in whole years.
df_application['AGE'] = df_application['DAYS_BIRTH'].div(-365.25).astype(int)

In [8]:
# Summary statistics of the derived AGE column.
df_application['AGE'].describe()

count    438557.000000
mean         43.295569
std          11.459525
min          20.000000
25%          34.000000
50%          42.000000
75%          53.000000
max          68.000000
Name: AGE, dtype: float64

In [9]:
# Distinct occupation values among rows whose DAYS_EMPLOYED equals the value 365243.
df_application.loc[df_application['DAYS_EMPLOYED'] == 365243, 'OCCUPATION_TYPE'].drop_duplicates()

7    NaN
Name: OCCUPATION_TYPE, dtype: object

### 3.Filtering out rows based on value

In [10]:
# Overwrite the 365243 placeholder in DAYS_EMPLOYED with 0; all other values are kept.
is_placeholder = df_application['DAYS_EMPLOYED'] == 365243
df_application['DAYS_EMPLOYED'] = df_application['DAYS_EMPLOYED'].mask(is_placeholder, 0)

In [11]:
# Ascending view of DAYS_EMPLOYED to check the value range after the replacement above.
df_application['DAYS_EMPLOYED'].sort_values()

102186   -17531
102185   -17531
102187   -17531
102188   -17531
102189   -17531
          ...  
174975        0
174974        0
174973        0
156723        0
268401        0
Name: DAYS_EMPLOYED, Length: 438557, dtype: int64

In [12]:
# DAYS_EMPLOYED is divided by -365.25 and truncated to give whole years of experience.
df_application['TOTAL_EXPERIENCE'] = df_application['DAYS_EMPLOYED'].div(-365.25).astype(int)

### 4.Filling in missing values

In [13]:
# Fill missing occupations with the "No Job" label and assign the result back.
# Calling fillna(inplace=True) on the df_application['OCCUPATION_TYPE'] selection is a
# chained in-place update: it is deprecated in pandas 2.x and leaves the parent frame
# unchanged once Copy-on-Write is enabled, so the explicit assignment is required.
# NOTE(review): every missing occupation gets "No Job"; presumably this also covers
# employed applicants whose occupation was simply not recorded — confirm.
df_application['OCCUPATION_TYPE'] = df_application['OCCUPATION_TYPE'].fillna("No Job")

In [14]:
# Distinct occupation labels after filling the missing values.
df_application['OCCUPATION_TYPE'].unique()

array(['No Job', 'Security staff', 'Sales staff', 'Accountants',
       'Laborers', 'Managers', 'Drivers', 'Core staff',
       'High skill tech staff', 'Cleaning staff', 'Private service staff',
       'Cooking staff', 'Low-skill Laborers', 'Medicine staff',
       'Secretaries', 'Waiters/barmen staff', 'HR staff', 'Realty agents',
       'IT staff'], dtype=object)

In [15]:
# Number of rows per occupation label, most frequent first.
df_application['OCCUPATION_TYPE'].value_counts()

No Job                   134203
Laborers                  78240
Core staff                43007
Sales staff               41098
Managers                  35487
Drivers                   26090
High skill tech staff     17289
Accountants               15985
Medicine staff            13520
Cooking staff              8076
Security staff             7993
Cleaning staff             5845
Private service staff      3456
Low-skill Laborers         2140
Secretaries                2044
Waiters/barmen staff       1665
Realty agents              1041
HR staff                    774
IT staff                    604
Name: OCCUPATION_TYPE, dtype: int64

##### All the other columns have valid values without nulls

In [16]:
# Count rows by their per-column null pattern; a single all-False pattern means
# no column contains missing values.
df_application.isna().value_counts()

ID     CODE_GENDER  FLAG_OWN_CAR  FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  NAME_INCOME_TYPE  NAME_EDUCATION_TYPE  NAME_FAMILY_STATUS  NAME_HOUSING_TYPE  DAYS_BIRTH  DAYS_EMPLOYED  FLAG_MOBIL  FLAG_WORK_PHONE  FLAG_PHONE  FLAG_EMAIL  OCCUPATION_TYPE  CNT_FAM_MEMBERS  AGE    TOTAL_EXPERIENCE
False  False        False         False            False         False             False             False                False               False              False       False          False       False            False       False       False            False            False  False               438557
dtype: int64

In [17]:
# Column dtypes before the categorical encoding step.
df_application.dtypes

ID                       int64
CODE_GENDER             object
FLAG_OWN_CAR            object
FLAG_OWN_REALTY         object
CNT_CHILDREN             int64
AMT_INCOME_TOTAL       float64
NAME_INCOME_TYPE        object
NAME_EDUCATION_TYPE     object
NAME_FAMILY_STATUS      object
NAME_HOUSING_TYPE       object
DAYS_BIRTH               int64
DAYS_EMPLOYED            int64
FLAG_MOBIL               int64
FLAG_WORK_PHONE          int64
FLAG_PHONE               int64
FLAG_EMAIL               int64
OCCUPATION_TYPE         object
CNT_FAM_MEMBERS        float64
AGE                      int64
TOTAL_EXPERIENCE         int64
dtype: object

### 5.Conversion of string data to number/data values
###### making sure all the columns are strings before applying encoding

In [18]:
# Cast each categorical column to str so the label-to-code mappings applied
# afterwards compare against plain strings.
for categorical_column in (
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
):
    df_application[categorical_column] = df_application[categorical_column].astype(str)

### 6.changing categorical columns using encoding

In [19]:
# Label -> integer code tables for the categorical columns.
# Each code is the label's position in the tuple it is enumerated from.
CODE_GENDER_CONVERSION = {label: code for code, label in enumerate(('F', 'M'))}
FLAG_OWN_CAR_CONVERSION = {label: code for code, label in enumerate(('N', 'Y'))}
FLAG_OWN_REALTY_CONVERSION = {label: code for code, label in enumerate(('N', 'Y'))}
NAME_INCOME_TYPE_CONVERSION = {
    label: code
    for code, label in enumerate(
        ('Commercial associate', 'Pensioner', 'State servant', 'Student', 'Working')
    )
}
NAME_FAMILY_STATUS_CONVERSION = {
    label: code
    for code, label in enumerate(
        ('Civil marriage', 'Married', 'Separated', 'Single / not married', 'Widow')
    )
}
OCCUPATION_TYPE_CONVERSION = {
    label: code
    for code, label in enumerate(
        (
            'Accountants', 'Cleaning staff', 'Cooking staff', 'Core staff', 'Drivers',
            'HR staff', 'High skill tech staff', 'IT staff', 'Laborers',
            'Low-skill Laborers', 'Managers', 'Medicine staff', 'Private service staff',
            'Realty agents', 'Sales staff', 'Secretaries', 'Security staff',
            'Waiters/barmen staff', 'No Job',
        )
    )
}
NAME_HOUSING_TYPE_CONVERSION = {
    label: code
    for code, label in enumerate(
        (
            'Co-op apartment', 'House / apartment', 'Municipal apartment',
            'Office apartment', 'Rented apartment', 'With parents',
        )
    )
}
NAME_EDUCATION_TYPE_CONVERSION = {
    label: code
    for code, label in enumerate(
        (
            'Academic degree', 'Higher education', 'Incomplete higher',
            'Lower secondary', 'Secondary / secondary special',
        )
    )
}

In [20]:
# Encode every categorical column with its label -> integer table.
#
# Series.map replaces the eight copy-pasted DataFrame.replace(..., inplace=True) calls:
#   * replace() silently keeps any label missing from the table, leaving a mixed
#     string/int column; map() turns it into NaN, which is reported loudly below.
#   * replace() relied on implicit object -> int downcasting, which pandas 2.2 deprecates.
# Columns that are already numeric are skipped so the cell can be re-run safely.
CATEGORY_ENCODINGS = {
    'CODE_GENDER': CODE_GENDER_CONVERSION,
    'FLAG_OWN_CAR': FLAG_OWN_CAR_CONVERSION,
    'FLAG_OWN_REALTY': FLAG_OWN_REALTY_CONVERSION,
    'NAME_INCOME_TYPE': NAME_INCOME_TYPE_CONVERSION,
    'NAME_FAMILY_STATUS': NAME_FAMILY_STATUS_CONVERSION,
    'OCCUPATION_TYPE': OCCUPATION_TYPE_CONVERSION,
    'NAME_HOUSING_TYPE': NAME_HOUSING_TYPE_CONVERSION,
    'NAME_EDUCATION_TYPE': NAME_EDUCATION_TYPE_CONVERSION,
}

for column_name, mapping in CATEGORY_ENCODINGS.items():
    column = df_application[column_name]
    if pd.api.types.is_numeric_dtype(column):
        continue  # already encoded (e.g. the cell was executed twice)
    encoded = column.map(mapping)
    if encoded.isna().any():
        unmapped = column[encoded.isna()].unique().tolist()
        raise ValueError(f"{column_name} contains labels with no encoding: {unmapped}")
    df_application[column_name] = encoded.astype('int64')

In [21]:
# Re-inspect dtypes and non-null counts after the categorical encoding.
df_application.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438557 non-null  int64  
 1   CODE_GENDER          438557 non-null  int64  
 2   FLAG_OWN_CAR         438557 non-null  int64  
 3   FLAG_OWN_REALTY      438557 non-null  int64  
 4   CNT_CHILDREN         438557 non-null  int64  
 5   AMT_INCOME_TOTAL     438557 non-null  float64
 6   NAME_INCOME_TYPE     438557 non-null  int64  
 7   NAME_EDUCATION_TYPE  438557 non-null  int64  
 8   NAME_FAMILY_STATUS   438557 non-null  int64  
 9   NAME_HOUSING_TYPE    438557 non-null  int64  
 10  DAYS_BIRTH           438557 non-null  int64  
 11  DAYS_EMPLOYED        438557 non-null  int64  
 12  FLAG_MOBIL           438557 non-null  int64  
 13  FLAG_WORK_PHONE      438557 non-null  int64  
 14  FLAG_PHONE           438557 non-null  int64  
 15  FLAG_EMAIL       

### 7.Creating target column

In [22]:
# Collapse the raw STATUS codes into three debt-quality buckets.
Status_definition = {
    status_code: bucket
    for bucket, status_codes in (
        ('Good_Debt', ('C', 'X', '0')),
        ('Neutral_Debt', ('1', '2')),
        ('Bad_Debt', ('3', '4', '5')),
    )
    for status_code in status_codes
}
df_credit = df_credit.replace({'STATUS': Status_definition})

In [23]:
# One row per ID, one column per STATUS bucket, each cell the number of records;
# combinations that never occur are filled with 0.
df_credit = df_credit.groupby(['ID', 'STATUS']).size().unstack(fill_value=0)

In [24]:
# Display the per-ID status-count table built above.
df_credit

STATUS,Bad_Debt,Good_Debt,Neutral_Debt
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5001711,0,4,0
5001712,0,19,0
5001713,0,22,0
5001714,0,15,0
5001715,0,60,0
...,...,...,...
5150482,0,18,0
5150483,0,18,0
5150484,0,13,0
5150485,0,2,0


In [None]:
# Derive the binary target CREDIT_APPROVAL_STATUS from the per-ID status counts.
#
# The earlier six sequential .loc assignments overlapped, so the outcome depended on
# statement order (later rules overwrote earlier ones), and an ID whose three counts
# were all equal matched no rule and was left as NaN, which scikit-learn classifiers
# reject as a target. The rule those assignments effectively produced is stated here
# in a single expression:
#   0 - Bad_Debt exceeds Good_Debt or exceeds Neutral_Debt
#   1 - otherwise
# A three-way tie (previously NaN) is resolved conservatively to 0, and the column is
# stored as int rather than float.
good_count, neutral_count, bad_count = (
    df_credit[bucket] for bucket in ('Good_Debt', 'Neutral_Debt', 'Bad_Debt')
)
bad_dominates = (bad_count > good_count) | (bad_count > neutral_count)
three_way_tie = (good_count == neutral_count) & (neutral_count == bad_count)
df_credit['CREDIT_APPROVAL_STATUS'] = (~bad_dominates & ~three_way_tie).astype(int)

In [None]:
# List the columns now present on the credit table.
df_credit.columns

### 8. Dropping unwanted columns

In [None]:
# The per-bucket counts are dropped here; the remaining columns are kept unchanged.
bucket_columns = ['Bad_Debt', 'Good_Debt', 'Neutral_Debt']
df_credit = df_credit.drop(columns=bucket_columns)

In [None]:
# Preview the first rows of the credit table after dropping the count columns.
df_credit.head()

### 9. Joining multiple datasets

In [None]:
# Inner join on ID: keeps only IDs present in both the credit and application tables.
merged_df = df_credit.merge(df_application, how='inner', on='ID')

In [None]:
# Non-null count per column of the merged table.
merged_df.count()

## Exploratory Data Analysis

### 1.Summary Statistics

In [None]:
# Summary statistics, transposed so each column of merged_df becomes a row.
merged_df.describe().T

### 2.Correlation Analysis

In [None]:
# Pairwise correlation matrix of the merged table (transposed).
merged_df.corr().T

### 3.Heatmap to show the correlation

In [None]:

# Compute the correlation matrix once and take the tick labels from that same matrix,
# so the labels always describe the rows/columns actually drawn (previously corr() was
# recomputed and the labels were read from merged_df.columns instead).
corr_matrix = merged_df.corr()

fig, ax = plt.subplots(figsize=(12, 10))

# Plot the heatmap on a fixed [-1, 1] colour scale
im = ax.imshow(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)

# Colorbar with its label
cbar = fig.colorbar(im, ax=ax)
cbar.ax.set_ylabel('Correlation', rotation=-90, va="bottom")

# Title and axis labels
ax.set_title('Correlation Heatmap')
ax.set_xlabel('Features')
ax.set_ylabel('Features')

# One tick per feature on both axes
tick_labels = list(corr_matrix.columns)
tick_positions = np.arange(len(tick_labels))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(tick_labels, rotation=45, ha="right")
ax.set_yticklabels(tick_labels)

plt.show()

##### FLAG_MOBIL does not have any impact on the data, so we drop that column

In [None]:
# Remove the FLAG_MOBIL column from the merged table.
merged_df = merged_df.drop(columns=['FLAG_MOBIL'])

In [None]:
# Scatter of income (x) against the approval flag (y).
fig, ax = plt.subplots()
ax.scatter(merged_df['AMT_INCOME_TOTAL'], merged_df['CREDIT_APPROVAL_STATUS'])
ax.set_xlabel('AMT_INCOME_TOTAL')
ax.set_ylabel('CREDIT_APPROVAL_STATUS')
ax.set_title('Relationship between AMT_INCOME_TOTAL and CREDIT_APPROVAL_STATUS ')

##### AMT_INCOME_TOTAL has an impact on the application approval status

In [None]:
# Scatter of the encoded gender (x) against the approval flag (y).
ax = merged_df.plot.scatter(x='CODE_GENDER', y='CREDIT_APPROVAL_STATUS')
ax.set_xlabel('CODE_GENDER')
ax.set_ylabel('CREDIT_APPROVAL_STATUS')
ax.set_title('Relationship between CODE_GENDER and CREDIT_APPROVAL_STATUS ')

##### CODE_GENDER does not have much influence on the approval status

### 4.Outlier Detection

In [None]:
# Box plot of CNT_CHILDREN to look for extreme values.
plt.boxplot(merged_df['CNT_CHILDREN'])

##### Removing the outliers from the cnt_children column

In [None]:
# Rows that the next cell removes: CNT_CHILDREN at or above the 17.5 cut-off.
merged_df[merged_df['CNT_CHILDREN']>=17.5]

In [None]:
# Keep only rows whose CNT_CHILDREN is below the 17.5 cut-off.
children_below_cutoff = merged_df['CNT_CHILDREN'] < 17.5
merged_df = merged_df.loc[children_below_cutoff]

In [None]:
# Box plot of CNT_FAM_MEMBERS to look for extreme values.
plt.boxplot(merged_df['CNT_FAM_MEMBERS'])

In [None]:
# Rows that the next cell removes: CNT_FAM_MEMBERS of 12 or more.
merged_df[merged_df['CNT_FAM_MEMBERS']>=12]

In [None]:
# Keep only rows with fewer than 12 family members.
family_below_cutoff = merged_df['CNT_FAM_MEMBERS'] < 12
merged_df = merged_df.loc[family_below_cutoff]

### DATA CLEANING
### 10. reset index

In [None]:
# Renumber rows 0..n-1 after the outlier filtering; drop=True discards the old index
# rather than adding it as an 'index' column.
merged_df = merged_df.reset_index(drop=True)

### EDA
### 5. Using bar chart to find distribution of categorical data

In [None]:
# Bar chart of how many rows fall on each CNT_CHILDREN value (most frequent first).
children_counts = merged_df['CNT_CHILDREN'].value_counts()
ax = children_counts.plot.bar()
ax.set_title('Distribution of No. of Children')
ax.set_xlabel('Number of Children')
ax.set_ylabel('Count')
ax.set_xticks(range(len(children_counts)))
ax.set_xticklabels(children_counts.index)

plt.show()
print("From the bar chart we have observed that there are compartively very less number of families with 3 or more children")

In [None]:
# Bar chart of how many rows fall on each encoded income type (most frequent first).
income_type_counts = merged_df['NAME_INCOME_TYPE'].value_counts()
ax = income_type_counts.plot.bar()
ax.set_title('Distribution of Income types')
ax.set_xlabel('Type of Income')
ax.set_ylabel('Count')
ax.set_xticks(range(len(income_type_counts)))
ax.set_xticklabels(income_type_counts.index)

plt.show()


### 6. Histogram to find the frequency of observations

In [None]:
# Histogram of AMT_INCOME_TOTAL with matplotlib's default binning.
fig, ax = plt.subplots()
ax.hist(merged_df['AMT_INCOME_TOTAL'])
ax.set_title('Income range graph')
ax.set_xlabel('Income obtained')
ax.set_ylabel('Count')

plt.show()

In [None]:
# Histogram of the encoded education type, followed by the code legend.
fig, ax = plt.subplots()
ax.hist(merged_df['NAME_EDUCATION_TYPE'], bins=10)
ax.set_title('Distribution of Education Types')
ax.set_xlabel('Education Type')
ax.set_ylabel('Count')

plt.show()

print(NAME_EDUCATION_TYPE_CONVERSION)
print("Applicants have mostly completed their secondary school, also some applicants have even completed their higher education")

### 7. Using skew to find the skewness in the data

In [None]:
# Histogram of the encoded occupation type with exactly one bin per code.
# The codes are consecutive integers, so ten equal-width bins would merge neighbouring
# occupations into the same bar; edges at code - 0.5 ... code + 0.5 keep them separate.
occupation_code_count = len(OCCUPATION_TYPE_CONVERSION)
occupation_bin_edges = np.arange(occupation_code_count + 1) - 0.5
plt.hist(merged_df['OCCUPATION_TYPE'], bins=occupation_bin_edges)

plt.title('Distribution of Occupation Types')

plt.xlabel('Occupation Type')
plt.ylabel('Count')
plt.xticks(range(occupation_code_count))

plt.show()

print(OCCUPATION_TYPE_CONVERSION,"\n")
print("There are high varieties of Occupation types in our dataset")

In [None]:
# Skewness of the encoded occupation column. The interpretation is now chosen from the
# sign of the computed value; previously the "negative" message was printed regardless.
occupation_skew = skew(merged_df['OCCUPATION_TYPE'])
print(occupation_skew)
if occupation_skew < 0:
    print("The skewness value is negative for occupation type. \nThis indicates that the distribution of data is negatively skewed, with a long tail towards the left. This means that the majority of the data is concentrated towards the right side of the distribution")
elif occupation_skew > 0:
    print("The skewness value is positive for occupation type. \nThis indicates that the distribution of data is positively skewed, with a long tail towards the right. This means that the majority of the data is concentrated towards the left side of the distribution")
else:
    print("The skewness value is zero for occupation type. \nThis indicates that the distribution of data is symmetric")

### 8. PIE Chart to find proportion of applicants

In [None]:
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue', 'orange']

# Wedge sizes come from value_counts(), which orders categories by frequency, while
# unique() orders them by first appearance. Taking the labels from the counts' own
# index keeps every wedge paired with its category. Codes are translated back to the
# original names for readability.
family_status_counts = merged_df['NAME_FAMILY_STATUS'].value_counts()
family_status_names = {code: name for name, code in NAME_FAMILY_STATUS_CONVERSION.items()}
family_status_labels = [family_status_names.get(code, code) for code in family_status_counts.index]

plt.pie(family_status_counts, labels=family_status_labels, colors=colors, autopct='%1.1f%%')

plt.title('Distribution of Family Status')
plt.show()
print("Around 70% of our applicatns are married")

In [None]:
# Six colours for the six housing codes, so the colour cycle does not wrap around and
# give the last wedge the same colour as the first, adjacent one.
colors = ['blue', 'red', 'skyblue', 'violet', 'green', 'orange']

# Labels are taken from the value_counts() index (frequency order) rather than unique()
# (first-appearance order) so each wedge carries its own category name.
housing_counts = merged_df['NAME_HOUSING_TYPE'].value_counts()
housing_names = {code: name for name, code in NAME_HOUSING_TYPE_CONVERSION.items()}
housing_labels = [housing_names.get(code, code) for code in housing_counts.index]

plt.pie(housing_counts, labels=housing_labels, colors=colors, autopct='%1.1f%%')

plt.title('Distribution of Hosuing types')
plt.show()
print(NAME_HOUSING_TYPE_CONVERSION)
# The summary is derived from the data; the former hard-coded sentence attributed the
# largest wedge to rented housing, which came from the mismatched labels.
top_housing_share = housing_counts.iloc[0] / housing_counts.sum()
print(f"Around {top_housing_share:.0%} of our applicants are living in housing type '{housing_labels[0]}'")

In [None]:
colors = ['Yellow', 'red', 'Black', 'violet', 'green','blue']

# Labels come from the value_counts() index so they line up with the wedge sizes
# (unique() returns first-appearance order, which does not match frequency order).
family_size_counts = merged_df['CNT_FAM_MEMBERS'].value_counts()
family_size_labels = [f"{size:g}" for size in family_size_counts.index]

plt.pie(family_size_counts, labels=family_size_labels, colors=colors, autopct='%1.1f%%')

# Title and summary previously referred to housing types / children (copy-paste);
# this chart shows CNT_FAM_MEMBERS.
plt.title('Distribution of Family Members')
plt.show()

top_family_share = family_size_counts.iloc[0] / family_size_counts.sum()
print(f"Around {top_family_share:.0%} of our applicants have {family_size_labels[0]} family members")

### 9. Line Graph to see the impact of one column on the other

In [None]:
# Line plot with the approval flag on x and AGE on y.
x, y = merged_df['CREDIT_APPROVAL_STATUS'], merged_df['AGE']
fig, ax = plt.subplots()
ax.plot(x, y)

# Title and axis labels
ax.set_title('AGE vs. CREDIT_APPROVAL_STATUS')
ax.set_xlabel('CREDIT_APPROVAL_STATUS')
ax.set_ylabel('Age')

plt.show()

print("Approval is not that much impacted by age")

In [None]:
# Line plot with the approval flag on x and TOTAL_EXPERIENCE on y.
x = merged_df['CREDIT_APPROVAL_STATUS']
y = merged_df['TOTAL_EXPERIENCE']
plt.plot(x, y)

# Add title and labels. The axis labels were swapped before: x carries
# CREDIT_APPROVAL_STATUS and y carries the experience values.
plt.title('Experience vs. CREDIT_APPROVAL_STATUS')
plt.xlabel('CREDIT_APPROVAL_STATUS')
plt.ylabel('Experience')

plt.show()

print("With increase in experience the approval had impact and people with more experience mostly got approval")

### 10. Violin graph to visualize distribution of numerical data across various categories

In [None]:
# Income distribution per encoded education type, followed by the code legend.
sns.violinplot(data=merged_df, y='AMT_INCOME_TOTAL', x='NAME_EDUCATION_TYPE')

print(NAME_EDUCATION_TYPE_CONVERSION)
print("\n \nThe applicants who have registered and not capable of completing higher have more salaries compared with other categories")

## Phase 2

In [None]:
# Columns available for modelling.
merged_df.columns

### Normalizing the data for all numerical columns

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the listed numeric columns of merged_df in place with a default StandardScaler.
# NOTE(review): the scaler is fitted on all of merged_df before the train/test split made
# in a later cell, so held-out rows contribute to the scaling statistics — consider
# fitting on the training rows only and transforming the test rows with that fit.
numerical_cols = ['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'DAYS_BIRTH', 
                  'DAYS_EMPLOYED', 'CNT_FAM_MEMBERS', 'AGE', 'TOTAL_EXPERIENCE']
scaler = StandardScaler()
merged_df[numerical_cols] = scaler.fit_transform(merged_df[numerical_cols])

### Importing all the required libraries for regression and plotting

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score,recall_score
from sklearn.metrics import confusion_matrix, classification_report, PrecisionRecallDisplay, RocCurveDisplay
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# 80/20 train/test split with a fixed seed; ID and the target are excluded from the features.
feature_frame = merged_df.drop(columns=['ID','CREDIT_APPROVAL_STATUS'])
target_series = merged_df['CREDIT_APPROVAL_STATUS']
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_series,
    test_size=0.20,
    random_state=90,
)

In [None]:
# Model name -> test-set accuracy, filled in as each model is evaluated.
accuracies = dict()

### Logistic Regression

In [None]:
# Logistic regression with a raised iteration cap, fitted on the training split.
logit = LogisticRegression(max_iter=10000)
logit.fit(X_train, y_train)

In [None]:
# Display the predicted class for each test row.
logit.predict(X_test)

In [None]:
# Hard class predictions on the test split and their accuracy.
y_pred = logit.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Store the accuracy under the model's name and echo it.
accuracies['Logistic Regression'] = score
print(score)

#### Creating a confusion matrix

In [None]:

# Confusion matrix of test labels vs. predictions, unpacked into its four cells.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Annotated heatmap of the matrix
labels = ['Negative', 'Positive']
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()


#### Precision-recall Graph
#### Precision measures the accuracy of positive predictions
#### recall measures the completeness of positive predictions

In [None]:
# Precision and recall of the hard class predictions
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print('Precision: ',precision)
print('Recall: ',recall)

# The precision-recall curve sweeps the decision threshold, so it needs continuous
# scores; hard 0/1 predictions collapse it to a single operating point. Use the
# predicted probability of the positive class instead.
y_score = logit.predict_proba(X_test)[:, 1]
disp = PrecisionRecallDisplay.from_predictions(y_test, y_score)

#### Pairplot

In [None]:
# Two-column frame (predictions, true labels) on the test index, shown as a pair plot.
graph_df = y_test.to_frame(name='y_test').assign(y_pred=y_pred)[['y_pred', 'y_test']]
sns.pairplot(graph_df)

#### Creating a classification report to compare y_test and y_pred

In [None]:
# Per-class precision/recall/F1; zero_division=1 reports 1 where a metric is undefined.
report_text = classification_report(y_test, y_pred, zero_division=1)
print(report_text)

#### auc - roc_curve 

In [None]:
# ROC curve from the predicted probability of the positive class. The probabilities
# get their own name: overwriting y_pred with them made the earlier metric cells
# (which expect hard class labels) fail or mislead when re-run out of order.
y_score = logit.predict_proba(X_test)[:, 1]
roc_display = RocCurveDisplay.from_predictions(y_test, y_score)

### Decision Trees

In [None]:
# Fixed seed (same value as the train/test split) so the fitted tree and the metrics
# reported below are reproducible across kernel restarts.
tree = DecisionTreeClassifier(random_state=90)

In [None]:
# Fit the decision tree on the training split.
tree.fit(X_train, y_train)

In [None]:
# Hard class predictions of the decision tree on the test split.
y_pred = tree.predict(X_test)

In [None]:
# Test accuracy of the decision tree, stored under the model's name and echoed.
score = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracies['DecisionTreeClassifier'] = score
print(score)

#### Creating a confusion matrix

In [None]:
# Confusion matrix of test labels vs. predictions, unpacked into its four cells.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Annotated heatmap of the matrix
labels = ['Negative', 'Positive']
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

#### Precision-recall Graph
#### Precision measures the accuracy of positive predictions
#### recall measures the completeness of positive predictions

In [None]:
# Precision and recall of the hard class predictions
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print('Precision: ',precision)
print('Recall: ',recall)

# The precision-recall curve sweeps the decision threshold, so it needs continuous
# scores; hard 0/1 predictions collapse it to a single operating point. Use the
# predicted probability of the positive class instead.
y_score = tree.predict_proba(X_test)[:, 1]
disp = PrecisionRecallDisplay.from_predictions(y_test, y_score)

#### Pairplot

In [None]:
# Two-column frame (predictions, true labels) on the test index, shown as a pair plot.
graph_df = y_test.to_frame(name='y_test').assign(y_pred=y_pred)[['y_pred', 'y_test']]
sns.pairplot(graph_df)

#### Creating a classification report to compare y_test and y_pred

In [None]:
# Per-class precision/recall/F1; zero_division=1 reports 1 where a metric is undefined.
report_text = classification_report(y_test, y_pred, zero_division=1)
print(report_text)

#### auc - roc_curve

In [None]:
# ROC curve from the predicted probability of the positive class. The probabilities
# get their own name: overwriting y_pred with them made the earlier metric cells
# (which expect hard class labels) fail or mislead when re-run out of order.
y_score = tree.predict_proba(X_test)[:, 1]
roc_display = RocCurveDisplay.from_predictions(y_test, y_score)

### Support vector machine

In [None]:
# probability=True calibrates probabilities with an internal, randomised
# cross-validation; the fixed seed (same value as the train/test split) keeps
# predict_proba reproducible across runs.
svm = SVC(kernel='linear', probability=True, random_state=90)

In [None]:
# Fit the support vector machine on the training split.
svm.fit(X_train, y_train)

In [None]:
# Hard class predictions of the support vector machine on the test split.
y_pred = svm.predict(X_test)

In [None]:
# Test accuracy of the SVM, stored under the model's name and echoed.
score = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracies['Support Vector Machine'] = score
print(score)

#### Creating a confusion matrix

In [None]:
# Confusion matrix of test labels vs. predictions, unpacked into its four cells.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Annotated heatmap of the matrix
labels = ['Negative', 'Positive']
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

#### Precision-recall Graph
#### Precision measures the accuracy of positive predictions
#### Recall measures the completeness of positive predictions

In [None]:
# Precision and recall of the hard class predictions
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print('Precision: ',precision)
print('Recall: ',recall)

# The precision-recall curve sweeps the decision threshold, so it needs continuous
# scores; hard 0/1 predictions collapse it to a single operating point. Use the
# predicted probability of the positive class instead.
y_score = svm.predict_proba(X_test)[:, 1]
disp = PrecisionRecallDisplay.from_predictions(y_test, y_score)

#### Pairplot

In [None]:
# Two-column frame (predictions, true labels) on the test index, shown as a pair plot.
graph_df = y_test.to_frame(name='y_test').assign(y_pred=y_pred)[['y_pred', 'y_test']]
sns.pairplot(graph_df)

#### Creating a classification report to compare y_test and y_pred

In [None]:
# Per-class precision/recall/F1; zero_division=1 reports 1 where a metric is undefined.
report_text = classification_report(y_test, y_pred, zero_division=1)
print(report_text)

#### auc - roc_curve

In [None]:
# ROC curve from the predicted probability of the positive class. The probabilities
# get their own name: overwriting y_pred with them made the earlier metric cells
# (which expect hard class labels) fail or mislead when re-run out of order.
y_score = svm.predict_proba(X_test)[:, 1]
roc_display = RocCurveDisplay.from_predictions(y_test, y_score)

### Random Forest

In [None]:
# Random forests bootstrap rows and subsample features at random; the fixed seed
# (same value as the train/test split) makes the reported metrics reproducible.
rfc = RandomForestClassifier(random_state=90)

In [None]:
# Fit the random forest on the training split.
rfc.fit(X_train, y_train)

In [None]:
# Hard class predictions of the random forest on the test split.
y_pred = rfc.predict(X_test)

In [None]:
# Test accuracy of the random forest, stored under the model's name and echoed.
score = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracies['RandomForestClassifier'] = score
print(score)

#### Creating a confusion matrix

In [None]:
# Confusion matrix of test labels vs. predictions, unpacked into its four cells.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Annotated heatmap of the matrix
labels = ['Negative', 'Positive']
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

#### Precision-recall Graph
#### Precision measures the accuracy of positive predictions
#### Recall measures the completeness of positive predictions

In [None]:
# Precision and recall of the hard class predictions
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print('Precision: ',precision)
print('Recall: ',recall)

# The precision-recall curve sweeps the decision threshold, so it needs continuous
# scores; hard 0/1 predictions collapse it to a single operating point. Use the
# predicted probability of the positive class instead.
y_score = rfc.predict_proba(X_test)[:, 1]
disp = PrecisionRecallDisplay.from_predictions(y_test, y_score)

#### Pairplot

In [None]:

# Two-column frame (predictions, true labels) on the test index, shown as a pair plot.
graph_df = y_test.to_frame(name='y_test').assign(y_pred=y_pred)[['y_pred', 'y_test']]
sns.pairplot(graph_df)

#### Creating a classification report to compare y_test and y_pred

In [None]:
# Per-class precision/recall/F1; zero_division=1 reports 1 where a metric is undefined.
report_text = classification_report(y_test, y_pred, zero_division=1)
print(report_text)

#### auc - roc_curve

In [None]:
# ROC curve from the predicted probability of the positive class. The probabilities
# get their own name: overwriting y_pred with them made the earlier metric cells
# (which expect hard class labels) fail or mislead when re-run out of order.
y_score = rfc.predict_proba(X_test)[:, 1]
roc_display = RocCurveDisplay.from_predictions(y_test, y_score)

### Gradient Boosting Machines (GBM)

In [None]:
# Fixed seed (same value as the train/test split) so the boosted model and the
# metrics reported below are reproducible across kernel restarts.
gb = GradientBoostingClassifier(random_state=90)

In [None]:
# Fit the gradient boosting model on the training split.
gb.fit(X_train, y_train)

In [None]:
# Hard class predictions of the gradient boosting model on the test split.
y_pred = gb.predict(X_test)

In [None]:
# Test accuracy of the gradient boosting model, stored under the model's name and echoed.
score = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracies['Gradient Boosting Machine'] = score
print(score)

#### Creating a confusion matrix

In [None]:
# Confusion matrix of test labels vs. predictions, unpacked into its four cells.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Annotated heatmap of the matrix
labels = ['Negative', 'Positive']
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

#### Precision-recall Graph
#### Precision measures the accuracy of positive predictions
#### Recall measures the completeness of positive predictions

In [None]:
# Precision and recall of the hard class predictions
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print('Precision: ',precision)
print('Recall: ',recall)

# The precision-recall curve sweeps the decision threshold, so it needs continuous
# scores; hard 0/1 predictions collapse it to a single operating point. Use the
# predicted probability of the positive class instead.
y_score = gb.predict_proba(X_test)[:, 1]
disp = PrecisionRecallDisplay.from_predictions(y_test, y_score)

#### Pairplot

In [None]:
# Two-column frame (predictions, true labels) on the test index, shown as a pair plot.
graph_df = y_test.to_frame(name='y_test').assign(y_pred=y_pred)[['y_pred', 'y_test']]
sns.pairplot(graph_df)

#### Creating a classification report to compare y_test and y_pred

In [None]:
# Per-class precision/recall/F1; zero_division=1 reports 1 where a metric is undefined.
report_text = classification_report(y_test, y_pred, zero_division=1)
print(report_text)

#### auc - roc_curve 

In [None]:
# ROC curve from the predicted probability of the positive class. The probabilities
# get their own name: overwriting y_pred with them made the earlier metric cells
# (which expect hard class labels) fail or mislead when re-run out of order.
y_score = gb.predict_proba(X_test)[:, 1]
roc_display = RocCurveDisplay.from_predictions(y_test, y_score)

In [None]:
# Display the accuracy recorded for each model.
accuracies