### **1. Importing the required packages**

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

### **2. Reading and Exploring the Data**

1. Import the data
2. Check shape, datatypes of each column in the data.
3. Check for the missing values.
4. Check for the duplicates and remove them.
5. Check for the outliers present in various columns and deal with them.
6. Check for the columns having object types and encode those columns.
7. Necessary Visualizations.

In [3]:
heart = pd.read_csv('heart.csv')

In [4]:
heart.head()  #print the top 5 rows of the data to take a quick look

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
heart.shape #print the total number of rows and columns

(303, 14)

In [6]:
heart.dtypes  #print the datatype of values in each column

Unnamed: 0,0
age,int64
sex,int64
cp,int64
trestbps,int64
chol,int64
fbs,int64
restecg,int64
thalach,int64
exang,int64
oldpeak,float64


**Reasons to check for the datatype of the values present in each column**:

1. To verify whether each column has the correct datatype or not.
2. To check whether there are any object type columns which we will need to encode later.

In [None]:
#change the datatype of a column
#heart['cp'] = heart['cp'].astype('object')

In [7]:
heart.isnull().sum(axis = 0)  #print the total number of missing values column-wise

Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalach,0
exang,0
oldpeak,0


In [8]:
heart.isnull().sum(axis = 1)  #print the total number of missing values row-wise

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
298,0
299,0
300,0
301,0


In [9]:
#check for duplicates
heart.duplicated().sum()  #tell us the total number of duplicate rows present in the data

np.int64(1)

In [10]:
#print the duplicate rows
heart[heart.duplicated()]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
164,38,1,2,138,175,0,1,173,0,0.0,2,4,2,1


In [11]:
#drop/remove the duplicate rows
heart.drop_duplicates(inplace = True)

#### **Checking for outliers**

In [12]:
heart.describe()  #print the statistical summary of the data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0
mean,54.42053,0.682119,0.963576,131.602649,246.5,0.149007,0.52649,149.569536,0.327815,1.043046,1.397351,0.718543,2.31457,0.543046
std,9.04797,0.466426,1.032044,17.563394,51.753489,0.356686,0.526027,22.903527,0.470196,1.161452,0.616274,1.006748,0.613026,0.49897
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,133.25,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.5,1.0,1.0,130.0,240.5,0.0,1.0,152.5,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.75,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [None]:
def remove_outliers(data, columns):
    """Drop rows that fall outside the 1.5 * IQR fences of the given columns.

    The columns are processed in the order given, and each column's quartiles
    are computed on the rows that survived the filters of the columns before
    it. Names in ``columns`` that are not present in ``data.columns`` are
    skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    columns : iterable of str
        Column names to apply the IQR filter to.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` whose value in every processed column lies within
        ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (bounds inclusive). The original
        index labels are kept.
    """
    kept = data
    for name in columns:
        if name not in kept.columns:
            continue
        first_quartile = kept[name].quantile(0.25)
        third_quartile = kept[name].quantile(0.75)
        fence = 1.5 * (third_quartile - first_quartile)
        # between() is inclusive on both ends, matching a >= / <= pair.
        within_fences = kept[name].between(first_quartile - fence, third_quartile + fence)
        kept = kept[within_fences]
    return kept

In [None]:
# remove_outliers returns a filtered frame and leaves `heart` untouched, so the
# result must be captured; a bare call only displays it and then discards it.
# The modelling cells below still read `heart`; point them at
# `heart_no_outliers` if the model should be trained on the filtered rows.
heart_no_outliers = remove_outliers(heart, ['age', 'chol', 'thalach'])
heart_no_outliers.shape  # rows and columns remaining after the IQR filter

### **Machine Learning Process**

1. Create X and y variables to store the input and output columns.
2. Split the data into training and testing sets
3. Standardization/Scaling of the data
4. Apply the Logistic Regression algorithm on the data.
5. Check the performance of the model on the test set.

In [13]:
X = heart.drop(columns = 'target')
y = heart['target']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100)

#### **Standardization/Scaling of the data**

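Always standardize the data after the train-test split: fit the scaler on the training set only, then apply that same fitted scaler to the test set. Fitting the scaler on the full dataset would leak test-set statistics (its mean and standard deviation) into training.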

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both splits so the test set does not influence it.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### **Apply Logistic Regression on the data**

In [17]:
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

In [22]:
log_reg.coef_

array([[ 0.01185917, -0.60382232,  0.98213584, -0.15983415, -0.3029609 ,
        -0.15069547,  0.12385703,  0.51324165, -0.33865552, -0.73170364,
         0.23597131, -0.78249335, -0.49994703]])

In [23]:
log_reg.intercept_

array([0.27521104])

In [18]:
y_pred = log_reg.predict(X_test_scaled)

In [21]:
y_pred

array([0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0])

In [20]:
log_reg.predict_proba(X_test_scaled)

array([[0.99184208, 0.00815792],
       [0.93439187, 0.06560813],
       [0.08448225, 0.91551775],
       [0.99257794, 0.00742206],
       [0.03508389, 0.96491611],
       [0.20654315, 0.79345685],
       [0.07430371, 0.92569629],
       [0.98723367, 0.01276633],
       [0.09763965, 0.90236035],
       [0.42787258, 0.57212742],
       [0.86964694, 0.13035306],
       [0.49592035, 0.50407965],
       [0.08147662, 0.91852338],
       [0.81642406, 0.18357594],
       [0.02886708, 0.97113292],
       [0.16346754, 0.83653246],
       [0.45730734, 0.54269266],
       [0.6931893 , 0.3068107 ],
       [0.93451095, 0.06548905],
       [0.03568064, 0.96431936],
       [0.95474946, 0.04525054],
       [0.91116173, 0.08883827],
       [0.83572331, 0.16427669],
       [0.83802194, 0.16197806],
       [0.92096411, 0.07903589],
       [0.0190559 , 0.9809441 ],
       [0.92294981, 0.07705019],
       [0.13027014, 0.86972986],
       [0.82470086, 0.17529914],
       [0.92350415, 0.07649585],
       [0.

In [19]:
accuracy_score(y_test, y_pred)

0.8360655737704918

#### **Let's check the confusion matrix**

In [25]:
cm = confusion_matrix(y_test, y_pred)

In [31]:
cm

array([[27,  8],
       [ 2, 24]])

In [26]:
# confusion_matrix puts the true labels on rows and the predicted labels on
# columns, so with labels 0/1 the first row is the actual negatives and the
# second row is the actual positives.
(TN, FP), (FN, TP) = cm

In [28]:
# Report each confusion-matrix cell next to its label, one per line.
for label, count in (
    ("True Negative:", TN),
    ("False Positive:", FP),
    ("False Negative:", FN),
    ("True Positive:", TP),
):
    print(label, count)

True Negative: 27
False Positive: 8
False Negative: 2
True Positive: 24


In [29]:
precision_score(y_test, y_pred)

0.75

In [30]:
recall_score(y_test, y_pred)

0.9230769230769231

### Performance Metrics used in Classification Models

1. **`Confusion Matrix`** : A confusion matrix is a table that summarizes the performance of a classification model by showing the true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN).

    - **`When to Use`**: The confusion matrix is useful for understanding the types of errors the model is making and for calculating other metrics like precision, recall, and specificity.
    
![confusion_matrix](https://builtin.com/sites/www.builtin.com/files/styles/ckeditor_optimize/public/inline-images/8_confusion-matrix-python.jpg)

---

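2. **`Precision Score`** : Precision is the fraction of predicted positives that are actually positive, $\text{Precision} = \frac{TP}{TP + FP}$. It is important when the cost of false positives is high, so false positives must be reduced.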
    - `For example` :
        - In spam detection, high precision ensures that most of the emails marked as spam are indeed spam, so genuine emails are rarely sent to the spam folder, which could otherwise be a serious problem.
        - A financial fraud detection system might prioritize high precision – minimizing false positives (wrongly declined transactions) – to avoid inconveniencing customers.
        - While classifying whether or not a bank customer is a loan defaulter, it is desirable to have high precision since the bank wouldn’t want to lose customers who were denied a loan based on the model’s prediction that they would be defaulters.

---

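3. **`Recall Score/Sensitivity`** : Recall is the fraction of actual positives that the model correctly identifies, $\text{Recall} = \frac{TP}{TP + FN}$. It is crucial when the cost of false negatives is high and we need to reduce false negatives as much as possible.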
    - `For Example`:
        - In medical diagnosis, high recall is crucial since it ensures that most of the actual positive cases (e.g., patients with a disease) are identified. A false negative would mean that a sick patient was classified as healthy and left untreated, which could be fatal.

---

4. **`Accuracy Score`** : It is suitable to use when the classes in the dataset are balanced. It provides a straightforward measure of overall correctness. However, it can be misleading in cases of class imbalance, where one class significantly outnumbers the other.

    - Using accuracy as the defining metric for our model makes sense intuitively, but more often than not it is advisable to look at precision and recall too. There are situations where accuracy is very high but precision or recall is low. Ideally, we would like to avoid any case where the patient has heart disease but our model classifies them as not having it (a false negative), i.e., we aim for high recall.

    - On the other hand, for the cases where the patient is not suffering from heart disease and our model predicts the opposite, we would also like to avoid treating a patient with no heart disease (crucial when the input parameters could indicate a different ailment, but we end up treating him/her for a heart ailment).

    - Although we aim for both high precision and high recall, maximizing both at the same time is usually not possible, because improving one typically lowers the other. For example, if we change the model to one giving us a high recall, we might detect all the patients who actually have heart disease, but we might end up giving treatments to many patients who don’t suffer from it.

    - Similarly, suppose we aim for high precision to avoid giving any wrong and unrequired treatment. In that case, we end up getting a lot of patients who actually have heart disease going without any treatment.

---

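5. **`F1 Score`** : We saw above that there is a trade-off between recall and precision: when we try to increase one, the other tends to decrease. But sometimes both scores are important. The F1 score combines them into a single number as their harmonic mean, $F_1 = \frac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}$, which is high only when both precision and recall are high.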
    - `Ex`:  if the doctor informs us that the patients who were incorrectly classified as suffering from heart disease are equally important since they could be indicative of some other ailment, then we would aim for not only a high recall but a high precision as well.
    
---


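6. **`ROC_AUC Score`** - The area under the ROC curve, which plots the true positive rate against the false positive rate across all classification thresholds. It measures how well the model ranks positive cases above negative ones, independent of any single threshold, and is commonly used with imbalanced data, where accuracy alone can be misleading.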

---

7. **`Specificity`** : Specificity is a performance metric used in classification models, particularly in binary classification. It is also known as the true negative rate. Specificity measures the proportion of actual negative cases that are correctly identified by the model, $\text{Specificity} = \frac{TN}{TN + FP}$. This metric tells us how good the model is at identifying negative instances.
    - `Ex` : In medical testing, a high specificity is important when a false positive result could lead to unnecessary stress, further invasive testing, or treatment. For example, in cancer screening, a test with high specificity ensures that healthy individuals are not incorrectly diagnosed with cancer, avoiding unnecessary biopsies or treatments.

Q1. You mentioned on Thursday that these 13 columns are input columns and the last one is the output column. But is this decision made by data engineers? Can you show or share the PDF that explains the process of how they decide which ones are input columns and which one is the output column?