<a href="https://colab.research.google.com/github/PXDHU/Machine-Learning/blob/main/ML_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Logistic Regression**

## 1. Design a binary classifier that classifies the following emails as spam (1) or ham (0). Use email.csv for training.

* Print all the parameter values learnt after training

* Show the accuracy on the test set.

* Test emails

* Assume the first four test emails are spam and the last four are not spam

In [None]:
import pandas as pd

# Load the spam/ham training table from the working directory and show it
# as the cell's output.
email_csv_path = 'email.csv'
df = pd.read_csv(email_csv_path)
df

Unnamed: 0,x0,x1,x2,x3,x4,x5,class
0,1,1,1,0,1,1,1
1,1,0,0,1,1,0,0
2,1,0,1,1,0,0,1
3,1,1,0,0,1,0,0
4,1,1,0,1,0,1,1
5,0,1,1,1,1,0,0
6,1,1,1,0,1,1,1
7,1,0,0,1,1,0,0
8,1,0,1,1,0,0,1
9,1,1,0,0,1,0,0


In [None]:
import numpy as np
import math

def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    Parameters
    ----------
    z : scalar or array-like of real numbers
        Input value(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid of ``z``; a scalar for scalar input, otherwise
        an array with the same shape as ``z``.
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) always lies in [0, 1], so it cannot overflow the way exp(-z)
    # does for large negative z.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same function,
    # written so that only the non-overflowing exponential is needed.
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # n-d arrays unchanged.
    return result[()]

def logistic(theta, X):
    """Logistic-regression hypothesis: the sigmoid of ``X · theta``.

    ``theta`` is the weight vector and ``X`` the feature matrix; the result
    is whatever ``sigmoid`` returns for the product ``np.dot(X, theta)``.
    """
    linear_scores = np.dot(X, theta)
    return sigmoid(linear_scores)

def cost(theta, X, y):
    """Mean binary cross-entropy of the logistic model on ``(X, y)``.

    Parameters
    ----------
    theta : weight vector passed through to ``logistic``.
    X : feature matrix passed through to ``logistic``.
    y : labels; ``len(y)`` is used as the number of samples.
        (Assumed to be 0/1 values aligned with the rows of X -- confirm
        against the caller.)

    Returns
    -------
    The scalar ``-(1/m) * sum(y*log(h) + (1-y)*log(1-h))``.
    """
    m = len(y)
    h = logistic(theta, X)
    # A saturated prediction (h exactly 0 or 1) would make log() return -inf
    # and turn terms such as 0 * -inf into nan.  Keep h strictly inside (0, 1)
    # so the cost stays finite.
    eps = np.finfo(float).eps
    h = np.clip(h, eps, 1 - eps)
    J = (-1/m) * np.sum(y*np.log(h) + (1-y)*np.log(1-h))
    return J

def gradient(theta, X, y):
    """Gradient of the cross-entropy cost with respect to ``theta``.

    Computes ``(1/m) * X.T · (h - y)`` where ``h = logistic(theta, X)`` and
    ``m = len(y)``.
    """
    residual = logistic(theta, X) - y
    n_samples = len(y)
    return (1/n_samples) * np.dot(X.T, residual)

def logistic_regression(X, y, alpha, num_iters):
    """Fit logistic-regression weights with batch gradient descent.

    Parameters
    ----------
    X : 2-D array of features (its shape is unpacked as ``(m, n)``).
    y : labels, forwarded unchanged to ``gradient``.
    alpha : learning rate multiplying each gradient step.
    num_iters : number of gradient-descent updates to perform.

    Returns
    -------
    numpy.ndarray
        Weight vector of length ``n``, started from zeros and updated
        ``num_iters`` times.
    """
    _, n = X.shape
    theta = np.zeros(n)

    # The cost value is not needed for the update itself, so it is not
    # evaluated inside the loop; only the gradient drives each step.
    for _ in range(num_iters):
        theta = theta - alpha * gradient(theta, X, y)

    return theta

# Hyper-parameters for batch gradient descent.
alpha = 0.01
num_iters = 1000

# Training arrays: the first five columns (by position) are the features and
# the last column is the label.
# NOTE(review): the frame displayed by the loading cell lists feature columns
# x0..x5, so this positional slice leaves x5 out -- confirm that is intended.
X = df.iloc[:, :5].values
Y = df.iloc[:, -1].values

theta = logistic_regression(X, Y, alpha, num_iters)

print("Learned parameters: ", theta)



Learned parameters:  [ 0.05639547  0.23800884  1.09471982  0.04847253 -1.13567832]


In [None]:
def predict(theta, X):
    """Return hard 0/1 predictions for the rows of ``X``.

    A row is labelled 1 when ``sigmoid(X · theta)`` is strictly greater than
    0.5 and 0 otherwise; the boolean result is cast to ``int``.
    """
    probabilities = sigmoid(np.dot(X, theta))
    is_positive = probabilities > 0.5
    return is_positive.astype(int)

# Test data provided in the question: one row per email, five features each.
x = np.array([[0, 1, 0, 0, 0],
              [1, 1, 1, 0, 1],
              [0, 1, 1, 0, 0],
              [1, 0, 1, 0, 0],
              [0, 1, 0, 0, 1],
              [0, 0, 0, 1, 1],
              [0, 1, 0, 1, 1],
              [0, 0, 0, 0, 1]])

# Expected labels: the first four emails are spam (1), the last four are ham (0).
y = np.array([1, 1, 1, 1, 0, 0, 0, 0])

y_pred = predict(theta, x)

# Compare predictions and labels position by position.  Using the zipped
# values themselves as indices would only ever inspect positions 0 and 1,
# because the values are the 0/1 labels rather than row numbers.
correct = int(np.sum(y_pred == y))
accuracy = correct / len(y)

print(f"Predicted output is: {y_pred}")
print(f"Accuracy of model is: {accuracy}")
    


Predicted output is: [1 1 1 1 0 0 0 0]
Accuracy of model is: 1.0


In [None]:
# Q2. Use the appropriate scikit-learn function to apply logistic regression to the same dataset and compare the results with your implementation.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit scikit-learn's LogisticRegression (default settings) on the same
# training arrays used for the hand-written model.
lr = LogisticRegression().fit(X, Y)  # fit() returns the fitted estimator

# Score it on the eight test emails and their expected labels.
y_pred_sklearn = lr.predict(x)
acc = accuracy_score(y, y_pred_sklearn)

print("Accuracy: {:.2f}".format(acc))

Accuracy: 1.00


In [None]:
# Q3. Also use a logistic regression model to predict the risk of heart disease using the given dataset (heart.csv).
# Show the complete pre-processing steps (identifying null or missing values, normalization, etc.).

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
# Load the heart-disease table for Q3 and show it as the cell's output.
# Note: this rebinds `df`, which until now held the email data.
heart_csv_path = "heart (1).csv"
df = pd.read_csv(heart_csv_path)
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [None]:
# Pre-processing step: count the missing values in every column.
missing_per_column = df.isna().sum()
print(missing_per_column)

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [None]:
# Separate the label column from the predictors.  X and y are rebound here;
# earlier cells used them for the email training matrix and test labels.
y = df["target"]
X = df.drop(columns="target")

X, y

(      age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
 0      52    1   0       125   212    0        1      168      0      1.0   
 1      53    1   0       140   203    1        0      155      1      3.1   
 2      70    1   0       145   174    0        1      125      1      2.6   
 3      61    1   0       148   203    0        1      161      0      0.0   
 4      62    0   0       138   294    1        1      106      0      1.9   
 ...   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
 1020   59    1   1       140   221    0        1      164      1      0.0   
 1021   60    1   0       125   258    0        0      141      1      2.8   
 1022   47    1   0       110   275    0        0      118      1      1.0   
 1023   50    0   0       110   254    0        0      159      0      0.0   
 1024   54    1   0       120   188    0        1      113      0      1.4   
 
       slope  ca  thal  
 0         2   2     3  
 1         0

In [None]:
# Split first, then learn the scaling statistics from the training rows only,
# so that no information about the held-out rows leaks into pre-processing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
# The test rows are transformed with the training-set mean and standard
# deviation; the scaler is never fitted on them.
X_test = scaler.transform(X_test_raw)

# Kept so that any cell still reading X_scaled continues to work; it is the
# full feature table scaled with the training-set statistics.
X_scaled = scaler.transform(X)

In [None]:
# Train on the training split, then measure accuracy on the held-out split.
lr = LogisticRegression().fit(X_train, y_train)  # fit() returns the fitted estimator

y_pred = lr.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print("Accuracy: {:.2f}".format(acc))

Accuracy: 0.80
