In [7]:
import opendatasets as od
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load Data: read the glass dataset from the CSV in the working directory.
df = pd.read_csv('glass.csv')

# Quick sanity check: (rows, columns) and the column labels, then a preview.
print(df.shape, df.columns, sep="\n")
df.head()

(214, 10)
Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type'], dtype='str')


Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


Are all columns numeric?

In [3]:
# Yes, every column is numeric: the displayed dtypes are float64 for the
# feature columns and int64 for "Type", so no categorical encoding is needed.
df.dtypes

# Is there an ID column that should not be used? => No: none of the listed
# columns (RI, Na, Mg, Al, Si, K, Ca, Ba, Fe, Type) is a row identifier.

RI      float64
Na      float64
Mg      float64
Al      float64
Si      float64
K       float64
Ca      float64
Ba      float64
Fe      float64
Type      int64
dtype: object

Which column is the output we want to predict?

In [4]:
# The raw label is the multi-class "Type" column. For binary logistic
# regression we derive our own target "y": 1 if the sample is Type 1, else 0.

# Guarded so the cell is safe to re-run: once "Type" has been replaced by "y"
# there is nothing left to do (without the guard a second run raises KeyError).
if "Type" in df.columns:
    # create binary classes
    df['y'] = (df['Type']==1).astype(int)
    df = df.drop("Type", axis=1)

df.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,y
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


Separating train & test

In [6]:
# Features are every column except the binary target "y".
X, y = df.drop(columns="y"), df["y"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each split, one per line.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(171, 9)
(43, 9)
(171,)
(43,)


Standardizing the data (zero mean, unit variance)

In [9]:
# Fit the scaler on the training split only, then reuse those training
# statistics for the test split. Calling fit_transform on the test set would
# re-estimate mean/std from test data, which puts train and test features on
# different scales and leaks test-set information into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Model Implementation

In [None]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Parameters
    ----------
    z : scalar or array-like of floats
        Linear score(s).

    Returns
    -------
    Same shape as ``z``; values in [0, 1].
    """
    # 1 / (1 + exp(-z)) == exp(-log(1 + exp(-z))) == exp(-logaddexp(0, -z)).
    # logaddexp never forms exp(-z) directly, so very negative z yields 0.0
    # instead of overflowing to inf (and emitting a RuntimeWarning).
    return np.exp(-np.logaddexp(0, -z))

def predict_prob(X, w, b):
    """Return the sigmoid of the linear score ``X @ w + b``.

    X is the feature matrix, w the weight vector and b the bias term.
    """
    return sigmoid(X @ w + b)

LogLoss function

In [11]:
def loss(y, p, eps=1e-15):
    """Mean binary cross-entropy (log loss).

    Parameters
    ----------
    y : array-like
        True labels (0 or 1).
    p : array-like
        Predicted probabilities for class 1.
    eps : float, optional
        Probabilities are clipped to [eps, 1 - eps] before taking logs.

    Returns
    -------
    float
        Average of -(y*log(p) + (1-y)*log(1-p)) over all samples.
    """
    # Without clipping, p == 0 or p == 1 gives log(0) = -inf, and the product
    # 0 * -inf turns the whole mean into nan.
    p = np.clip(p, eps, 1 - eps)
    return -np.mean(y*np.log(p) + (1-y)*np.log(1-p))

In [13]:
def update_weights(X, y, w, b, lr):
    """Perform one gradient-descent step on the weights and bias.

    The residual (predicted probability minus label) is computed on all rows
    of X; w moves by lr * (X.T @ residual) / len(y) and b by
    lr * mean(residual). Returns the updated ``(w, b)`` pair.
    """
    residual = predict_prob(X, w, b) - y

    step_w = lr * (X.T @ residual) / len(y)
    step_b = lr * np.mean(residual)

    return w - step_w, b - step_b

Training phase

In [15]:
# Start from an all-zero weight vector (one weight per feature column) and a zero bias.
w, b = np.zeros(X_train.shape[1]), 0.0

# Gradient-descent settings: learning rate and number of passes over the training set.
lr, epochs = 0.1, 100

# Each pass applies one update computed on the whole training split.
for epoch in range(epochs):
    w, b = update_weights(X_train, y_train, w, b, lr)

# Show the learned parameters: weights first, then the bias.
print(w, b, sep="\n")

[ 0.10796659 -0.15606355  0.63480916 -0.67204019  0.07051998 -0.08936205
 -0.19026934 -0.16364966 -0.1099809 ]
-0.7346250485229568


Probability to Decision

In [18]:
def predict(p, threshold = 0.5):
    """Turn probabilities into hard 0/1 labels.

    An entry becomes 1 when its probability is at least ``threshold``
    (inclusive), otherwise 0.
    """
    is_positive = p >= threshold
    return is_positive.astype(int)

# Predicted probabilities on the training split using the learned w and b.
probs = predict_prob(X_train, w, b)

# Hard labels at two cut-offs: 0.5 and the stricter 0.7.
y_pred_50, y_pred_70 = (predict(probs, cutoff) for cutoff in (0.5, 0.7))

print(y_pred_50, y_pred_70, sep="\n")

[0 1 0 0 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0
 0 1 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 0 1 1 0 1 0 0
 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0
 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 1 1 0 1 1 0 0 1 0 0
 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1]
[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


Why is a higher threshold safer in glass quality control?

Ans. Because we need to minimise false acceptances (accepting a sample that is not actually Type 1), and we can tolerate some false rejections in exchange.

---

### Conclusions:
1. Logistic regression differs from the perceptron because the perceptron outputs a hard 0 or 1, whereas logistic regression outputs a probability.
2. The sigmoid matters because it expresses how confident the model is about the target. For example, a sigmoid output of 0.8 means "there is an 80% chance that this glass is Type 1". Therefore, we can reason about uncertainty too.
3. The remaining problem is that the model only predicts 0/1 (yes/no) for whether a sample is Type 1; it cannot identify the other glass types at once (that would require multiclass classification).
