In [None]:
# Notebook-wide CSS: the HTML object is the last expression of the cell, so Jupyter renders it
# and the <style> rules apply to the rest of the page.
from IPython.display import HTML
HTML("""
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}


h1{
    text-align: center;
    border-style: solid;
    border-width: 3px;
    padding: 12px;
    margin: 0;
    color: black;
    font-family: Arial, sans-serif;
    border-radius: 80px;
    border-color: gold;
}


body, p {
    font-family: Arial, sans-serif;
    font-size: 15px;
    color: #36454f; /* charcoal - "charcoal" itself is not a CSS colour keyword */
}
div {
    font-size: 14px;
    margin: 0;

}

h4 {
    padding: 0px;
    margin: 0;
    font-family: Arial, sans-serif;
    color: purple;
}
</style>
""")

<h1 align = "center" style = "font-family:serif"> 🧬 Machine Learning Models from Scratch 🧪 </h1>

<p style = "font-size:17px">This notebook contains simple, from-scratch implementations of several well-known Machine Learning models, written from my own understanding of how they work. Each model is explained and, where possible, visualized on a small dummy dataset. I plan to add more models and keep updating this work in the coming days.</p>

<b style = "font-size:17px">Any feedback is appreciated. I hope you can help me improve these models and my understanding of how they work.</b>

<h2 >📌 Models</h2>
<ul style = "font-size:17px">
    <li><a href="https://www.kaggle.com/code/varunnagpalspyz/ml-models-code-from-scratch#LinReg">Linear Regression</a>
    <ul>
        <li><a href="https://www.kaggle.com/code/varunnagpalspyz/ml-models-code-from-scratch#SGD">Stochastic Gradient Descent</a></li>
        <li><a href="https://www.kaggle.com/code/varunnagpalspyz/ml-models-code-from-scratch#OLS">Ordinary Least Squares</a></li>
    </ul>
    </li>
    <li><a href="https://www.kaggle.com/code/varunnagpalspyz/ml-models-code-from-scratch#LogReg">Logistic Regression</a></li>
    <li><a href="https://www.kaggle.com/code/varunnagpalspyz/ml-models-code-from-scratch#KMeans">KMeans Clustering</a></li>
    <li><a href="https://www.kaggle.com/code/varunnagpalspyz/ml-models-code-from-scratch#KNN">KNN</a></li>
    <li><a href="https://www.kaggle.com/code/varunnagpalspyz/ml-models-code-from-scratch#GNB">Gaussian Naive Bayes</a></li>
    <li><a href="https://www.kaggle.com/code/varunnagpalspyz/ml-models-code-from-scratch#DTC">Decision Tree Classifier</a></li> 
    <li><a href="https://www.kaggle.com/code/varunnagpalspyz/ml-models-code-from-scratch#RFC">Random Forest Classifier</a></li>
</ul>

***

## 📦 Importing Libraries

In [None]:
# Basic Imports
import numpy as np
import random

#to visualize and verify my work
import matplotlib.pyplot as plt
%matplotlib inline

# To verify my work on dummy datasets using sklearn
from sklearn.datasets import make_blobs,make_regression,make_classification
from sklearn.model_selection import train_test_split

##  🆘 Helper Functions

In [None]:
''' Utils '''

def mode(ls):
    ''' Return every value that occurs most often in ``ls``.

    Parameters
    ----------
    ls : iterable of hashable values

    Returns
    -------
    list
        All values tied for the highest count, in order of first appearance.
        An empty input gives an empty list.
    '''

    # dictionary to keep count of each value (dicts remember insertion order)
    counts = {}
    for item in ls:
        counts[item] = counts.get(item, 0) + 1

    # nothing to count -> nothing to report
    if not counts:
        return []

    # find the highest count once instead of once per key
    highest = max(counts.values())
    return [key for key, count in counts.items() if count == highest]

def sigmoid(z):
    ''' Logistic sigmoid 1 / (1 + exp(-z)), computed without overflow.

    Parameters
    ----------
    z : scalar or array-like of numbers

    Returns
    -------
    A NumPy scalar for scalar input, otherwise an array with the shape of ``z``.
    '''
    z = np.asarray(z)
    # exp is only ever taken of a non-positive number, so it cannot overflow
    e = np.exp(-np.abs(z))
    # z >= 0: 1/(1+exp(-z));  z < 0: the same value rewritten as exp(z)/(1+exp(z))
    out = np.where(z >= 0, 1 / (1 + e), e / (1 + e))
    # np.where returns a 0-d array for scalar input; unwrap it back to a scalar
    return out[()] if out.ndim == 0 else out

def mse(y,yhat):
    ''' Mean Squared Error: average of the squared residuals along axis 0 '''
    residual = y - yhat
    return np.average(residual * residual, axis=0)

def logloss(y=None, yhat=None, eps=1e-15):
    ''' Binary Cross Entropy loss function

    Parameters
    ----------
    y : array-like of 0/1 targets
    yhat : array-like of predicted probabilities for class 1
    eps : float
        Probabilities are clipped to [eps, 1 - eps] so log(0) is never evaluated.

    Returns
    -------
    The loss averaged along axis 0, or None when called without ``y`` / ``yhat``
    (the behaviour of the earlier placeholder, kept so such calls still work).
    '''
    if y is None or yhat is None:
        return None

    y = np.asarray(y, dtype=float)
    p = np.clip(np.asarray(yhat, dtype=float), eps, 1 - eps)
    # -[y*log(p) + (1-y)*log(1-p)] for every sample
    per_sample = -(y * np.log(p) + (1 - y) * np.log(1 - p))
    return np.average(per_sample, axis=0)

<h1 align = "center" id = "LinReg" >📏 Linear Regression</h1>

<h2 id = "SGD"> Stochastic Gradient Descent </h2>

<b>For more information on Gradient Descent check out:</b> https://www.youtube.com/watch?v=ORyfPJypKuU&t=2s

In [None]:
class SGDRegressor:
    ''' Linear regression whose weights are updated after every single sample '''

    def __init__(self,epochs = 1000,lr = 0.001):
        ''' store the two hyper-parameters '''

        # step size applied to every gradient
        self.lr = lr
        # how many full passes over the data fit() makes
        self.epochs = epochs

    def fit(self,x,y):
        ''' train on x (2-D array, one row per sample) and y (one target per row) '''

        n_rows, n_cols = x.shape
        self.num_samples = n_rows
        self.num_features = n_cols
        # start from an all-zero weight vector and a zero bias
        self.w = np.zeros(n_cols)
        self.b = 0
        # one call to update_weights == one pass over every sample
        passes_done = 0
        while passes_done < self.epochs:
            self.update_weights(x,y)
            passes_done += 1

    def predict(self,x):
        ''' linear prediction x.w + b using the current weights and bias '''
        return self.b + np.dot(x,self.w)

    def update_weights(self,x,y):
        ''' one pass over the data, taking a gradient step after each sample '''

        for idx in range(self.num_samples):
            features = x[idx]
            # error of the current model on this one sample
            residual = y[idx] - self.predict(features)
            # gradients of the squared error w.r.t. weights and bias
            grad_w = -2*np.dot(features,residual)
            grad_b = -2*np.sum(residual)
            # step against the gradient
            self.w = self.w - self.lr*grad_w
            self.b = self.b - self.lr*grad_b

<h2 id = "OLS"> Normal Equations (Ordinary Least Squares)</h2>

<b>For more information regarding the derivation of these formulae check out:</b> https://www.youtube.com/playlist?list=PLKnIA16_Rmva-wY_HBh1gTH32ocu2SoTr

In [None]:
class LinearRegression:
    ''' Ordinary Least Squares linear regression (closed-form fit, no iterations) '''

    def __init__(self):
        ''' initializing the weights and biases parameter '''

        # both stay None until fit() is called
        self.weights_ = None
        self.bias_ = None

    def fit(self,x,y):
        ''' function to train the model

        x : 2-D array, one row per sample
        y : targets, one per row of x
        '''

        # inserting a column of 1s to represent the bias
        design = np.insert(x,0,1,axis = 1)
        # Solve the least-squares problem directly instead of forming inv(X^T X):
        # lstsq also copes with collinear / rank-deficient features, where the
        # explicit inverse does not exist, by returning the minimum-norm solution.
        coefs, _residuals, _rank, _singular = np.linalg.lstsq(design, y, rcond=None)
        self.weights_ = coefs[1:]
        self.bias_ = coefs[0]

    def predict(self,xtest):
        ''' Predicting the output based on weights and biases set after training '''
        return np.dot(xtest,self.weights_) + self.bias_

### Dummy Dataset

In [None]:
# 100 samples, a single feature and a single target, noise=50, fixed seed so the data is repeatable
x,y = make_regression(n_samples=100, n_features=1, n_targets=1, noise=50, random_state=42)
# quick look at the raw points before fitting anything
plt.scatter(x,y)
plt.show()

In [None]:
# fit both linear models on the same dummy data and predict on that data
sgd = SGDRegressor()
sgd.fit(x,y)
ypred_sgd = sgd.predict(x)

lr = LinearRegression()
lr.fit(x,y)
ypred_lr = lr.predict(x)

# two panels side by side: SGD fit on the left, OLS fit on the right
fig, panels = plt.subplots(1,2,figsize = (20,5))
for panel, fitted, colour, name in zip(panels,(ypred_sgd,ypred_lr),('blue','green'),('sgd','ols')):
    panel.scatter(x,y, color = 'red' )
    panel.plot(x,fitted, color = colour,label = name)
    panel.legend()
plt.show()

<h1 align = "center" id = "LogReg"> 🖇 Logistic Regression</h1>

In [None]:
# Similar to SGDRegressor Implementation but tailored to binary classification with the help of sigmoid function
class LogisticRegression:
    ''' Binary classifier: a linear model passed through sigmoid, trained one sample at a time '''

    def __init__(self,epochs=1000,lr=0.001):
        ''' store the number of passes over the data and the step size '''
        self.lr = lr
        self.epochs = epochs

    def fit(self,x,y):
        ''' train on x (2-D array, one row per sample) and y (one 0/1 label per row) '''
        n_rows, n_cols = x.shape
        self.m = n_rows
        self.n = n_cols
        # start from zero weights and zero bias
        self.w = np.zeros(n_cols)
        self.b = 0
        passes_done = 0
        while passes_done < self.epochs:
            self.update_weights(x,y)
            passes_done += 1

    def predict_proba(self,xtest):
        ''' sigmoid of the linear score, i.e. the modelled probability of class 1 '''
        return sigmoid(self.b + np.dot(xtest,self.w))

    def predict(self,xtest):
        ''' list of 0/1 labels, one per row of xtest: 1 when the probability is at least 0.5 '''
        return [1 if self.predict_proba(xtest[idx]) >= 0.5 else 0
                for idx in range(xtest.shape[0])]

    def update_weights(self,x,y):
        ''' one pass over the data, stepping the weights after every sample '''
        for idx in range(self.m):
            features = x[idx]
            # gap between the label and the predicted probability for this sample
            residual = y[idx] - self.predict_proba(features)
            # same update form as SGDRegressor: -2 * residual * input
            grad_w = -2*np.dot(features,residual)
            grad_b = -2*np.sum(residual)
            self.w = self.w - self.lr*grad_w
            self.b = self.b - self.lr*grad_b

### Dummy Dataset

In [None]:
# Two-class problem with 2 features (1 informative, 0 redundant) and a large class_sep, fixed seed
x, y = make_classification(n_samples=100, n_features=2, n_informative=1,n_redundant=0,
                           n_classes=2, n_clusters_per_class=1, random_state=42,hypercube=False,class_sep=20)
# hold out 20% of the rows for testing
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state = 1)

In [None]:
# NOTE: `lr` is rebound here; earlier in the notebook it held the LinearRegression model
lr = LogisticRegression()
lr.fit(xtrain,ytrain)
# predicted 0/1 labels for the held-out rows
ypred = lr.predict(xtest)

In [None]:
# Predicted labels for the test split (output of lr.predict)
ypred

In [None]:
# Actual labels of the test split, as a plain list for comparison with the predictions
list(ytest)

<h1 align = "center" id = "KMeans"> 🔢 KMeans Clustering</h1> 

## Steps 

<ul>
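<li>Randomly initialize the centroids by picking points from the sample</li>
<li>Assign each point to the cluster of its nearest centroid (Euclidean distance)</li>
<li>Move each centroid to the mean position of the points assigned to it</li>
<li>Repeat the last two steps until the centroids stop changing or the maximum number of iterations is reached</li>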
</ul>

In [None]:
class KMeans:
    ''' KMeans clustering (Lloyd's algorithm) with centroids initialised from random sample points.

    Parameters
    ----------
    n_clusters : int
        Number of clusters / centroids.
    max_iters : int
        Upper bound on the assign-then-move iterations.
    random_state : int or None
        Seed for the centroid initialisation. None (default) draws from the global
        ``random`` module, exactly as before, so ``random.seed(...)`` still applies.
    '''

    def __init__(self,n_clusters=5,max_iters=100,random_state=None):
        self.n_clusters = n_clusters
        self.max_iters = max_iters
        self.random_state = random_state
        self.centroids = None

    def fit_predict(self,x):
        ''' Cluster the rows of x and return one cluster index per row (numpy int array) '''
        x = np.asarray(x)
        # Randomly initialize centroids from distinct rows of x
        sampler = random if self.random_state is None else random.Random(self.random_state)
        self.centroids = x[sampler.sample(range(x.shape[0]),self.n_clusters)]
        for _ in range(self.max_iters):
            # assigning clusters
            clusters = self.assign_clusters(x)
            prev_centroids = self.centroids
            # Moving centroids
            self.centroids = self.move_centroids(x,clusters)
            # Checking if centroids have changed
            if np.array_equal(prev_centroids, self.centroids):
                break
        # Label against the final centroids: this also covers max_iters == 0 and the case
        # where the loop ran out of iterations right after a move.
        return self.assign_clusters(x)

    def assign_clusters(self,x):
        ''' Index of the nearest centroid for every row of x (ties go to the lowest index) '''
        x = np.asarray(x)
        centroids = np.asarray(self.centroids)
        # (n_samples, n_centroids, n_features) differences -> squared distances;
        # the square root is skipped because it does not change which one is smallest
        diffs = x[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        return np.argmin((diffs ** 2).sum(axis=2), axis=1)

    def move_centroids(self,x,clusters):
        ''' Mean position of every cluster; always returns one row per centroid '''
        # start from the current positions so a cluster with no members keeps its centroid
        # instead of disappearing (which would shift the indices of the clusters after it)
        new_centroids = np.array(self.centroids, dtype=float)
        for cluster in range(len(new_centroids)):
            members = x[clusters == cluster]
            if len(members) > 0:
                new_centroids[cluster] = members.mean(axis = 0)
        return new_centroids
        

### Dummy Dataset

In [None]:
# four blob centres along the diagonal, each with standard deviation 1
centroids = [(-5,-5),(5,5),(-10,-10),(0,0)]
cluster_std = [1,1,1,1]

# 100 two-feature points drawn around those centres (fixed seed); the blob labels in y are not used for clustering
X,y = make_blobs(n_samples=100,cluster_std=cluster_std,centers=centroids,n_features=2,random_state=42)
plt.scatter(X[:,0],X[:,1])
plt.show()

In [None]:
# KMeans picks its starting centroids with the `random` module; seeding it here makes
# the clustering (and therefore the plot) identical on every run of this cell
random.seed(42)
km = KMeans(n_clusters=4,max_iters = 1000)
y_means = km.fit_predict(X)

# one scatter call per cluster index, each in its own colour
for label, colour in enumerate(['red','blue','green','yellow']):
    plt.scatter(X[y_means == label,0],X[y_means == label,1],color=colour)
plt.show()

<h1 align = "center" id = "KNN"> 🔪 KNN</h1>

In [None]:
class KNearestNeighbors:
    ''' K-nearest-neighbours classifier using Euclidean distance and a majority vote '''

    def __init__(self,k=2):
        ''' k : number of neighbours that vote on each prediction '''
        self.k = k

    def fit(self,x,y):
        ''' Memorise the training data.

        x : 2-D array, one row per sample
        y : 1-D array of labels, one per row of x

        Raises ValueError when k is smaller than 1, when k exceeds the number of
        training samples, or when x and y have different lengths.
        '''
        x = np.asarray(x)
        y = np.asarray(y)
        if self.k < 1:
            raise ValueError('Error, value of K must be at least 1')
        # k == number of samples is valid: every training point then gets a vote
        if self.k > x.shape[0]:
            raise ValueError('Error, value of K greater than number of samples')
        if x.shape[0] != y.shape[0]:
            raise ValueError('Error, x and y must contain the same number of samples')
        self.x = x
        self.y = y

    def predict(self,xtest):
        ''' Return a list with one predicted label per row of xtest.

        The size of xtest is irrelevant to k, so any number of rows (including one)
        can be predicted. Ties in the vote go to the label of the closest neighbour.
        '''
        xtest = np.asarray(xtest)
        ypred = []
        for row in xtest:
            # Euclidean distance from this row to every training sample
            distances = np.sqrt(((self.x - row) ** 2).sum(axis=1))
            # stable sort keeps equally distant samples in training order
            nearest = np.argsort(distances, kind="stable")[:self.k]
            votes = list(self.y[nearest])
            # max() returns the first label reaching the highest count,
            # and votes are ordered from nearest to farthest
            ypred.append(max(votes, key=votes.count))
        return ypred

### Dummy Dataset

In [None]:
# two-class dummy problem with 5 features (fixed seed), split 80/20 into train and test
x,y = make_classification(n_samples=100, n_features=5, n_classes=2, random_state=42)
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=1)

In [None]:
# default constructor, i.e. k=2 neighbours
knn = KNearestNeighbors()
knn.fit(xtrain,ytrain)
ypred = knn.predict(xtest)

In [None]:
# Predicted labels for the test split (output of knn.predict)
ypred

In [None]:
# Actual labels of the test split, as a plain list for comparison with the predictions
list(ytest)

<h1 align = "center" id = "GNB"> 📨 Gaussian Naive Bayes</h1>

In [None]:
class GaussianNaiveBayes:
    ''' Naive Bayes classifier that models every feature, per class, as an independent Gaussian.

    After fit():
      classes_ : sorted array of the distinct labels seen in y
      mean / var / prior : dicts keyed by str(label)
    '''

    def __init__(self):
        self.nclasses = None
        self.classes_ = None
        self.mean = {}
        self.var = {}
        self.prior = {}
        self.eps = 1e-5 #stability factor (saw in sklearn implementation)

    def fit(self,x,y):
        ''' Estimate the per-class feature means, variances and class priors.

        x : 2-D array, one row per sample
        y : labels, one per row; any label values are accepted (not only 0..n-1)
        '''
        x = np.asarray(x)
        y = np.ravel(y)
        self.m,self.n = x.shape
        # work with the labels that actually occur instead of assuming range(n_classes)
        self.classes_ = np.unique(y)
        self.nclasses = len(self.classes_)
        # start clean so a second fit() does not keep statistics of old classes
        self.mean, self.var, self.prior = {}, {}, {}
        for clas in self.classes_:
            x_c = x[y==clas]
            key = str(clas)
            self.mean[key] = np.mean(x_c,axis = 0)
            self.var[key] = np.var(x_c,axis = 0)
            self.prior[key] = x_c.shape[0]/self.m

    def predict(self,xtest):
        ''' Return an array with the most probable label for every row of xtest '''
        xtest = np.asarray(xtest)
        # one column of log-posteriors (up to a shared constant) per class
        probs = np.zeros((xtest.shape[0],self.nclasses))

        for idx, clas in enumerate(self.classes_):
            key = str(clas)
            log_likelihood = self.density_function(xtest,self.mean[key],self.var[key])
            probs[:,idx] = np.log(self.prior[key]) + log_likelihood

        # map the winning column back to its original label
        return self.classes_[np.argmax(probs,axis=1)]

    # Log of the Gaussian density used for the likelihood of continuous feature values given the class
    def density_function(self,x,mean,var):
        ''' Log-density of each row of x under independent Gaussians (mean, var + eps) per feature '''
        # normalisation term: -n/2 * log(2*pi) - 1/2 * sum(log(var))
        a = (-1/2)*(np.log(2*np.pi))*(self.n) -(1/2)*(np.sum(np.log(var+self.eps)))
        # exponent term: -1/2 * sum((x - mean)^2 / var), summed over the features of each row
        b = (-1/2)*np.sum(np.power(x-mean,2)/(var+self.eps),1)
        return a+b
    

In [None]:
# two-class dummy problem with 10 features (fixed seed), split 80/20 into train and test
x,y = make_classification(n_samples=100, n_features=10, n_classes=2, random_state=69)
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=1)

In [None]:
# fit on the training split, then predict labels for the held-out rows
gnb = GaussianNaiveBayes()
gnb.fit(xtrain,ytrain)
ypred = gnb.predict(xtest)

In [None]:
# Predicted labels for the test split (output of gnb.predict)
ypred

In [None]:
# Actual labels of the test split, as a plain list for comparison with the predictions
list(ytest)

<h1 align = "center" id = "DTC"> 🌴 Decision Tree Classifier</h1>

<b>For more information regarding this code, check out this playlist:</b> https://www.youtube.com/playlist?list=PLM8wYQRetTxAl5FpMIJCcJbfZjSB0IeC_

In [None]:
class Node:
    ''' Node Class

    A decision node carries feature / feature_value / left / right / info_gain and has value None.
    A leaf node carries only value (the predicted label).
    '''

    def __init__(self,feature = None,feature_value = None,left = None,right = None,info_gain = None,value = None):
        ''' constructor '''

        #decision node
        self.left = left
        self.right = right
        self.info_gain = info_gain
        self.feature = feature
        self.feature_value = feature_value

        #leaf node
        self.value = value

class DecisionTreeClassifier:
    ''' Tree Class

    Binary decision tree that splits on ``feature <= threshold``.

    Parameters
    ----------
    min_samples_split : int
        A node holding fewer samples than this becomes a leaf.
    max_depth : int or None
        A node at depth d (root = 0) may still split while d <= max_depth, so leaves can
        sit one level below max_depth. None means no depth limit.
    criterion : str
        "entropy" (default) or "gini"; selects the impurity used for information gain.
    '''

    def __init__(self,min_samples_split = 2,max_depth = None,criterion = "entropy"):
        ''' constructor '''

        #initializing root of the tree
        self.tree = None

        #initializing stopping condition (sklearn default values used)
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

        #impurity measure handed to information_gain
        self.criterion = criterion

    def split(self,data,feature,feature_value):
        ''' split a node based on a particular feature and its threshold

        Returns (left, right): the rows with data[:, feature] <= feature_value and the
        rows with data[:, feature] > feature_value.
        '''

        column = data[:,feature]
        left = data[column <= feature_value]
        right = data[column > feature_value]

        return left,right

    def build_tree(self,data,depth = 0):
        ''' recursive function to build the tree

        data : 2-D array whose last column is the label
        Returns the root Node of the (sub)tree built from data.
        '''

        x, y = data[:,:-1], data[:,-1]
        num_samples,num_features = np.shape(x)

        # max_depth=None means "no limit"; comparing an int with None would raise TypeError
        depth_allowed = self.max_depth is None or depth <= self.max_depth

        # split until stopping conditions are met
        if num_samples >= self.min_samples_split and depth_allowed:
            # find the best split
            best_split = self.get_best_split(data,num_features,num_samples)
            # best_split is empty when no threshold separates the rows (e.g. duplicated samples),
            # so fall through to a leaf unless there is a split with positive information gain
            if best_split.get('info_gain', 0) > 0:
                left = self.build_tree(best_split['left'],depth+1)
                right = self.build_tree(best_split['right'],depth+1)
                #return decision node
                return Node(best_split['feature'],best_split['feature_value'],left,right,best_split['info_gain'])

        # compute leaf node
        leaf_value = self.calculate_leaf_value(y)
        # return leaf node
        return Node(value = leaf_value)

    def get_best_split(self,data,num_features,num_samples):
        ''' to find the best split

        Tries every unique value of every feature as a threshold and returns a dict with keys
        feature, feature_value, left, right, info_gain for the split with the highest gain.
        The dict is empty when every candidate leaves one side empty.
        '''

        # dictionary to store the best split
        best_split = {}
        max_info_gain = -float("inf")
        # labels of the node being split (same for every candidate)
        parent_y = data[:, -1]

        # loop over all the features
        for feature in range(num_features):
            unique_values = np.unique(data[:,feature])
            # loop over all the values of the feature
            for value in unique_values:
                # get current split
                left,right = self.split(data, feature, value)
                # check if childs are not null
                if len(left)>0 and len(right)>0:
                    # compute information gain
                    curr_info_gain = self.information_gain(parent_y, left[:, -1], right[:, -1], self.criterion)
                    # update the best split if needed
                    if curr_info_gain>max_info_gain:
                        best_split["feature"] = feature
                        best_split["feature_value"] = value
                        best_split["left"] = left
                        best_split["right"] = right
                        best_split["info_gain"] = curr_info_gain
                        max_info_gain = curr_info_gain

        # return best split
        return best_split


    def information_gain(self, parent, l_child, r_child, mode="entropy"):
        ''' function to compute information gain

        Parent impurity minus the size-weighted impurity of the two children.
        mode == "gini" uses the gini index; any other value uses entropy.
        '''

        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        impurity = self.gini_index if mode=="gini" else self.entropy
        gain = impurity(parent) - (weight_l*impurity(l_child) + weight_r*impurity(r_child))
        return gain

    def entropy(self, y):
        ''' function to compute entropy (base 2) of the labels in y '''

        _, counts = np.unique(y, return_counts=True)
        p_class = counts / len(y)
        return -np.sum(p_class * np.log2(p_class))

    def gini_index(self, y):
        ''' function to compute gini index of the labels in y '''

        _, counts = np.unique(y, return_counts=True)
        p_class = counts / len(y)
        return 1 - np.sum(p_class**2)

    def calculate_leaf_value(self, y):
        ''' function to compute leaf node: the most frequent label (first one seen wins ties) '''

        y = list(y)
        return max(y, key=y.count)

    def fit(self,x,y):
        ''' function to train the tree

        x : 2-D array, one row per sample
        y : labels, either 1-D (n,) or a column (n, 1)
        '''

        x = np.asarray(x)
        # accept 1-D labels as well as a column vector
        y = np.asarray(y).reshape(-1,1)
        data = np.concatenate((x,y),axis = 1)
        self.tree = self.build_tree(data)

    def predict(self, X):
        ''' function to predict new dataset; returns a list with one label per row of X '''

        predictions = [self.make_prediction(x,self.tree) for x in X]
        return predictions

    def make_prediction(self, x,tree):
        ''' function to predict a single data point by walking from ``tree`` down to a leaf '''

        # only leaves carry a value
        if tree.value is not None:
            return tree.value
        feature_val = x[tree.feature]
        if feature_val<=tree.feature_value:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)

In [None]:
# two-class dummy problem with 10 features (fixed seed)
x,y = make_classification(n_samples=100, n_features=10, n_classes=2, random_state=45)
# labels as a column vector: the tree's fit() concatenates x and y along axis 1
y = y.reshape(-1,1)
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=1)

In [None]:
# a shallow tree: stop splitting nodes with fewer than 3 samples or deeper than max_depth=3
dt = DecisionTreeClassifier(min_samples_split=3, max_depth=3)
dt.fit(xtrain,ytrain)
ypred = dt.predict(xtest)

In [None]:
# predict() iterates over rows, so a single sample is wrapped into a 1-row 2-D array first
dt.predict(np.expand_dims(xtest[0],axis = 0))[0]

In [None]:
# Actual labels: ytest is a column vector here, so print the single entry of each row on one line
for i in ytest:
    print(i[0],end = ' ')

In [None]:
# Predicted labels from dt.predict, cast to int for display and printed on one line
for i in ypred:
    print(int(i),end = ' ')

<h1 align = "center" id = "RFC"> 🌳🌳 Random Forest Classifier 🌲🌲 </h1>

In [None]:
class RandomForestClassifier():
    ''' Bagged ensemble of DecisionTreeClassifier trees.

    Every tree is trained on a bootstrap sample of the rows and a random subset of the
    columns; the forest predicts the label most trees vote for.

    Parameters
    ----------
    n_trees : int
        Number of trees in the forest.
    min_samples_split, max_depth
        Passed unchanged to every DecisionTreeClassifier.
    max_features : None, 'sqrt', 'log' or int
        Number of columns each tree sees: all of them (None), sqrt(n), log2(n), or the given count.
    '''

    def __init__(self, n_trees = 10 , min_samples_split = 2, max_depth = None,max_features = None):
        ''' Initializing the base parameters '''

        self.n_trees = n_trees
        self.min_samples_split =  min_samples_split
        self.max_depth = max_depth
        self.max_features = max_features
        self.trees = [self.create_tree() for _ in range(self.n_trees)]
        # column indices each tree was trained on, filled by fit() in the same order as self.trees
        self.feature_indices = []

    def _n_features_to_draw(self, num_features):
        ''' Turn max_features into an integer column count between 1 and num_features '''

        # The hyper-parameter itself is left untouched so the forest can be re-fitted
        # on data with a different number of columns.
        if self.max_features is None:
            wanted = num_features
        elif self.max_features == 'sqrt':
            wanted = int(np.sqrt(num_features))
        elif self.max_features == 'log':
            wanted = int(np.log2(num_features))
        else:
            wanted = int(self.max_features)
        return max(1, min(wanted, num_features))

    def _bootstrap(self, x, y):
        ''' Draw one bootstrap sample; returns (x_subset, y_subset, column_indices) '''

        x = np.asarray(x)
        y = np.asarray(y)
        num_samples, num_features = x.shape
        # Sample rows with replacement
        rows = np.random.choice(a=num_samples, size=num_samples, replace=True)
        # Random feature selection without replacement (size must be an integer)
        cols = np.random.choice(a = num_features,size = self._n_features_to_draw(num_features),replace = False)
        cols = np.sort(cols)
        return x[rows][:,cols], y[rows], cols

    def sample(self,x, y):
        ''' Function used for bootstrap sampling and random feature selection

        Returns (x_subset, y_subset). The chosen columns are not part of the return value;
        fit() uses _bootstrap directly so it can remember them for predict().
        '''

        sample_x, sample_y, _ = self._bootstrap(x, y)
        return sample_x, sample_y

    def create_tree(self):
        ''' Function to create the homogeneous weak learners '''
        return DecisionTreeClassifier(max_depth = self.max_depth, min_samples_split = self.min_samples_split)

    def fit(self, x, y):
        ''' Function to train the multiple trees '''

        self.feature_indices = []
        for tree in self.trees:
            sample_x,sample_y,cols = self._bootstrap(x,y)
            tree.fit(sample_x,sample_y)
            # a tree's feature numbers refer to positions inside `cols`, so keep them
            self.feature_indices.append(cols)

    def predict(self, xtest):
        ''' mode of the predictions from each tree; returns a list with one label per row '''

        if len(self.feature_indices) != len(self.trees):
            raise RuntimeError('fit must be called before predict')

        xtest = np.asarray(xtest)
        # each tree must be shown the same columns, in the same order, that it was trained on
        per_tree = [tree.predict(xtest[:,cols]) for tree, cols in zip(self.trees, self.feature_indices)]
        # zip(*per_tree) regroups the votes row by row
        return [mode(list(votes))[0] for votes in zip(*per_tree)]

In [None]:
# two-class dummy problem with 10 features (fixed seed)
x,y = make_classification(n_samples=100, n_features=10, n_classes=2, random_state=60)
# labels as a column vector, the shape the underlying trees concatenate with x
y = y.reshape(-1,1)
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=1)

In [None]:
# default forest size (n_trees=10); every tree uses min_samples_split=3 and max_depth=3
rf = RandomForestClassifier(min_samples_split=3, max_depth=3)
rf.fit(xtrain,ytrain)
ypred = rf.predict(xtest)

In [None]:
# Actual labels: ytest is a column vector here, so print the single entry of each row on one line
for i in ytest:
    print(i[0],end = ' ')

In [None]:
# Predicted labels from rf.predict, cast to int for display and printed on one line
for i in ypred:
    print(int(i),end = ' ')

Planned next: Gradient Boosting and SVM (maybe).