In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt

In [2]:
# Read the birth-weight data set and keep only rows with no missing values.
df = pd.read_csv("BWGHT.csv").dropna()

# Response: the 'bwght' column as an (n, 1) column vector.
y = df[["bwght"]].values

# Regressors: every remaining column.
# NOTE(review): confirm none of these columns is derived from 'bwght' itself;
# the ridge fit further down reports R^2 of about 1, which hints at leakage.
x = df.drop("bwght", axis=1)

In [3]:
# Prepend a constant column of ones named "beta0" so the first coefficient
# multiplies a constant regressor.
ones = np.ones(x.shape[0])
# DataFrame.insert raises ValueError when the column already exists, so guard
# the call: this keeps the cell safe to re-run without restarting the kernel.
if "beta0" not in x.columns:
    x.insert(0,"beta0",ones)

In [4]:
class ml:
    def __init__(self,x,y):
        self.x = x
        self.y = y
        self.xx =np.dot(x.T,x)
        self.xy =np.dot(x.T,y)
        self.xxi = np.linalg.inv(self.xx)
        self.n,self.r = x.shape
        
    def tidy(self):
        self.tstat = np.divide(self.beta,self.beta.std())
        self.pval = 2*norm.cdf(-abs(self.tstat))
        names = ['Beta','t-Stat','p-value']
        value = [self.beta,self.tstat,self.pval]
        value = np.hstack(value)
        return pd.DataFrame(value,columns = names)
    def model(self):
        e = y-x.dot(self.beta)
        self.rsq = 1-e.T.dot(e)/(y.T.dot(y))
        self.rsq = self.rsq.values
        
        self.adjrsq = 1-(1-self.rsq)*(self.n-1)/(self.n-self.r)
        self.logl = -self.n/2*(np.log(2*np.pi*e.var())+1)
        self.logl =self.logl.values
        self.aic = 2*self.r-2*self.logl
        self.bic = np.log(self.n)*self.r-2*self.logl
        df = pd.DataFrame(columns=['r.squared','adj.rsq','r','logl','aic','bic'])
        df.loc[0] = [self.rsq,self.adjrsq,self.r,self.logl,self.aic,self.bic]
        return  df
    def MSE(self):
        self.error = y - x.dot(self.beta)
        return np.dot(self.error.T,self.error)/len(y)
    ##linear regression
    def OLS(self):
        self.beta = np.dot(self.xxi,self.xy).reshape(-1,1)
    def Ridge(self,lumda):
        self.lumda = lumda
        I = np.identity(self.x.shape[1])
        lumda_I = self.lumda *I
        self.beta = np.linalg.inv(self.xx+lumda_I).dot(self.xy)
    def GradientDescent(self,alpha,threshold, iterations):
        beta = np.ones(x.shape[1])
        beta = beta.reshape(-1,1)

        error = self.y-self.x.dot(beta)
        cost = error.T.dot(error)/len(self.y)
        cost = cost.values
        record = []
        for step in range(iterations):
            beta = beta - alpha/len(self.y)*x.T.dot(error)
            error_new = self.y-self.x.dot(beta)
            cost_new = error_new.T.dot(error_new)/len(self.y)
            cost_new = cost_new.values
            record.append([step,cost_new])
            if cost - cost_new<= threshold: break ## at this moment cost function converges
            if cost_new == float('inf'):break
            error = self.y-self.x.dot(beta)
            cost = cost_new
        self.beta = beta
        self.record = record
# NOTE: this rebinds `df` from the loaded DataFrame to an `ml` model object;
# the cells below call fitting/summary methods on it.
df = ml(x,y)


In [5]:
# Fit by ordinary least squares, then display the coefficient table.
df.OLS()
df.tidy()

Unnamed: 0,Beta,t-Stat,p-value
0,-690.5668,-2.978294,0.002899
1,-0.8981155,-0.003873416,0.996909
2,-6.358922,-0.02742492,0.978121
3,5.786678,0.02495693,0.980089
4,3.728406,0.01607997,0.987171
5,6.463838,0.02787741,0.97776
6,-11.44597,-0.04936448,0.960629
7,-15.97846,-0.06891232,0.945059
8,-37.20871,-0.1604747,0.872507
9,-32.0,-0.1380104,0.890232


In [6]:
# Fit statistics for the coefficients currently stored on the model (the OLS fit).
df.model()

Unnamed: 0,r.squared,adj.rsq,r,logl,aic,bic
0,[[0.7807148759864913]],[[0.778292865270964]],14,[-6403.682044575601],[12835.364089151202],[12906.519769122177]


In [7]:
# Ridge penalty strength passed to Ridge().
lumda = 10
# Refit with ridge regression (overwrites the stored coefficients), then
# display the coefficient table.
df.Ridge(lumda)
df.tidy()

Unnamed: 0,Beta,t-Stat,p-value
0,-0.365893,-0.090033,0.928261
1,0.002373,0.000584,0.999534
2,0.016805,0.004135,0.996701
3,-0.014459,-0.003558,0.997161
4,0.000151,3.7e-05,0.99997
5,-0.009567,-0.002354,0.998122
6,0.00105,0.000258,0.999794
7,0.018623,0.004583,0.996344
8,-0.002533,-0.000623,0.999503
9,-0.005016,-0.001234,0.999015


In [8]:
# Fit statistics for the coefficients currently stored on the model (the ridge fit).
df.model()

Unnamed: 0,r.squared,adj.rsq,r,logl,aic,bic
0,[[0.9999984783656459]],[[0.9999984615591493]],14,[573.0642203003068],[-1118.1284406006137],[-1046.9727606296394]


In [9]:
# Step size. Note that the literal 10e-10 equals 1e-9, so alpha is 3e-9.
# NOTE(review): if 3e-10 was intended, write 3e-10 — confirm.
alpha = 3*10e-10
# Stop once a step lowers the cost by no more than this amount.
threshold = 0.1
# Upper bound on the number of update steps.
iterations = 100
# Refit by gradient descent (overwrites the stored coefficients), then
# display the coefficient table.
df.GradientDescent(alpha,threshold,iterations)
df.tidy()

Unnamed: 0,Beta,t-Stat,p-value
0,1.0,86735.123807,0.0
1,1.000012,86736.114561,0.0
2,1.000007,86735.694354,0.0
3,1.000044,86738.909576,0.0
4,1.000004,86735.483197,0.0
5,1.000004,86735.479918,0.0
6,1.000001,86735.141103,0.0
7,1.0,86735.109729,0.0
8,1.0,86735.119769,0.0
9,1.000001,86735.152971,0.0


In [10]:
# Fit statistics for the coefficients currently stored on the model (the gradient-descent fit).
df.model()

Unnamed: 0,r.squared,adj.rsq,r,logl,aic,bic
0,[[0.0917919713722043]],[[0.08176078668897468]],14,[-5858.209179983503],[11744.418359967007],[11815.574039937981]


In [11]:
# [step, cost] history recorded by the last GradientDescent call.
df.record

[[0, array([[13344.03138122]])]]