In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error
import joblib

In [2]:
columns = ['gender','length','diameter','height','whole weight','shucked weight','viscera weight','shell weight','rings']
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data',names=columns)
target = 'rings'

In [3]:
def normalise_(data_):
    desc = data_.describe()
    for col in desc.columns.unique():
        xmean = desc[col][1]
        sigma = desc[col][2]
        data_[col] = (data_[col]-xmean)/sigma
    return data_

In [4]:
def kfoldsplit(k,data,target):
    
    splits = []
    
    y = pd.DataFrame(data[target])
    x = data.drop(target,axis=1)
    d = []
    s = int(len(data)/k)
    
    for i in range(k):
        a = s*i
        b = a+s
        d.append([a,b])
    for [a,b] in d:
        
        
        x_test = x[a:b]
        y_test = y[a:b]
        x_train = pd.concat([x[:a],x[b:]])
        y_train = pd.concat([y[:a],y[b:]])
        splits.append([x_train,y_train,x_test,y_test])
    
    return splits

In [5]:
from sklearn.linear_model import LinearRegression
import numpy as np
class Regression(object):
    """docstring for Regression."""
    def __init__(self, arg):
        super(Regression, self).__init__()
        self.arg = arg


    """You can give any required inputs to the fit()"""
    def fit(self,arg):
        reg_clf = LinearRegression().fit(arg[0], arg[1])
        self.model = reg_clf
        return reg_clf

        """Here you can use the fit() from the LinearRegression of sklearn"""


    """ You can add as many methods according to your requirements, but training must be using fit(), and testing must be with predict()"""


    def predict(self,X_test):
        """ Write it from scratch usig oitcomes of fit()"""
        reg_clf = self.model
        """Fill your code here. predict() should only take X_test and return predictions."""
        y_predicted = np.dot(X_test,reg_clf.coef_.T) + self.model.intercept_
        for i in range(len(y_predicted )):
            y_predicted[i]
            
        return y_predicted



In [6]:
def mse(yhat,y):
    err = 0
    for i in range(len(y)):
        err = err + (yhat[i]-y[i])**2
    return err/len(y)

In [7]:


MSE = []
S = kfoldsplit(5,df,target)
for s in range(len(S)):
    xtrain,ytrain,xtest,ytest = S[s]
    xtr = normalise_(xtrain)
    xts = normalise_(xtest)
    ytrain = normalise_(ytrain)
    ytest = normalise_(ytest)
    # xtr = xtrain
    # xts = xtest

    xtr = pd.get_dummies(xtr,columns=['gender'])
    xts = pd.get_dummies(xts,columns=['gender'])

    # xtr['gender'] = xtr['gender'].replace(['M','F','I'],[0,1,2])
    # xts['gender'] = xts['gender'].replace(['M','F','I'],[0,1,2])

    r = Regression([xtr,ytrain])
    clf = r.fit([xtr,ytrain])
    joblib.dump(r,f"savedmodels/q1fold{s}.pkl")

    yhat=r.predict(xts)
    ypred = []
    for i in yhat:
        ypred.append(i[0])

    yhtrain = r.predict(xtr)
    ypredtrain = []
    for i in yhtrain:
        ypredtrain.append(i[0])
    
    MSE.append([
    mse(ypred,list(ytest[target])),
    mean_squared_error(list(ytest[target]),ypred),
    mse(ypredtrain,list(ytrain[target])),
    mean_squared_error(list(ytrain[target]),ypredtrain)
    ])
    

    
MSE = pd.DataFrame(MSE,columns=['mse_validation_implemented','mse_validation_inbuilt','mse_train_implemented','mse_train_inbuilt'])
MSE

Unnamed: 0,mse_validation_implemented,mse_validation_inbuilt,mse_train_implemented,mse_train_inbuilt
0,0.483463,0.483463,0.468164,0.468164
1,0.423525,0.423525,0.462067,0.462067
2,0.493026,0.493026,0.459132,0.459132
3,0.447839,0.447839,0.462163,0.462163
4,0.520417,0.520417,0.450396,0.450396


In [8]:
MSE['mse_validation_implemented'].mean()

0.47365425663099475

In [9]:
MSE['mse_train_implemented'].mean()

0.4603845025042002

In [10]:

MSEjoblib = []
S = kfoldsplit(5,df,target)
for s in range(len(S)):
    xtrain,ytrain,xtest,ytest = S[s]
    xtr = normalise_(xtrain)
    xts = normalise_(xtest)
    ytrain = normalise_(ytrain)
    ytest = normalise_(ytest)
    # xtr = xtrain
    # xts = xtest

    xtr = pd.get_dummies(xtr,columns=['gender'])
    xts = pd.get_dummies(xts,columns=['gender'])

    # xtr['gender'] = xtr['gender'].replace(['M','F','I'],[0,1,2])
    # xts['gender'] = xts['gender'].replace(['M','F','I'],[0,1,2])

    r = joblib.load(f"savedmodels/q1fold{s}.pkl")
#     r.fit([xtr,ytrain])
#     joblib.dump(r,f"savedmodels/q1fold{s}.pkl")

    yhat=r.predict(xts)
    ypred = []
    for i in yhat:
        ypred.append(i[0])

    yhtrain = r.predict(xtr)
    ypredtrain = []
    for i in yhtrain:
        ypredtrain.append(i[0])
    
    MSEjoblib.append([
    mse(ypred,list(ytest[target])),
    mean_squared_error(list(ytest[target]),ypred),
    mse(ypredtrain,list(ytrain[target])),
    mean_squared_error(list(ytrain[target]),ypredtrain)
    ])

In [11]:
MSEjoblib = pd.DataFrame(MSEjoblib,columns=['mse_validation_implemented','mse_validation_inbuilt','mse_train_implemented','mse_train_inbuilt'])

In [12]:
MSEjoblib

Unnamed: 0,mse_validation_implemented,mse_validation_inbuilt,mse_train_implemented,mse_train_inbuilt
0,0.483463,0.483463,0.468164,0.468164
1,0.423525,0.423525,0.462067,0.462067
2,0.493026,0.493026,0.459132,0.459132
3,0.447839,0.447839,0.462163,0.462163
4,0.520417,0.520417,0.450396,0.450396


In [13]:

# !git add .
# !git commit -m "question one final part left, q2 main part done regularization left"
# !git push -u origin main

# !git status

In [14]:
r = Regression([xtr,ytrain])
clf = r.fit([xtr,ytrain])
theta = clf.coef_.T

In [15]:

MSEnormal = []
S = kfoldsplit(5,df,target)
for s in range(len(S)):
    xtrain,ytrain,xtest,ytest = S[s]
    xtr = normalise_(xtrain)
    xts = normalise_(xtest)
    ytrain = normalise_(ytrain)
    ytest = normalise_(ytest)
    # xtr = xtrain
    # xts = xtest

    xtr = pd.get_dummies(xtr,columns=['gender'])
    xts = pd.get_dummies(xts,columns=['gender'])

    # xtr['gender'] = xtr['gender'].replace(['M','F','I'],[0,1,2])
    # xts['gender'] = xts['gender'].replace(['M','F','I'],[0,1,2])

    xtrain,ytrain,xtest,ytest = np.array(xtr),np.array(ytrain),np.array(xts),np.array(ytest)
    ytesthat = np.dot(xtest,theta)
    ytrainhat = np.dot(xtrain,theta)

    
    MSEnormal.append([
    float(mse(ytesthat,ytest)),
    mean_squared_error(ytest,ytesthat),
    float(mse(ytrainhat,ytrain)),
    mean_squared_error(ytrain,ytrainhat)
    ])

In [16]:
MSEnormal = pd.DataFrame(MSEnormal,columns=['mse_validation_implemented','mse_validation_inbuilt','mse_train_implemented','mse_train_inbuilt'])

In [17]:
MSEnormal

Unnamed: 0,mse_validation_implemented,mse_validation_inbuilt,mse_train_implemented,mse_train_inbuilt
0,0.48079,0.48079,0.4707,0.4707
1,0.422465,0.422465,0.462743,0.462743
2,0.470587,0.470587,0.462571,0.462571
3,0.442169,0.442169,0.463058,0.463058
4,0.520284,0.520284,0.450433,0.450433


USING SKLEARN PREDICT

In [18]:
target = 'rings'
MSEsk = []
S = kfoldsplit(5,df,target)
for s in range(len(S)):
    xtrain,ytrain,xtest,ytest = S[s]
    xtr = normalise_(xtrain)
    xts = normalise_(xtest)
    ytrain = normalise_(ytrain)
    ytest = normalise_(ytest)
    # xtr = xtrain
    # xts = xtest

    xtr = pd.get_dummies(xtr,columns=['gender'])
    xts = pd.get_dummies(xts,columns=['gender'])

    # xtr['gender'] = xtr['gender'].replace(['M','F','I'],[0,1,2])
    # xts['gender'] = xts['gender'].replace(['M','F','I'],[0,1,2])

    r = Regression([xtr,ytrain])
    clf=r.fit([xtr,ytrain])
    joblib.dump(r,f"savedmodels/q1fold{s}sklearn.pkl")

    yhat=clf.predict(xts)
    ypred = []
    for i in yhat:
        ypred.append(i[0])

    yhtrain = clf.predict(xtr)
    ypredtrain = []
    for i in yhtrain:
        ypredtrain.append(i[0])
    
    MSEsk.append([
    mse(ypred,list(ytest[target])),
    mean_squared_error(list(ytest[target]),ypred),
    mse(ypredtrain,list(ytrain[target])),
    mean_squared_error(list(ytrain[target]),ypredtrain)
    ])
    

    

In [19]:
MSEsk = pd.DataFrame(MSEsk,columns=['mse_validation_implemented','mse_validation_inbuilt','mse_train_implemented','mse_train_inbuilt'])

In [20]:
MSEsk

Unnamed: 0,mse_validation_implemented,mse_validation_inbuilt,mse_train_implemented,mse_train_inbuilt
0,0.483463,0.483463,0.468164,0.468164
1,0.423525,0.423525,0.462067,0.462067
2,0.493026,0.493026,0.459132,0.459132
3,0.447839,0.447839,0.462163,0.462163
4,0.520417,0.520417,0.450396,0.450396
