## Ordinary Least Squares in just NumPy 

A minimal linear regression built on `np.linalg.lstsq`, written for learning purposes.

In [None]:
import numpy as np 
import pandas as pd 

import os
# Print the full path of every file found anywhere below the Kaggle input
# directory, in the order os.walk visits them.
for path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(path)


In [None]:
def preprocess(df):
    """Split a raw data frame into a feature matrix and a target vector.

    The frame is modified IN PLACE: the 'id' column is removed, a 'const'
    column of ones is appended, and the 'target' column (if there is one) is
    removed. After the call, ``df.columns`` therefore names the columns of
    the returned matrix, in order.

    Parameters:
        df: pandas DataFrame that has an 'id' column and, optionally, a
            'target' column.

    Returns:
        (X, y) where X is ``df.values`` after the changes above and y is the
        array of 'target' values, or None when the frame has no 'target'.

    Raises:
        KeyError: if the frame has no 'id' column.
    """
    # Remove the identifier so it is not used as a feature.
    del df['id']
    # Column of ones; its coefficient acts as the intercept of the model.
    df['const'] = 1

    # pop() both returns the column and removes it from the frame.
    y = df.pop('target').values if 'target' in df.columns else None

    return df.values, y


In [None]:
class OLS:
    """Ordinary least squares linear regression solved with NumPy.

    No intercept is added by the class; include a constant column in X if an
    intercept is wanted.

    Attributes (all None until ``fit`` has been called):
        X: design matrix that was passed to ``fit``.
        y: targets that were passed to ``fit``.
        W: least-squares weights; shape (n_features,) for 1-D targets or
           (n_features, n_targets) for 2-D targets.
    """

    def __init__(self):
        self.X = None
        self.y = None
        self.W = None

    def fit(self, X, y):
        """Find the least-squares solution of ``X @ W = y`` and store it.

        The training data is kept on the instance because ``score`` evaluates
        the fit on it.

        Returns:
            self, so that calls can be chained.
        """
        self.X = X
        self.y = y
        # lstsq returns (solution, residuals, rank, singular values); only
        # the solution is needed.
        self.W = np.linalg.lstsq(X, y, rcond=None)[0]
        return self

    def predict(self, X):
        """Return the model's predictions ``X @ W`` for the rows of X."""
        # No transpose: W is already laid out as (n_features[, n_targets]).
        # Transposing is a no-op for 1-D weights and breaks the matrix
        # product for 2-D (multi-target) weights.
        return X @ self.W

    def score(self):
        """Print and return the root mean squared error on the training data."""
        y_hat = self.predict(self.X)
        rmse = np.sqrt(np.mean((self.y - y_hat) ** 2))
        print(f'\nRoot mean squared error: {rmse}')
        return rmse

    def summary(self, names):
        """Print one line per coefficient, followed by the training RMSE.

        Parameters:
            names: iterable of variable names, in the same order as the
                columns of the X used in ``fit``. Pairing stops at the
                shorter of ``names`` and the weights.
        """
        print('OLS Summary\n\nVariable\tCoefficient')
        for n, w in zip(names, self.W):
            print(f'{n}\t{w}')
        self.score()
        return

In [None]:
# Read the three competition files: training data, test data and the sample
# submission that serves as the template for our own submission.
train_df, test_df, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jan-2021/train.csv',
        '/kaggle/input/tabular-playground-series-jan-2021/test.csv',
        '/kaggle/input/tabular-playground-series-jan-2021/sample_submission.csv',
    )
)

# Sanity check: predictions are later written into `sub` by position, so the
# test rows and the submission rows must carry the same ids in the same order.
assert np.all(test_df['id'].values == sub['id'].values)

# preprocess modifies each frame in place and returns (features, target);
# the target is None for a frame without a 'target' column.
X_train, y_train = preprocess(train_df)
X_test, y_test = preprocess(test_df)

In [None]:
# Fit our NumPy implementation on the training data.
ols = OLS()
ols.fit(X_train,y_train)
# preprocess modified train_df in place ('id' and 'target' removed, 'const'
# appended), so its columns now line up one-to-one with the columns of
# X_train and therefore with the fitted weights.
ols.summary(train_df.columns)

## Let's compare it with the sklearn LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


# fit_intercept=False: X_train already carries the constant column appended
# by preprocess, so sklearn must not add a second intercept term.
clf = LinearRegression(fit_intercept=False)
clf.fit(X_train, y_train)

# Same report layout as OLS.summary so the two outputs can be compared
# line by line.
print('OLS Summary\n\nVariable\tCoefficient')
for n, w in zip(train_df.columns, clf.coef_):
    print(f'{n}\t{w}')

# mean_squared_error takes (y_true, y_pred) in that order. The value is the
# same either way for plain MSE, but the documented order is used here.
print(f'\nRoot mean squared error: {np.sqrt(mean_squared_error(y_train, clf.predict(X_train)))}')

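Great! The solution is the same! 😌 (`fit_intercept=False` is used because the `const` column added in `preprocess` already plays the role of the intercept.)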

Let's submit our predictions.

In [None]:
# Put our test-set predictions into the submission frame. Rows are matched by
# position, which is why the ids of test_df and sub were compared earlier.
sub['target'] = ols.predict(X_test)

# index=False keeps the DataFrame's row index out of the CSV file.
sub.to_csv('submission.csv',index=False)