In [2]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
dataset = pd.read_csv("possum.csv")
dataset.head()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [4]:
import statsmodels.formula.api as sm

In [7]:
dataset["sex"].replace(to_replace = "f", value = 0, inplace = True) 
dataset["sex"].replace(to_replace = "m", value = 1, inplace = True) 
dataset.describe

"""encoding the data"""

<bound method NDFrame.describe of      case  site    Pop  sex  age  hdlngth  skullw  totlngth  taill  footlgth  \
0       1     1    Vic    1  8.0     94.1    60.4      89.0   36.0      74.5   
1       2     1    Vic    0  6.0     92.5    57.6      91.5   36.5      72.5   
2       3     1    Vic    0  6.0     94.0    60.0      95.5   39.0      75.4   
3       4     1    Vic    0  6.0     93.2    57.1      92.0   38.0      76.1   
4       5     1    Vic    0  2.0     91.5    56.3      85.5   36.0      71.0   
..    ...   ...    ...  ...  ...      ...     ...       ...    ...       ...   
99    100     7  other    1  1.0     89.5    56.0      81.5   36.5      66.0   
100   101     7  other    1  1.0     88.6    54.7      82.5   39.0      64.4   
101   102     7  other    0  6.0     92.4    55.0      89.0   38.0      63.5   
102   103     7  other    1  4.0     91.5    55.2      82.5   36.5      62.9   
103   104     7  other    0  3.0     93.6    59.9      89.0   40.0      67.6   

     

In [8]:
dataset.fillna({"age" : dataset["age"].mean(), "footlgth": dataset["footlgth"].mean()}, inplace = True) #handling the null values

In [9]:
"""statsmodel is used for feature selection. attributes with lower p values (less than the confidence level which is usually 0.05)
are chosen."""

reg_model3 = sm.ols(formula = "totlngth~hdlngth+taill+footlgth+belly+earconch+chest+sex+eye+age",data = dataset ).fit()
print(reg_model3.summary())


                            OLS Regression Results                            
Dep. Variable:               totlngth   R-squared:                       0.748
Model:                            OLS   Adj. R-squared:                  0.724
Method:                 Least Squares   F-statistic:                     31.07
Date:                Sat, 04 Feb 2023   Prob (F-statistic):           1.81e-24
Time:                        18:26:59   Log-Likelihood:                -227.25
No. Observations:                 104   AIC:                             474.5
Df Residuals:                      94   BIC:                             501.0
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    -24.3088      7.840     -3.100      0.0

In [59]:
X_df = dataset.loc[0:104,["hdlngth", "taill", "footlgth","sex"]]
Y_df = dataset.loc[0:104,["totlngth"]]

X_arr = np.array(X_df)
Y_arr = np.array(Y_df)



In [60]:
"""a function is written inorder to normalize the datast"""

def normalizer(X_arr):
    n_col = X_arr.shape[1]
    mean_arr = np.mean(X_arr, axis = 0)
    std_arr = np.std(X_arr, axis = 0)
    
    X_copy = X_arr.copy()
    X_copy = ((X_copy - mean_arr) / std_arr)
    return X_copy  

X = normalizer(X_arr)
Y = normalizer(Y_arr)

In [13]:
Y.shape

(104, 1)

In [61]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [62]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, random_state = 0, train_size = 0.80)

In [63]:
X_train

array([[ 1.51769417e+00,  2.51469629e-01, -7.94705255e-01,
         8.39593969e-01],
       [ 1.67911648e-01,  5.07870036e-01,  1.75535514e+00,
        -1.19105191e+00],
       [ 3.36634464e-01,  5.07870036e-01, -5.19023050e-01,
         8.39593969e-01],
       [-7.31943368e-01, -5.17731590e-01, -1.48391077e+00,
        -1.19105191e+00],
       [-5.70521059e-02,  2.04627248e+00, -9.09572840e-01,
         8.39593969e-01],
       [ 7.58441502e-01, -5.17731590e-01,  6.98573354e-01,
         8.39593969e-01],
       [ 1.65829652e+00,  2.30267288e+00,  7.44520388e-01,
         8.39593969e-01],
       [-8.72545714e-01,  1.78987207e+00, -6.56864153e-01,
         8.39593969e-01],
       [-1.65991885e+00,  5.07870036e-01, -1.41499022e+00,
         8.39593969e-01],
       [ 1.20836901e+00,  7.64270443e-01, -1.25417560e+00,
        -1.19105191e+00],
       [ 1.67911648e-01, -1.03053240e+00, -6.56864153e-01,
         8.39593969e-01],
       [ 1.46145323e+00,  5.07870036e-01, -6.79837670e-01,
      

In [35]:
print(X_test)

[[-0.59134102 -1.0305324   0.42289115 -1.19105191]
 [-0.36637727  1.02067085 -0.67983767 -1.19105191]
 [ 0.3928754   1.02067085  1.59454052 -1.19105191]
 [-0.16953398  0.50787004 -1.00146691 -1.19105191]
 [-1.23811181  0.76427044 -1.87446056 -1.19105191]
 [ 0.78656197 -0.00493078  0.12423543  0.83959397]
 [ 0.58971869 -0.26133118  1.08912314 -1.19105191]
 [-0.87254571 -0.26133118 -0.56497008  0.83959397]
 [ 0.70220056  2.04627248 -0.28928788 -1.19105191]
 [-0.3382568  -1.0305324   0.99722908  0.83959397]
 [ 0.19603212 -0.26133118 -0.86362581  0.83959397]
 [-0.45073868  0.50787004 -1.07038746  0.83959397]
 [ 1.40521229  1.53347166 -0.12847326  0.83959397]
 [ 0.78656197 -0.51773159  1.34183183  0.83959397]
 [ 0.61783916 -0.00493078  0.97425556 -1.19105191]
 [ 0.05542977 -0.00493078 -0.10549974  0.83959397]
 [-0.02893164 -0.51773159  0.99722908 -1.19105191]
 [ 0.89904385 -0.774132    0.88236149  0.83959397]
 [-0.56322055 -0.26133118  1.06614963  0.83959397]
 [ 0.22415259 -0.00493078  0.90

In [64]:
model = LinearRegression()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
print(mean_squared_error(Y_test, Y_pred))
print("coef = ", model.coef_)
print("intercept = ", model.intercept_)


0.3992933838427882
coef =  [[ 0.40102371  0.45312113  0.32487889 -0.10717   ]]
intercept =  [-0.03747195]


In [71]:
def gradient_descent(X_arr, Y_arr, coefs):
#     X = normalizer(X_arr)
#     Y = normalizer(Y_arr)
    
   
    n = len(Y_arr)
    L      = 0.01 #learning rate
    epochs = 1000
    
    variable_array = []
    for i in range(X.shape[1]):
        variable_array.append(X[: , i])
    
    for i in range(len(variable_array)):
        variable_array[i] = variable_array[i].reshape(len(variable_array[i]),1)
        
    variable_array = np.array(variable_array)  
    print(variable_array)
    coef_list = []
    for i in range(X.shape[1]+1):
        coef_list.append([])
    """an array to have the attributes and another array to have the coefficients are created."""
    
    Y_pred = np.zeros((len(variable_array[0]), 1))
    for i in range(len(coefs)):
        if i == len(coefs)-1:
            Y_pred += coefs[i]
        else:
            Y_pred += coefs[i]*variable_array[i] #y_pred is calculated
    
    diff_list = [1]*len(coefs)
    for i in range(epochs):
        Y_pred = np.zeros((len(variable_array[0]), 1))
        for j in range(len(coefs)):
            if j == len(coefs)-1:
                Y_pred += coefs[j]
            else:
                Y_pred += coefs[j]*variable_array[j]
        
        
        for j in range(len(coefs)):
            if j == len(coefs) - 1:
                diff_list[j] = (-1/n) * sum(Y - Y_pred)
            else:
                diff_list[j] = (-1/n) * sum(variable_array[j] * (Y - Y_pred))
#                 print(diff_list[j])
        for j in range(len(coefs)):          
            coefs[j] = coefs[j] - L*diff_list[j]
            coef_list[j].append(float(coefs[j]))
            """in the above loops, the coefficients and the intercept are updated"""
            
    print(coefs)
    return coefs, coef_list
# coefs = [ 0.49125088,  0.98024308,  0.3052852,  -1.00418384,-15.15591079 ]
coefs = [1,1,1,1,0] #coefs are initially set to 1
coefs, coef_list = gradient_descent(X_train, Y_train, coefs)
coef_list = np.array(coef_list)



[[[ 4.20995871e-01]
  [-2.89316366e-02]
  [ 3.92875402e-01]
  [ 1.67911648e-01]
  [-3.10136329e-01]
  [ 1.39791179e-01]
  [ 7.58441502e-01]
  [ 6.17839156e-01]
  [ 2.24152587e-01]
  [-2.25774921e-01]
  [ 1.96032117e-01]
  [ 6.45959625e-01]
  [ 7.02200564e-01]
  [ 7.86561972e-01]
  [ 8.35502404e-02]
  [-2.82015860e-01]
  [ 5.89718687e-01]
  [ 2.52273056e-01]
  [ 5.05357279e-01]
  [ 6.17839156e-01]
  [ 9.27164318e-01]
  [ 1.03964619e+00]
  [-2.89316366e-02]
  [ 5.05357279e-01]
  [ 8.99043849e-01]
  [ 9.55284787e-01]
  [-5.91341022e-01]
  [ 3.36634464e-01]
  [ 5.54297711e-02]
  [-1.41413514e-01]
  [ 5.54297711e-02]
  [ 4.77236810e-01]
  [-3.38256798e-01]
  [-5.63220552e-01]
  [ 5.05357279e-01]
  [ 1.96032117e-01]
  [-9.28786653e-01]
  [-5.70521059e-02]
  [-2.22232824e+00]
  [-4.50738675e-01]
  [-1.18187088e+00]
  [-2.05360542e+00]
  [-7.31943368e-01]
  [-2.10984636e+00]
  [-5.35100083e-01]
  [-3.38256798e-01]
  [-7.03822899e-01]
  [ 1.68641699e+00]
  [ 7.86561972e-01]
  [-2.82015860e-01]


[array([0.45134139]), array([0.46936671]), array([0.30882051]), array([-0.15102139]), array([5.11665676e-16])]


In [69]:
def error_calc(coefs, coef_list, X_arr, Y_arr): # this function calculates the error for every set of coefs
    X = normalizer(X_arr) 
    Y = normalizer(Y_arr)
    n = len(Y_arr)
    variable_array = []
    for i in range(X.shape[1]):
        variable_array.append(X[: , i])
    
    for i in range(len(variable_array)):
        variable_array[i] = variable_array[i].reshape(len(variable_array[i]),1)
        
    variable_array = np.array(variable_array)    
    error_list = []
    print(variable_array)
    for h in range(len(coef_list[0])):
        Y_pred = np.zeros((len(variable_array[0]), 1))
        for i in range(len(coefs)):
            if i == len(coefs)-1:
                Y_pred += coef_list[i][h]
            else:
                Y_pred += coef_list[i][h]*variable_array[i]
        
        error = (((Y - Y_pred)**2).sum())/n
        error_list.append(error)
    

    return error_list, Y_pred
        
error_list, Y_pred = error_calc(coefs, coef_list, X_train, Y_train)
print(error_list[-1])
print(coefs)

[[[ 1.45166292e+00]
  [ 1.59854773e-01]
  [ 3.21330792e-01]
  [-7.01350659e-01]
  [-5.54465847e-02]
  [ 7.25020839e-01]
  [ 1.58622627e+00]
  [-8.35914008e-01]
  [-1.58946876e+00]
  [ 1.15562356e+00]
  [ 1.59854773e-01]
  [ 1.39783758e+00]
  [-1.80477012e+00]
  [ 1.15562356e+00]
  [ 3.21330792e-01]
  [ 4.82806811e-01]
  [-3.51485952e-01]
  [-1.62124517e-03]
  [ 8.32671518e-01]
  [-2.16922603e-01]
  [ 2.67505453e-01]
  [-2.85339149e-02]
  [ 2.67505453e-01]
  [-7.01350659e-01]
  [-1.96624614e+00]
  [-2.97660613e-01]
  [-2.70747943e-01]
  [ 2.40592783e-01]
  [-1.13195338e+00]
  [-2.12772216e+00]
  [ 1.32942104e-01]
  [ 5.90457490e-01]
  [ 2.82420908e+00]
  [ 4.02068801e-01]
  [ 4.82806811e-01]
  [ 5.22040944e-02]
  [ 1.96300365e+00]
  [ 8.05758848e-01]
  [ 6.17370160e-01]
  [ 1.86767443e-01]
  [ 4.82806811e-01]
  [ 1.32942104e-01]
  [ 1.86767443e-01]
  [ 4.55894141e-01]
  [-1.83168279e+00]
  [ 5.09719480e-01]
  [-5.39874641e-01]
  [-5.12961971e-01]
  [-3.24573282e-01]
  [-2.70747943e-01]


In [70]:
def test_predictor(coefs, X_test, Y_test): # predicts the output of test data and at last ret
    Y_pred = np.zeros((len(X_test), 1))
    variable_array = []
    for i in range(X_test.shape[1]):
        variable_array.append(X_test[: , i])
    
    for i in range(len(variable_array)):
        variable_array[i] = variable_array[i].reshape(len(variable_array[i]),1)
        
    variable_array = np.array(variable_array)    
    for i in range(len(coefs)):
        if i == len(coefs)-1:
            Y_pred += float(coefs[i])
        else:
            Y_pred += float(coefs[i])*variable_array[i]
    error = (((Y_test - Y_pred)**2).sum())/len(Y_test)
    print(coefs)
    return Y_pred , error
print(test_predictor(coefs, X_test, Y_test))

[array([0.45131474]), array([0.46938541]), array([0.3088443]), array([-0.15101017]), array([5.1292705e-16])]
(array([[-0.44012931],
       [ 0.28363351],
       [ 1.32872416],
       [ 0.03243721],
       [-0.59909621],
       [ 0.26425475],
       [ 0.65971412],
       [-0.8177328 ],
       [ 1.36791994],
       [-0.45517586],
       [-0.427706  ],
       [-0.42240852],
       [ 1.18751677],
       [ 0.39960124],
       [ 0.75727971],
       [-0.13666839],
       [ 0.23177654],
       [ 0.18811056],
       [-0.17436778],
       [ 0.55831744],
       [-1.31175072]]), 0.3483715407688755)
