In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
import copy
import seaborn as sns
import tensorflow as tf
from sklearn.linear_model import LinearRegression


In [157]:
# Read bike_data_clean.csv into `df` and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="bike_data_clean.csv")
df.head(n=5)

Unnamed: 0,Bike_count,Temp,Humidity,Dew_pt_temp,Radiation,Rain,Snow
0,449,1.7,23,-17.2,1.11,0.0,0.0
1,479,4.3,41,-7.8,1.09,0.0,0.0
2,333,5.8,85,3.4,0.43,0.0,0.0
3,393,-0.3,38,-12.9,1.11,0.0,0.0
4,321,-2.3,25,-19.7,0.0,0.0,0.0


Manual implementation of linear regression fitted by gradient descent

In [186]:
def gradient_descent(X, y, learning_rate=0.01, n_iter=1000):
  """Fit a linear model by batch gradient descent on the halved mean squared error.

  An intercept column of ones is prepended to ``X`` internally, so callers
  pass the raw feature matrix only.

  Parameters
  ----------
  X : array-like, shape (n_samples,) or (n_samples, n_features)
      Feature values, one row per sample.
  y : array-like, shape (n_samples,) or (n_samples, 1)
      Target values; flattened to one dimension before use.
  learning_rate : float, default 0.01
      Step size applied to the gradient at every iteration.
  n_iter : int, default 1000
      Number of full passes (parameter updates) to perform.

  Returns
  -------
  theta : numpy.ndarray, shape (n_features + 1,)
      Fitted coefficients; ``theta[0]`` is the intercept.
  loss : list of float
      Cost ``sum((X_bias @ theta - y)**2) / (2 * n_samples)`` evaluated
      before each update, one entry per iteration.

  Raises
  ------
  ValueError
      If ``y`` contains no samples.
  """
  X = np.asarray(X, dtype=float)
  # Flatten so a column-vector target cannot broadcast into an (n, n) residual.
  y = np.asarray(y, dtype=float).ravel()
  n = y.shape[0]
  if n == 0:
    raise ValueError("gradient_descent requires at least one sample")

  X_bias = np.column_stack([np.ones(n), X])  # leading column of ones carries the intercept
  theta = np.zeros(X_bias.shape[1])          # coefficients start at zero
  loss = []                                  # history of the cost function

  for _ in range(n_iter):
    # Residuals are computed once per iteration from the current theta, so
    # every coefficient is updated simultaneously from the same predictions.
    residual = X_bias @ theta - y
    loss.append(0.5 * np.mean(residual ** 2))
    theta -= learning_rate * (X_bias.T @ residual) / n

  return theta, loss

def y_pred(X, theta):
  """Return the predictions for features ``X``: the dot product of ``X`` and ``theta``.

  ``X`` must already contain whatever intercept column ``theta`` expects;
  nothing is prepended here.
  """
  predictions = np.dot(X, theta)
  return predictions

def R_square_coeff(y,y_pred):
  """Compute the coefficient of determination (R squared).

  Parameters
  ----------
  y : array-like
      Observed target values.
  y_pred : array-like
      Predicted values, broadcastable against ``y``.

  Returns
  -------
  float
      ``1 - SS_res / SS_tot``. When ``y`` is constant ``SS_tot`` is zero and
      the result is ``nan`` or ``-inf`` (NumPy division semantics).
  """
  # Coerce to arrays so plain Python lists (as returned by split_data) can be
  # subtracted element-wise instead of raising TypeError.
  y = np.asarray(y, dtype=float)
  y_pred = np.asarray(y_pred, dtype=float)

  SS_res = np.sum((y - y_pred)**2)      # residual sum of squares
  SS_tot = np.sum((y - np.mean(y))**2)  # total sum of squares around the mean
  return 1 - (SS_res / SS_tot)

def split_data(df, target_name, train_frac=0.8, random_state=None):
  """Shuffle ``df`` and split it into training and validation features/targets.

  Parameters
  ----------
  df : pandas.DataFrame
      Full dataset; it is not modified.
  target_name : str
      Name of the column to predict. Every other column is used as a feature.
  train_frac : float, default 0.8
      Fraction of the rows assigned to the training set (the first
      ``int(train_frac * len(df))`` shuffled rows); the rest go to validation.
  random_state : int or None, default None
      Seed forwarded to ``DataFrame.sample`` so the shuffle can be reproduced.

  Returns
  -------
  X_train, y_train, X_val, y_val
      ``X_*`` are lists of per-row feature lists and ``y_*`` are lists of
      target values, ready to be passed to ``gradient_descent``.

  Raises
  ------
  ValueError
      If ``train_frac`` is not strictly between 0 and 1.
  """
  if not 0.0 < train_frac < 1.0:
    raise ValueError("train_frac must be strictly between 0 and 1")

  # sample() returns a new shuffled frame, so the caller's df is left untouched.
  shuffled = df.sample(frac=1, random_state=random_state)
  cut = int(train_frac * len(shuffled))

  # Positional slicing replaces np.split on a DataFrame, which emits a
  # FutureWarning on recent pandas versions.
  train = shuffled.iloc[:cut]
  val = shuffled.iloc[cut:]

  X_train = train.drop(columns=[target_name]).to_numpy().tolist()  # feature rows for training
  y_train = train[target_name].tolist()                            # targets for training

  X_val = val.drop(columns=[target_name]).to_numpy().tolist()      # feature rows for validation
  y_val = val[target_name].tolist()                                # targets for validation

  return X_train,y_train,X_val,y_val

def scale_features(X):
    """Standardise each feature column to zero mean and unit standard deviation.

    Scaling keeps the features on comparable ranges so gradient descent does
    not diverge.

    Parameters
    ----------
    X : array-like, shape (n_samples,) or (n_samples, n_features)
        Raw feature values.

    Returns
    -------
    numpy.ndarray
        ``(X - mean) / std`` computed column-wise. Columns with zero standard
        deviation (constant features) are only centred, so they come out as
        zeros instead of NaN.
    """
    X = np.asarray(X, dtype=float)
    means = X.mean(axis=0)
    stds = X.std(axis=0)
    # A constant column has std 0; dividing by it would yield 0/0 = NaN.
    stds = np.where(stds == 0, 1.0, stds)
    return (X - means) / stds


def Linear_Regression(df,target_name,learning_rate,n_iter):
  """Split ``df``, fit a linear model by gradient descent and score it on validation data.

  Parameters
  ----------
  df : pandas.DataFrame
      Dataset containing the target column and the feature columns.
  target_name : str
      Name of the column to predict.
  learning_rate : float
      Step size passed to ``gradient_descent``.
  n_iter : int
      Number of gradient-descent iterations.

  Returns
  -------
  theta : numpy.ndarray
      Fitted coefficients (intercept first), expressed on the standardised
      scale of the training features.
  loss : list of float
      Training cost recorded at each iteration.
  R2 : float
      R squared of the model on the validation split.
  """
  X_train_raw, y_train, X_val_raw, y_val = split_data(df, target_name)

  X_train_raw = np.asarray(X_train_raw, dtype=float)
  X_val_raw = np.asarray(X_val_raw, dtype=float)

  # Standardise with statistics from the TRAINING set only and reuse them for
  # the validation set: theta is learned in the training feature space, so
  # validation rows must be mapped into that same space (scaling validation
  # data with its own mean/std would shift and stretch it differently).
  means = X_train_raw.mean(axis=0)
  stds = X_train_raw.std(axis=0)
  stds = np.where(stds == 0, 1.0, stds)  # constant columns: centre only, avoid 0/0
  X_train = (X_train_raw - means) / stds
  X_val = (X_val_raw - means) / stds

  theta, loss = gradient_descent(X_train, y_train, learning_rate, n_iter)

  # Prepend the intercept column expected by theta before predicting.
  X_val_bias = np.column_stack([np.ones(len(X_val)), X_val])
  y_val_pred = y_pred(X_val_bias, theta)

  R2 = R_square_coeff(y_val, y_val_pred)
  return theta,loss,R2







In [195]:
# Fit the model on every column of `df` except the target, then show the
# coefficients and the validation R squared.
theta, loss, R2 = Linear_Regression(
    df,
    target_name="Bike_count",
    learning_rate=0.01,
    n_iter=10_000,
)
print(theta)
print(R2)




  return bound(*args, **kwds)


[ 704.22260274   72.7740879  -107.79378203  147.8710137   127.20637334
  -29.7992401    -7.17157979]
0.4287195514023857


In [190]:
# Build a reduced frame by dropping the listed weather columns from `df`,
# then preview its first five rows.
dropped_columns = ["Humidity", "Dew_pt_temp", "Radiation", "Rain", "Snow"]
df_5 = df.drop(columns=dropped_columns)
df_5.head(n=5)

Unnamed: 0,Bike_count,Temp
0,449,1.7
1,479,4.3
2,333,5.8
3,393,-0.3
4,321,-2.3


In [194]:
# Refit on the reduced frame `df_5` with more iterations, then show the
# coefficients and the validation R squared.
theta, loss, R2 = Linear_Regression(
    df_5,
    target_name="Bike_count",
    learning_rate=0.01,
    n_iter=15_000,
)
print(theta)
print(R2)

  return bound(*args, **kwds)


[701.64041096 236.04321425]
0.4423426624905873
