### Data filtering & cleaning

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("Gradient Descent/insurance.csv")

In [3]:
data.shape

(1338, 7)

In [4]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
data.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [6]:
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [7]:
data['gender'] = "temp" # creating a new column "gender" & inserting temp in each row
def func(s):
    if s == "male":
        return 0
    else:
        return 1

data['gender'] = data.sex.apply(func) 

In [8]:
data['somke'] = 0 # creating a new column "gender" & inserting temp in each row
# Considering if (he/she) smokes = 1, if NOT smoke = 0
def function(s):
    if s == "yes":
        return 1
    else:
        return 0

data['smoke'] = data.smoker.apply(function) 

In [9]:
data['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [10]:
data['location'] = 0 # creating column location with all 0's
# Let's consider southwest = 1, southeast = 2, northwest = 3, northeast = 4
def convert(s):
    if s == "southwest":
        return 1
    elif s == "southeast":
        return 2
    elif s == "northwest":
        return 3
    else:
        return 4

data['location'] = data.region.apply(convert) 

In [11]:
data.drop('sex', axis = 1, inplace = True)
data.drop('region', axis = 1, inplace = True)
data.drop('smoker', axis = 1, inplace = True)
data.head()

Unnamed: 0,age,bmi,children,charges,gender,somke,smoke,location
0,19,27.9,0,16884.924,1,0,1,1
1,18,33.77,1,1725.5523,0,0,0,2
2,28,33.0,3,4449.462,0,0,0,2
3,33,22.705,0,21984.47061,0,0,0,3
4,32,28.88,0,3866.8552,0,0,0,3


#### For n - features linear regression will be "Y = m0 * x0 + m1 * x1 + ..... + mn * xn"  considering x0 as dummy feature => x0 = 1, (therefore m0 * x0) will work as an intercept. for n - features we have to find m0, m1,..., mn.  For this dataset we're goining to use multiple regression model
#### Here Y(output value) will be "charges" & other attributes will be feature's

#### CREATING OWN GRADIENT DESCENT FUNCTION

In [12]:
# update m & c at each gradient step -----> Try'in finding optimal
""" cost = (1/M) * sum((Y_actual - Y_pred)**2) where Y_pred = m0*x0 + m1*x1 +...+ mn*xn
Here M = no. of training example & N = no. of features,  m_slope: d(cost)/d(m)  &   c_slope: d(cost)/d(c)
m_slope: (-2/M) * sum(Y_actual - Y_pred)*xi, here m_slope have slope of all features & intercept too"""

def step_gradient(data, learning_rate, m):
    N = len(data.columns); M = len(data)
    m_slope = np.zeros(N); 
    for i in range(M):            # For each training set
        y = data.iat[i, 3]
        for j in range(N):        # For each features
            x = data.iat[i, j] 
            if (j == 3):
                m_slope[j] += (-2/M)*(y - m[3]*x)
                continue
            m_slope[j] += (-2/M)*(y - m[j]*x)*x 
            
    new_m = m - learning_rate * m_slope
    return new_m

In [13]:
# Cost function to verify wheather given learning_rate, iteration is good enough for data

def cost(data, m):
    total_cost = 0; M = len(data); N = len(data.columns)
    for i in range(M):
        y = data.iat[i, 3]
        for j in range(N):
            x = data.iat[i, j]
            total_cost += (1/M)*((y - m[j]*x - m[3])**2) # here m[3] is used as intercept
    return total_cost

In [14]:
# It will find optimal m (& update) then print it

def own_gradient_descent(data, learning_rate, iteration) :
    N = len(data.columns) # no. of features
    m = np.zeros(N) # array of intercept & slope
    for i in range(iteration):
        m = step_gradient(data, learning_rate, m)
        print(i, "Cost : ", cost(data, m))
    print(m) # updated m & c after all gradient descent steps taken

# If cost is looks constant after certain iteration ==> We're near about optimal we update 
# the learning rate & iteration(or no. of descent steps) in order to come close to least cost

In [21]:
own_gradient_descent(data, 0.00007, 10)

0 Cost :  2368692825.3078732
1 Cost :  2225287678.9813128
2 Cost :  2126454644.276239
3 Cost :  2057967066.7724466
4 Cost :  2009502888.0879629
5 Cost :  1975215686.9315813
6 Cost :  1950422433.8215425
7 Cost :  1932606526.5799863
8 Cost :  1919485444.6761715
9 Cost :  1909939843.650699
[308.89088088 332.28159548  21.69447828   0.78414619   8.70392762
   0.           9.1875059   46.0562146 ]
