In [128]:
import os
import io
import pandas as pd
import numpy as np

# Predicting MPG (miles per gallon) ratings
# of cars based on
# Gallons per 100 miles, Cylinders, Displacement (100 cubic inches), Horsepower (100), Weight (1000 lbs)

df = pd.read_csv('AutoMPGModels.csv', header=0, error_bad_lines=False)

df.columns

b'Skipping line 394: expected 15 fields, saw 301\nSkipping line 395: expected 15 fields, saw 301\nSkipping line 396: expected 15 fields, saw 301\nSkipping line 397: expected 15 fields, saw 301\nSkipping line 398: expected 15 fields, saw 301\nSkipping line 399: expected 15 fields, saw 301\nSkipping line 400: expected 15 fields, saw 301\nSkipping line 401: expected 15 fields, saw 301\nSkipping line 402: expected 15 fields, saw 301\nSkipping line 404: expected 15 fields, saw 301\nSkipping line 405: expected 15 fields, saw 301\nSkipping line 415: expected 15 fields, saw 301\nSkipping line 425: expected 15 fields, saw 301\nSkipping line 435: expected 15 fields, saw 301\nSkipping line 445: expected 15 fields, saw 301\nSkipping line 455: expected 15 fields, saw 301\nSkipping line 465: expected 15 fields, saw 301\nSkipping line 476: expected 15 fields, saw 301\nSkipping line 477: expected 15 fields, saw 301\nSkipping line 478: expected 15 fields, saw 301\nSkipping line 479: expected 15 fields,

Index(['GallonsPer100Miles', 'GallonsPer100MilesTo1981', 'MPG', 'Cylinders',
       'Displacement100ci', 'Horsepower100', 'Weight1000lb', 'Seconds0to60',
       'Year', 'Year70To81', 'Origin', 'Origin.Eq.1', 'Origin.Eq.2',
       'Origin.Eq.3', 'Name'],
      dtype='object')

In [129]:
# change column indexes to numeric
# for convenience
df.columns = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 , 12, 13, 14]

# drop less important columns
df.drop(axis=1, columns=[1, 7, 8, 9, 10, 11, 12, 13, 14], inplace=True)

df.head()

Unnamed: 0,0,2,3,4,5,6
0,5.6,18.0,8.0,3.07,1.3,3.504
1,6.7,15.0,8.0,3.5,1.65,3.693
2,5.6,18.0,8.0,3.18,1.5,3.436
3,6.3,16.0,8.0,3.04,1.5,3.433
4,5.9,17.0,8.0,3.02,1.4,3.449


In [130]:
# rename column names for convenience
df.columns = ['x1', 'y', 'x2', 'x3', 'x4', 'x5']

df.head()

Unnamed: 0,x1,y,x2,x3,x4,x5
0,5.6,18.0,8.0,3.07,1.3,3.504
1,6.7,15.0,8.0,3.5,1.65,3.693
2,5.6,18.0,8.0,3.18,1.5,3.436
3,6.3,16.0,8.0,3.04,1.5,3.433
4,5.9,17.0,8.0,3.02,1.4,3.449


In [131]:
# drop rows with null values
df.dropna(axis=0, inplace=True)

df

Unnamed: 0,x1,y,x2,x3,x4,x5
0,5.6,18.0,8.0,3.07,1.30,3.504
1,6.7,15.0,8.0,3.50,1.65,3.693
2,5.6,18.0,8.0,3.18,1.50,3.436
3,6.3,16.0,8.0,3.04,1.50,3.433
4,5.9,17.0,8.0,3.02,1.40,3.449
...,...,...,...,...,...,...
387,3.7,27.0,4.0,1.40,0.86,2.790
388,2.3,44.0,4.0,0.97,0.52,2.130
389,3.1,32.0,4.0,1.35,0.84,2.295
390,3.6,28.0,4.0,1.20,0.79,2.625


In [132]:
def make_numeric(column_series):
    column_series = pd.to_numeric(column_series)
    return column_series

df = df.apply(make_numeric, axis=1)

df['x1']

0      5.6
1      6.7
2      5.6
3      6.3
4      5.9
      ... 
387    3.7
388    2.3
389    3.1
390    3.6
391    3.2
Name: x1, Length: 392, dtype: float64

In [133]:
thetas = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]

In [134]:
count = 0
for i in range(10):
    
    print("iteration number:", count)
    count += 1
    h = sum([(thetas[0] + (thetas[i] * df['x' + str(i)])) for i in range(1, 6)])
    
    print('h: ', h)
    
    sq_error = np.sum((h - df['y']) ** 2)

    m = len(df)

    J = (1/(2*m))*sq_error

    print('J: ', J)


    # Gradinet Descent

    alpha = 0.01
    diff_J_theta0 = 0
    diff_J_theta1 = 0
    diff_J_theta2 = 0
    diff_J_theta3 = 0
    diff_J_theta4 = 0
    diff_J_theta5 = 0

    diff_J_theta0 = np.sum(h - df['y'])
    diff_J_theta1 = np.sum((h - df['y']) * df['x1'])
    diff_J_theta2 = np.sum((h - df['y']) * df['x2'])
    diff_J_theta3 = np.sum((h - df['y']) * df['x3'])
    diff_J_theta4 = np.sum((h - df['y']) * df['x4'])
    diff_J_theta5 = np.sum((h - df['y']) * df['x5'])

    thetas[0] = thetas[0] - (alpha/m) * diff_J_theta0 
    thetas[1] = thetas[1] - (alpha/m) * diff_J_theta1
    thetas[2] = thetas[2] - (alpha/m) * diff_J_theta1
    thetas[3] = thetas[3] - (alpha/m) * diff_J_theta1
    thetas[4] = thetas[4] - (alpha/m) * diff_J_theta1
    thetas[5] = thetas[5] - (alpha/m) * diff_J_theta1

    print ('thetas: ', thetas[0],thetas[1], thetas[2], thetas[3], thetas[4], thetas[5])

iteration number: 0
h:  0      13.2370
1      14.2715
2      13.3580
3      13.6365
4      13.3845
        ...   
387     8.8750
388     7.4600
389     8.2925
390     8.6075
391     8.4650
Length: 392, dtype: float64
J:  134.81848304017856
thetas:  0.6283440306122449 0.9500493596938775 0.9500493596938775 0.9500493596938775 0.9500493596938775 0.9500493596938775
iteration number: 1
h:  0      23.543080
1      25.508732
2      23.772992
3      24.302170
4      23.823345
         ...    
387    15.254849
388    12.566210
389    14.148042
390    14.746573
391    14.475809
Length: 392, dtype: float64
J:  90.50204503956822
thetas:  0.6772592143005276 0.9819336754779298 0.9819336754779298 0.9819336754779298 0.9819336754779298 0.9819336754779298
iteration number: 2
h:  0      24.472340
1      26.503961
2      24.709968
3      25.256905
4      24.762010
         ...    
387    15.905950
388    13.127078
389    14.761998
390    15.380616
391    15.100765
Length: 392, dtype: float64
J:  89.1366390