In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error

In [2]:
def cal_cost_mse(theta, X, y):
    """Return the halved mean squared error of the linear model X.dot(theta).

    Computes sum((X.dot(theta) - y) ** 2) / (2 * m), where m = len(y).

    Parameters
    ----------
    theta : coefficients; anything accepted by X.dot.
    X : feature matrix exposing a .dot method.
    y : targets, broadcast-compatible with X.dot(theta).

    Returns
    -------
    The cost as produced by np.sum followed by a division.
    """
    m = len(y)
    predictions = X.dot(theta)
    squared_error = np.sum(np.square(predictions - y))
    # Divide by 2*m explicitly: "(1/2*m)" parses as (1/2)*m and would scale
    # the cost up by m/2 instead of averaging it over the samples.
    return squared_error / (2 * m)

In [3]:
def cal_cost_mae(theta, X, y):
    """Return the mean absolute error between y and X.dot(theta).

    The averaging itself is delegated to mean_absolute_error, called with
    y as the first argument and the linear predictions as the second.
    """
    return mean_absolute_error(y, X.dot(theta))

In [4]:
def cal_cost_mape(theta, X, y):
    """Return the mean absolute percentage error of X.dot(theta) against y.

    Each residual (y - prediction) is divided by the matching entry of y, so
    entries of y equal to zero lead to a division by zero in that step.
    The mean of the absolute ratios is multiplied by 100.
    """
    fitted = X.dot(theta)
    relative_error = np.abs((y - fitted) / y)
    return np.mean(relative_error) * 100

In [5]:
def gradient_descent(X,y,theta,learning_rate,iterations,typ):
    """Run batch gradient descent on a linear model and record its progress.

    Parameters
    ----------
    X : feature matrix with one row per sample; must expose .T and .dot.
    y : targets, broadcast-compatible with np.dot(X, theta).
    theta : starting coefficients.
    learning_rate : step size applied to every update.
    iterations : number of updates to perform.
    typ : selects the update rule and the cost that is tracked.
        1         -> squared-error gradient, cost from cal_cost_mse.
        2         -> absolute-error subgradient, cost from cal_cost_mae.
        otherwise -> squared-error gradient (same update as typ 1), with the
                     cost taken from cal_cost_mape.

    Returns
    -------
    (theta, cost_history, theta_history)
        theta is the coefficient object after the last update;
        cost_history has one entry per iteration; theta_history has one row
        of flattened coefficients per iteration.
    """
    m = len(y)
    cost_history = np.zeros(iterations)
    # Size the history from theta itself instead of assuming nine coefficients.
    theta_history = np.zeros((iterations, np.size(theta)))

    if typ == 1:
        cost_fn = cal_cost_mse
    elif typ == 2:
        cost_fn = cal_cost_mae
    else:
        cost_fn = cal_cost_mape

    for it in range(iterations):
        residual = np.dot(X, theta) - y
        if typ == 2:
            # Subgradient of mean(|X.theta - y|) is X^T . sign(residual) / m.
            # The step has to depend on the residual; subtracting X.T on its
            # own would broadcast theta to one column per sample.
            direction = np.sign(residual)
        else:
            # NOTE(review): the MAPE case still steps along the squared-error
            # gradient; only the recorded cost differs from typ 1.
            direction = residual
        theta = theta - (1/m)*learning_rate*(X.T.dot(direction))
        cost_history[it] = cost_fn(theta, X, y)
        theta_history[it, :] = np.ravel(theta)
    return theta, cost_history, theta_history

In [6]:
# Read the weather CSV and discard every row that contains a missing value.
data = pd.read_csv("~/Desktop/assgn2_test_data/weather_test.csv", delimiter=',').dropna()

# Replace each text column by integer category codes. The categories are put
# in the order returned by unique(), and the codes follow that order.
summary_unique = data['Summary'].unique()
data['Summary'] = (
    data['Summary']
    .astype('category')
    .cat.reorder_categories(summary_unique, ordered=True)
    .cat.codes
)

precip_unique = data['Precip Type'].unique()
data['Precip Type'] = (
    data['Precip Type']
    .astype('category')
    .cat.reorder_categories(precip_unique, ordered=True)
    .cat.codes
)

daily_unique = data['Daily Summary'].unique()
data['Daily Summary'] = (
    data['Daily Summary']
    .astype('category')
    .cat.reorder_categories(daily_unique, ordered=True)
    .cat.codes
)

# The date column is dropped and takes no part in the feature matrix.
data = data.drop(columns='Formatted Date')

min_max_scaler = preprocessing.MinMaxScaler()

# Features: every remaining column except the target, min-max scaled.
x = data.loc[:, data.columns != 'Apparent Temperature (C)']
x_data = x.values
x_scaled = min_max_scaler.fit_transform(x_data)
x1 = pd.DataFrame(x_scaled)

# Target: a single column, min-max scaled as an (n_rows, 1) array.
# fit_transform is called on the same scaler object again, so from here on
# min_max_scaler carries the fit for y rather than the one for x.
y = data['Apparent Temperature (C)']
y_data = y.values.reshape(-1, 1)
y_scaled = min_max_scaler.fit_transform(y_data)
y1 = pd.DataFrame(y_scaled)

In [7]:
# Step size and number of updates handed to gradient_descent.
lr, n_iter = 0.1, 10000
# Standard-normal starting coefficients: one row per column of x, one column.
theta = np.random.standard_normal((x.shape[1], 1))

In [8]:
# typ=1 selects the branch of gradient_descent that records cal_cost_mse.
theta1, cost_history1, theta_history1 = gradient_descent(
    x1, y1, theta, lr, n_iter, typ=1
)
# Score the fit on the same scaled data it was trained on.
y_pred1 = x1.dot(theta1)
r2_score(y_true=y1, y_pred=y_pred1)

0.9897090472218933

### Use mean absolute error (MAE) for regression when you don’t want outliers to play a big role. It can also be useful if you know that your distribution is multimodal and it is desirable to have predictions at one of the modes rather than at their mean.

Behaviour of the regression coefficients: the fitted values of `theta1` are shown below.

In [10]:
# Display the coefficients returned by the gradient_descent call made with typ 1.
theta1

Unnamed: 0,0
0,0.020547
1,-0.006345
2,1.026893
3,0.013268
4,-0.082841
5,0.003432
6,-9.7e-05
7,0.004202
8,-0.004725
