In [1]:
# Import packages
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score
%matplotlib inline

In [2]:
url = 'https://raw.githubusercontent.com/uoe-iaml/DL-S2-2021-CW1/main/datasets/Melbourne_housing.csv'
aushouse = pd.read_csv(url, delimiter=',')
aushouse.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         34857 non-null  object 
 1   Address        34857 non-null  object 
 2   Rooms          34857 non-null  int64  
 3   Type           34857 non-null  object 
 4   Price          27247 non-null  float64
 5   Method         34857 non-null  object 
 6   SellerG        34857 non-null  object 
 7   Date           34857 non-null  object 
 8   Distance       34856 non-null  float64
 9   Postcode       34856 non-null  float64
 10  Bedroom2       26640 non-null  float64
 11  Bathroom       26631 non-null  float64
 12  Car            26129 non-null  float64
 13  Landsize       23047 non-null  float64
 14  BuildingArea   13742 non-null  float64
 15  YearBuilt      15551 non-null  float64
 16  CouncilArea    34854 non-null  object 
 17  Lattitude      26881 non-null  float64
 18  Longti

In [3]:
# Transform these values to a one-hot-encoding
aushouse = pd.get_dummies(aushouse, columns=['Type'])

# Remove features that cannot use and 'YearBuilt'
not_use = [column for column in aushouse.columns if aushouse[column].dtype == 'O']
aushouse.drop(not_use+['YearBuilt'], axis=1, inplace=True)
aushouse_new = aushouse.copy(deep=True)

# Check the features
aushouse.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rooms          34857 non-null  int64  
 1   Price          27247 non-null  float64
 2   Distance       34856 non-null  float64
 3   Postcode       34856 non-null  float64
 4   Bedroom2       26640 non-null  float64
 5   Bathroom       26631 non-null  float64
 6   Car            26129 non-null  float64
 7   Landsize       23047 non-null  float64
 8   BuildingArea   13742 non-null  float64
 9   Lattitude      26881 non-null  float64
 10  Longtitude     26881 non-null  float64
 11  Propertycount  34854 non-null  float64
 12  Type_h         34857 non-null  uint8  
 13  Type_t         34857 non-null  uint8  
 14  Type_u         34857 non-null  uint8  
dtypes: float64(11), int64(1), uint8(3)
memory usage: 3.3 MB


In [4]:
# Remove 'BuildingArea' feature to increase our instances
aushouse_new.drop(['Postcode', 'Landsize', 'Propertycount', 'Bedroom2', 'BuildingArea'], axis=1, inplace=True)
# Remove any instances with missing attribute values
aushouse_new.dropna(how='any', inplace=True)
# Create X and y
X = aushouse_new.drop(['Price'], axis=1)
y = aushouse_new['Price']
print('Total', X.shape, y.shape)

Total (20401, 9) (20401,)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)
lm1 = LinearRegression(normalize=True).fit(X_train,y_train)
lm1.coef_

array([ 1.70230631e+05, -4.52257303e+04,  2.21976396e+05,  4.23398902e+04,
       -1.59996288e+06,  8.84131871e+05, -1.31065268e+19, -1.31065268e+19,
       -1.31065268e+19])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)
lm2 = LinearRegression(normalize=False).fit(X_train,y_train)
lm2.coef_

array([  173155.9318304 ,   -45180.27190882,   223058.60848251,
          41075.21608809, -1624574.71825128,   877765.00418618,
         266488.8675945 ,   -61251.30273618,  -205237.56485831])

In [7]:
from scipy.optimize import nnls
nnls(X_train, y_train)

(array([115779.75638113,      0.        , 270540.33530682,      0.        ,
             0.        ,    382.74309985, 317130.09440096,  72029.03998084,
             0.        ]),
 70928549.75450137)

In [8]:
def normalize_self(X, y):
    #Steps for 'normalize' in the linear regression:
    # 1. minus column average value:
    X_offset = np.average(X, axis=0)
    X -= X_offset
    # 2. use the 'normalize' function to normalize X_train_sub further:
    # X_train_sub, X_scale = normalize(X_train_sub, axis=0, copy=False, return_norm=True)
    # 2.1 if axis=0, transpose X:
    X = X.T
    # 2.2 calculate the norm of X by 'row_norms' function 
    # norms = row_norms(X)
    # equivalent to:
    norms = np.einsum('ij,ij->i', X, X)
    norms_square = np.sqrt(norms, norms)
    # 2.3 let the X divide by norms_sqaure
    X /= norms_square[:, np.newaxis]
    # 2.4 transpose X again
    X = X.T
    y_offset = np.average(y, axis=0)
    y = y - y_offset
    return X, y

In [9]:
lm3 = LinearRegression(normalize=False).fit(normalize_self(X_train, y_train)[0], normalize_self(X_train, y_train)[1])
lm3.coef_

array([ 2.04594395e+07, -3.95058620e+07,  2.00654260e+07,  5.13758038e+06,
       -1.89706130e+07,  1.35474143e+07, -7.70593496e+20, -4.73124074e+20,
       -6.72705113e+20])

In [11]:
normalize_self(X_train, y_train)

(          Rooms  Distance  Bathroom       Car  Lattitude  Longtitude  \
 9651  -0.000533 -0.006129 -0.006651 -0.005699  -0.004636   -0.000854   
 24684  0.016049  0.009645  0.004486  0.018077   0.000772    0.014505   
 8711  -0.008825 -0.000300 -0.006651 -0.005699   0.009210    0.000103   
 29400 -0.000533  0.010445  0.004486 -0.005699   0.018963   -0.005799   
 5077   0.016049  0.002558  0.004486  0.002226  -0.011843    0.001287   
 ...         ...       ...       ...       ...        ...         ...   
 17932 -0.000533  0.000729 -0.006651  0.002226   0.010076    0.003677   
 28853 -0.008825 -0.008187 -0.006651 -0.005699   0.002852   -0.006770   
 8589  -0.000533 -0.000300 -0.006651  0.002226   0.007119    0.001571   
 19663  0.007758  0.027363  0.004486  0.002226  -0.024663    0.008510   
 390    0.016049 -0.005901  0.004486  0.002226  -0.004857    0.001597   
 
          Type_h    Type_t    Type_u  
 9651  -0.013699 -0.002240  0.017268  
 24684  0.004473 -0.002240 -0.003548  
 8711

In [12]:
from sklearn.preprocessing import Normalizer
transformer = Normalizer().fit(X_train)
X_normalize = transformer.transform(X_train)
pd.DataFrame(data=X_normalize, columns=X_train.columns)

Unnamed: 0,Rooms,Distance,Bathroom,Car,Lattitude,Longtitude,Type_h,Type_t,Type_u
0,-0.011575,-0.964693,-0.107437,-0.129374,-0.009812,-0.002375,-0.135624,-0.013614,0.149238
1,0.215803,0.940711,0.044909,0.254290,0.001012,0.025009,0.027442,-0.008437,-0.019005
2,-0.581577,-0.143275,-0.326313,-0.392941,0.059209,0.000873,-0.411925,-0.041350,0.453275
3,-0.007006,0.995027,0.043863,-0.078306,0.024295,-0.009766,0.026802,-0.008240,-0.018562
4,0.641283,0.741380,0.133454,0.093056,-0.046164,0.006592,0.081546,-0.025071,-0.056475
...,...,...,...,...,...,...,...,...,...
16315,-0.065696,0.651252,-0.609782,0.286809,0.121050,0.058068,0.251333,-0.077271,-0.174063
16316,-0.143820,-0.967799,-0.080695,-0.097171,0.004535,-0.014148,-0.101866,0.124900,-0.023034
16317,-0.082283,-0.335339,-0.763744,0.359225,0.107120,0.031075,0.314792,-0.096781,-0.218011
16318,0.039042,0.998856,0.016808,0.011720,-0.012108,0.005492,0.010271,-0.003158,-0.007113


In [13]:
lm4 = LinearRegression(normalize=False).fit(X_normalize,y_train)
lm4.coef_

array([  634270.87604601,  -252067.66154557,   410616.93139885,
         163113.64159544, -4102122.36399583,  3107704.69824524,
         567598.2051451 ,   -49721.41807039,  -517876.78707471])

In [14]:
from sklearn.preprocessing import normalize
X_offset = np.average(X_train, axis=0)
X_train_2 = X_train - X_offset
X_train_2, X_scale = normalize(X_train_2, axis=0, copy=False, return_norm=True)

In [15]:
pd.DataFrame(data=X_train_2, columns=X_train.columns)

Unnamed: 0,Rooms,Distance,Bathroom,Car,Lattitude,Longtitude,Type_h,Type_t,Type_u
0,-0.000533,-0.006129,-0.006651,-0.005699,-0.004636,-0.000854,-0.013699,-0.002240,0.017268
1,0.016049,0.009645,0.004486,0.018077,0.000772,0.014505,0.004473,-0.002240,-0.003548
2,-0.008825,-0.000300,-0.006651,-0.005699,0.009210,0.000103,-0.013699,-0.002240,0.017268
3,-0.000533,0.010445,0.004486,-0.005699,0.018963,-0.005799,0.004473,-0.002240,-0.003548
4,0.016049,0.002558,0.004486,0.002226,-0.011843,0.001287,0.004473,-0.002240,-0.003548
...,...,...,...,...,...,...,...,...,...
16315,-0.000533,0.000729,-0.006651,0.002226,0.010076,0.003677,0.004473,-0.002240,-0.003548
16316,-0.008825,-0.008187,-0.006651,-0.005699,0.002852,-0.006770,-0.013699,0.027358,-0.003548
16317,-0.000533,-0.000300,-0.006651,0.002226,0.007119,0.001571,0.004473,-0.002240,-0.003548
16318,0.007758,0.027363,0.004486,0.002226,-0.024663,0.008510,0.004473,-0.002240,-0.003548


In [16]:
lm5 = LinearRegression(normalize=False).fit(X_train_2,y_train)
lm5.coef_

array([ 2.06298463e+07, -3.94650261e+07,  2.00132403e+07,  5.17981505e+06,
       -1.90042799e+07,  1.35766024e+07, -4.48598796e+20, -2.75427825e+20,
       -3.91613355e+20])

In [11]:
positive = False
if positive:
    a = 1
else:
    a = 2
a

2

In [22]:
coef_, residues, rank_, singular_ = np.linalg.lstsq(X_train_2, y_train)
coef_ = coef_.T
coef_ = np.ravel(coef_)
coef_

array([ 2.88513040e+07, -4.09472440e+07,  2.13215165e+07,  3.92483011e+06,
       -2.34799509e+07,  7.93346442e+06,  1.24482969e+22,  7.64292588e+21,
        1.08669916e+22])