In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# Read the training listings from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='houses_train.csv')
# Show the first five rows as this cell's output.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,price,condition,district,max_floor,street,num_rooms,region,area,url,num_bathrooms,building_type,floor,ceiling_height
0,5546,130000.0,newly repaired,Center,4,Sayat Nova Ave,3,Yerevan,96.0,http://www.myrealty.am/en/item/28244/3-senyaka...,1,stone,3,3.2
1,2979,65000.0,good,Arabkir,5,Hr.Kochar St,3,Yerevan,78.0,http://www.myrealty.am/en/item/18029/3-senyaka...,1,stone,2,2.8
2,2698,129000.0,good,Center,10,M.Khorenatsi St,3,Yerevan,90.0,http://www.myrealty.am/en/item/37797/3-senyaka...,1,panel,3,2.8
3,4548,52000.0,newly repaired,Center,14,Argishti St,2,Yerevan,53.0,http://www.myrealty.am/en/item/36153/2-senyaka...,1,monolit,5,3.0
4,2982,65000.0,newly repaired,Center,12,Mashtots Ave,2,Yerevan,47.0,http://www.myrealty.am/en/item/17566/2-senyaka...,1,panel,3,2.8


In [4]:
# Clean the raw listings, then one-hot encode the categorical columns.
#
# `df` is rebound from non-inplace drops with errors='ignore' so the cell is
# idempotent: running it a second time no longer raises KeyError for a row or
# column that the first run already removed.
# NOTE(review): the reason row label 1189 is excluded is not recorded in this
# notebook - presumably an outlier or a bad record; confirm and document it.
df = (
    df
    .drop(index=1189, errors='ignore')
    .drop(columns=['region', 'Unnamed: 0', 'url'], errors='ignore')  # removing extra useless information
)
# Expand every categorical column into indicator columns for the linear models.
data = pd.get_dummies(df, columns=['street', 'condition', 'district', 'building_type'])

In [9]:
# Target: natural log of the listing price. Predictors: every other column.
y = np.log(data['price'])
X = data.drop(columns=['price'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)

In [13]:
# --- Min-max normalisation --------------------------------------------------
# The scaler learns its ranges from the training split only and is then
# applied to both splits, so no test-set statistics leak into the fit.
norm = MinMaxScaler()
norm.fit(X_train)
X_train_norm, X_test_norm = (norm.transform(split) for split in (X_train, X_test))

# --- Standardisation --------------------------------------------------------
# numerical features (the one-hot indicator columns are left as they are)
num_cols = ['max_floor', 'num_rooms', 'area', 'num_bathrooms', 'floor','ceiling_height']

# Work on copies so the unscaled frames stay available for comparison.
X_train_stand, X_test_stand = X_train.copy(), X_test.copy()

for col in num_cols:
    # One scaler per column, fitted on the training values only. The values
    # are read from the untouched originals, which are identical to the
    # not-yet-overwritten column in the copies.
    scale = StandardScaler()
    scale.fit(X_train[[col]])

    # Overwrite the column in both copies with its standardised values.
    X_train_stand[col] = scale.transform(X_train[[col]])
    X_test_stand[col] = scale.transform(X_test[[col]])

In [14]:
# Compare plain linear regression on the three feature representations
# (unscaled, min-max normalised, standardised) using the held-out split.
lr = LinearRegression()

trainX = [X_train, X_train_norm, X_train_stand]
testX = [X_test, X_test_norm, X_test_stand]

rmse, r_squared = [], []
for fit_features, eval_features in zip(trainX, testX):
    # The same estimator object is refitted for every representation.
    lr.fit(fit_features, y_train)
    pred = lr.predict(eval_features)

    # Both metrics are computed on the log-price target.
    rmse.append(np.sqrt(mean_squared_error(y_test, pred)))
    r_squared.append(metrics.r2_score(y_test, pred))

# One row per representation, in the same order as trainX / testX.
df_lr = pd.DataFrame(
    {'RMSE': rmse, 'R^2': r_squared},
    index=['Original', 'Normalized', 'Standardized'],
)
df_lr

Unnamed: 0,RMSE,R^2
Original,59566040.0,-1.259724e+16
Normalized,27330460000.0,-2.651988e+21
Standardized,5684106000.0,-1.147102e+20


In [17]:
# Tune the Ridge penalty with 5-fold cross-validation on the standardised
# training data, then score the refitted best model on the held-out split.
params = {'alpha': [1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]}
ridge = Ridge()
ridge_reg = GridSearchCV(
    estimator=ridge,
    param_grid=params,
    scoring='neg_mean_squared_error',  # negated MSE, so values closer to zero are better
    cv=5,
)
ridge_reg.fit(X_train_stand, y_train)
ridge_pred = ridge_reg.predict(X_test_stand)

# Hold-out metrics on the log-price target.
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
ridge_r2 = metrics.r2_score(y_test, ridge_pred)

print('Ridge with standardized data:')
print('Best alpha is:', ridge_reg.best_params_)
print('Best score is:', ridge_reg.best_score_)
print('RMSE for Ridge:', ridge_rmse)
print('R_2 score:', ridge_r2)

Ridge with standardized data:
Best alpha is: {'alpha': 1}
Best score is: -0.04332879261017016
RMSE for Ridge: 0.2050550648094895
R_2 score: 0.850714104577109
