In [60]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
import matplotlib as plt
import seaborn as sns
import patsy
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline
import time, os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))
        

In [61]:
# **Reading Riyadh Real Estate Dataset** 

In [62]:
# Column labels applied to the CSV in place of the labels in its own header row.
columns = [
    'neighborhood',
    'regions',
    'city',
    'RealEstateType',
    'square_meters',
    'rooms_number',
    'baths_number',
    'prices',
]
# header=0 marks the first line as a header to be replaced by `names`.
df = pd.read_csv(
    r"../input/riyadh-real-estates/realstate.csv",
    sep=',',
    header=0,
    names=columns,
)


In [63]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

In [64]:
# Preview the first rows of the loaded frame.
df.head()

# Data cleaning and converting strings to int and float

In [65]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [66]:
# Drop the unnecessary 'city' column.
# errors='ignore' keeps this cell safe to re-run once the column is already gone
# (the in-place drop raised KeyError on a second execution).
df = df.drop(columns='city', errors='ignore')

In [67]:
# (rows, columns) after dropping incomplete rows and the 'city' column.
df.shape

In [68]:
# Remove thousands separators from the price strings using the vectorised string accessor.
# astype(str) first, so re-running the cell after the column has been converted to a
# numeric dtype does not raise.
df["prices"] = df["prices"].astype(str).str.replace(',', '', regex=False)


In [69]:
# Remove thousands separators from the area strings using the vectorised string accessor.
# astype(str) first, so re-running the cell after the column has been converted to a
# numeric dtype does not raise.
df["square_meters"] = df["square_meters"].astype(str).str.replace(',', '', regex=False)

In [70]:
# Parse the comma-free area strings as 64-bit integers.
df["square_meters"] = df["square_meters"].astype(np.int64)

In [71]:
# Cast the remaining numeric columns in one pass: counts to int64, price to float64.
df = df.astype({
    "rooms_number": "int64",
    "baths_number": "int64",
    "prices": "float64",
})

In [72]:
# Summary statistics and IQR-based outlier bounds for the area column.
# describe() returns a Series labelled 'count', 'mean', 'std', 'min', '25%', '50%',
# '75%', 'max'; its entries are read by label because integer keys on a labelled
# Series are a deprecated positional fallback.
de = df["square_meters"].describe()
mean = de["mean"]
std = de["std"]
median = de["50%"]
print('mean= ',mean, 'std= ',std,'median= ', median)
q1 = de["25%"]
q2 = de["50%"]
q3 = de["75%"]
print ('q1= ',q1,'q2= ',q2,'q3= ',q3)
# Tukey fences: values more than 1.5 * IQR outside the quartiles count as outliers.
IQR = q3 - q1
lower_bound = q1 - 1.5 * IQR
upper_bound = q3 + 1.5 * IQR
print("Anything outside this range is an outlier:(",lower_bound,",",upper_bound,")")
outlier_mask = (df["square_meters"] < lower_bound) | (df["square_meters"] > upper_bound)
print('outliers', df.loc[outlier_mask, "square_meters"].values)

In [73]:
# Keep only the rows whose area lies inside [lower_bound, upper_bound].
area_is_outlier = (df["square_meters"] < lower_bound) | (df["square_meters"] > upper_bound)
df_cleaned = df[~area_is_outlier]
print('dataset befor remove outliers ', df.shape)
print('dataset after remove outliers ', df_cleaned.shape)

In [74]:
# Summary statistics and IQR-based outlier bounds for the price column
# (computed after the area outliers have been removed).
# describe() entries are read by label because integer keys on a labelled Series
# are a deprecated positional fallback.
de = df_cleaned["prices"].describe()
mean = de["mean"]
std = de["std"]
median = de["50%"]
print('mean= ',mean, 'std= ',std,'median= ', median)
q1 = de["25%"]
q2 = de["50%"]
q3 = de["75%"]
print ('q1= ',q1,'q2= ',q2,'q3= ',q3)
# Tukey fences: values more than 1.5 * IQR outside the quartiles count as outliers.
IQR = q3 - q1
lower_bound = q1 - 1.5 * IQR
upper_bound = q3 + 1.5 * IQR
print("Anything outside this range is an outlier:(",lower_bound,",",upper_bound,")")
outlier_mask = (df_cleaned["prices"] < lower_bound) | (df_cleaned["prices"] > upper_bound)
print('outliers', df_cleaned.loc[outlier_mask, "prices"].values)

In [75]:
# Keep only the rows whose price lies inside [lower_bound, upper_bound].
# .copy() makes df_cleaned1 an independent frame: later cells add columns to it, and
# assigning into a filtered slice triggers pandas' SettingWithCopyWarning.
price_is_outlier = (df_cleaned["prices"] < lower_bound) | (df_cleaned["prices"] > upper_bound)
df_cleaned1 = df_cleaned[~price_is_outlier].copy()
print('dataset befor remove outliers ',df_cleaned.shape)
print('dataset after remove outliers ',df_cleaned1.shape)

In [76]:
# Box plot of the cleaned prices; the returned Axes is the cell's displayed value.
boxplot = df_cleaned1.boxplot(column='prices')
boxplot

In [77]:
# Line plot of area against row index, with top/right spines removed, saved as a PNG.
sns_plot = df_cleaned1['square_meters'].plot.line()
sns.despine()
fig = sns_plot.get_figure()
fig.savefig("rosm.png")

In [78]:
# Preview the frame after both outlier filters.
df_cleaned1.head()

In [79]:
# Column dtypes and non-null counts of the cleaned frame.
df_cleaned1.info()

# EDA

In [80]:
from sqlalchemy import create_engine

# A fresh in-memory SQLite database is created on every run of this cell,
# and the cleaned frame is written to it as table "dfc_t".
engine = create_engine('sqlite://', echo=False)
df_cleaned1.to_sql(name="dfc_t", con=engine)

In [81]:
# Listings per region, most frequent first, computed in SQL.
region_count_query = 'SELECT regions, count(regions) FROM dfc_t group by regions order by count(regions) desc  '
dfc_data = pd.read_sql(region_count_query, engine)
dfc_data

In [82]:
# Bar chart of the SQL result; the trailing ';' suppresses the Axes repr.
dfc_data.plot(kind='bar', color=['pink']);


In [83]:
# Same per-region counts, computed directly in pandas.
df_cleaned1['regions' ].value_counts()

In [84]:
# Bar chart of listings per region, one colour per bar, saved as a PNG.
region_bar_colors = ['yellowgreen', 'mediumorchid', 'sandybrown', 'steelblue', 'yellow', 'pink']
sns_plot = df_cleaned1['regions'].value_counts().plot(kind='bar', color=region_bar_colors)
fig = sns_plot.get_figure()
fig.savefig("regions.png")

In [85]:
# Number of listings per real-estate type.
df_cleaned1['RealEstateType'].value_counts()

In [86]:

# Bar chart of listings per real-estate type, saved as a PNG.
type_bar_colors = ['yellowgreen', 'mediumorchid', 'sandybrown', 'steelblue']
sns_plot = df_cleaned1['RealEstateType'].value_counts().plot(kind='bar', color=type_bar_colors)
sns.despine()
# Sets the seaborn theme and default figure size; called after the plot above was created.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
fig = sns_plot.get_figure()
sns.despine()
fig.savefig("types.png")

# Linear Regression

In [87]:
# Integer-encode the real-estate type (codes follow order of first appearance).
# assign() builds a new frame instead of writing into df_cleaned1, which was produced
# by boolean filtering; writing into such a slice triggers SettingWithCopyWarning.
df_cleaned1 = df_cleaned1.assign(
    RealEstateTypeCode=pd.factorize(df_cleaned1['RealEstateType'])[0]
)

In [88]:
# Integer-encode the region (codes follow order of first appearance).
# assign() builds a new frame instead of writing into df_cleaned1, which was produced
# by boolean filtering; writing into such a slice triggers SettingWithCopyWarning.
df_cleaned1 = df_cleaned1.assign(
    RegionsCode=pd.factorize(df_cleaned1['regions'])[0]
)

In [89]:
# Display the cleaned frame, now including the two integer-code columns.
df_cleaned1

In [90]:
# Column labels of the cleaned frame.
df_cleaned1.columns

In [91]:
# Correlation heatmap of the numeric columns only.
# df_cleaned1 still holds string columns (neighborhood, regions, RealEstateType);
# DataFrame.corr() raises on those in pandas >= 2.0, so they are filtered out first.
numeric_corr = df_cleaned1.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)

In [92]:
# One-hot encode the two categorical columns.
# dtype=int forces 0/1 integer indicators: pandas >= 2.0 defaults to bool columns, and
# mixing bool with int columns produces an object-dtype design matrix that
# statsmodels' OLS rejects.
df_ = pd.get_dummies(df_cleaned1, columns=["regions", "RealEstateType"], dtype=int)
df_

In [93]:
# Column labels after one-hot encoding (the indicator names are used to build X below).
df_.columns

In [94]:
# Pairwise scatter plots of the one-hot-encoded frame; low alpha to cope with overplotting.
sns.pairplot(df_, plot_kws=dict(alpha=.1, edgecolor='none'))

In [95]:
# Pairwise scatter plots of the cleaned (label-encoded) frame.
sns.pairplot(df_cleaned1, plot_kws=dict(alpha=.1, edgecolor='none'))


In [96]:
# Design matrix: the three numeric features plus every region / type indicator.
X = df_.loc[:, [
    'square_meters', 'rooms_number', 'baths_number',
    'regions_ Central Riyadh', 'regions_ East Riyadh',
    'regions_ North Riyadh', 'regions_ Riyadh Region',
    'regions_ South Riyadh', 'regions_ West Riyadh',
    'RealEstateType_Apartment', 'RealEstateType_Floor',
    'RealEstateType_Residential Building', 'RealEstateType_Villa',
]]
y = df_cleaned1['prices']
# OLS(endog, exog) takes the arrays directly; `data=` belongs to the formula API
# and is not an OLS argument here, so it is no longer passed.
# NOTE(review): both indicator groups are included in full, so the region columns and
# the type columns each sum to 1 per row and the design is collinear -- confirm
# whether one level per group should be dropped.
model = sm.OLS(y, X)
results = model.fit()
results.summary()

In [97]:
from joblib import dump

In [98]:
# Fit scikit-learn's linear regression on the same X / y and report the in-sample R^2.
lr = LinearRegression().fit(X, y)
fit = lr  # fit() returns the estimator itself, so both names refer to one object
lr.score(X, y)

In [99]:
# Intercept (printed) and coefficient array (displayed) of the estimator bound to `lr`.
print(lr.intercept_)
lr.coef_

In [100]:
# Store in-sample predictions and residuals on the encoded frame, then plot
# residual against prediction.
df_['predict'] = fit.predict(X)
df_['resid'] = df_['prices'] - df_['predict']
with sns.axes_style('white'):
    plot = df_.plot(
        kind='scatter',
        x='predict',
        y='resid',
        alpha=0.2,
        figsize=(10, 6),
    )

In [101]:
# Residuals of the statsmodels fit bound to `results`, drawn as points against the row index.
results.resid.plot(style='o',figsize=(12,8));

In [102]:
# Normal Q-Q plot of the residuals stored in df_['resid'].
# probplot draws onto the pyplot module passed as `plot`, so the title must follow it.
stats.probplot(df_['resid'], dist="norm", plot=plt)
plt.title("Normal Q-Q plot")
plt.show()

# Baseline model

In [103]:
import statsmodels.api as sm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge #ordinary linear regression + w/ ridge regularization
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.metrics import r2_score

In [104]:
# Split into training (X, y) and held-out (X_test, y_test) sets.
# The inputs are re-derived from df_ on every run instead of splitting the current
# X / y: the original overwrote its own inputs, so re-running the cell split the
# training set again (or failed once y had been rebound elsewhere).
X, X_test, y, y_test = train_test_split(
    df_.loc[:, X.columns], df_['prices'], test_size=.2, random_state=10
)

In [105]:
# Baseline: plain linear regression, scored by the mean 5-fold cross-validated R^2.
lm = LinearRegression()
fit = lm.fit(X, y)
baseline_cv_r2 = cross_val_score(lm, X, y, cv=5, scoring='r2')
print(np.mean(baseline_cv_r2))

In [106]:
# Select the numeric features plus the two integer-coded categoricals.
X1=df_cleaned1.loc[:,[ 'square_meters','rooms_number', 'baths_number', 'RealEstateTypeCode','RegionsCode']]
y1=df_cleaned1['prices']

# NOTE(review): the split below is given X and y, not the X1 / y1 built above, so the
# label-encoded selection is discarded and X1 / y1 become a further split of X / y.
# Later cells (lm.predict(X), X1.loc[:, c], lm.score(X_test, y_test)) rely on X1
# carrying X's columns, so switching this call to X1 / y1 requires updating them too.
X1, X_test1, y1, y_test1 = train_test_split(X, y, test_size=.2, random_state=10)

In [107]:
# Fit on X1 / y1 and cross-validate on that same data.
# The original scored cross-validation on X / y, i.e. on different data from the
# data this model is fitted on.
lm = LinearRegression()
fit = lm.fit(X1, y1)
print(np.mean(cross_val_score(lm, X1, y1, cv=5, scoring='r2')))

In [108]:
# Degree-2 polynomial expansion: fitted on the training features, re-applied to the test features.
poly = PolynomialFeatures(degree=2)

X_Cross_poly = poly.fit_transform(X.values)
X_test_poly = poly.transform(X_test.values)

lm_poly = LinearRegression()
lm_poly.fit(X_Cross_poly, y)
# Cross-validate on the expanded features. The original passed the raw X here, so the
# printed score was that of the plain linear model, not of the polynomial one.
print(np.mean(cross_val_score(lm_poly, X_Cross_poly, y, cv=5, scoring='r2')))

In [109]:
# Predictions of the estimator bound to `lm` on X, plotted against y with a regression fit.
preds = lm.predict(X) # generate predictions (on training data) using fit model

sns.jointplot(x=preds,y=y, kind='reg')

In [110]:
# OLS on the current X / y for the full statsmodels summary.
# OLS(endog, exog) takes the arrays directly; the stray `data=df_cleaned` argument
# (an unrelated, earlier frame) is not an OLS argument and is no longer passed.
model = sm.OLS(y, X)
results = model.fit()
results.summary()

# Improving our model

In [111]:
# Try to improve the model with LASSO: see which coefficients survive heavy shrinkage.
c = [
    'square_meters', 'rooms_number', 'baths_number',
    'regions_ Central Riyadh', 'regions_ East Riyadh',
    'regions_ North Riyadh', 'regions_ Riyadh Region',
    'regions_ South Riyadh', 'regions_ West Riyadh',
    'RealEstateType_Apartment', 'RealEstateType_Floor',
    'RealEstateType_Residential Building', 'RealEstateType_Villa',
]
# A very high regularisation strength that would not usually be used.
lasso_model = Lasso(alpha=100000)
lasso_model.fit(X1.loc[:, c], y1)
# Pair each feature name with its fitted coefficient.
list(zip(c, lasso_model.coef_))

In [112]:
# Reduced feature set: area, bathroom count and the integer region code.
reduced_features = ['square_meters', 'baths_number', 'RegionsCode']
X1 = df_cleaned1.loc[:, reduced_features]
y1 = df_cleaned1['prices']

# 80 / 20 split with a fixed seed.
X1, X_test1, y1, y_test1 = train_test_split(
    X1, y1, test_size=.2, random_state=10
)

In [113]:
# Linear regression on the reduced features, scored by the mean 5-fold cross-validated R^2.
lm1 = LinearRegression()
lm1.fit(X1, y1)
reduced_cv_r2 = cross_val_score(lm1, X1, y1, cv=5, scoring='r2')
print(np.mean(reduced_cv_r2))

In [114]:
# Predictions of lm1 on X1 (the data it was fitted on), plotted against y1 with a regression fit.
preds1 = lm1.predict(X1) # generate predictions (on training data) using fit model

sns.jointplot(x=preds1,y=y1, kind='reg')

In [115]:
# Try to improve the model with RIDGE: inspect the coefficients under heavy shrinkage.
X_train_collinear = X.loc[:, c]

# (A bare `X_train_collinear.corr()` used to sit here; in the middle of a cell its
# result was computed and thrown away, so it has been removed.)
lr_model_ridge = Ridge(alpha=10000000)
lr_model_ridge.fit(X_train_collinear, y)

# Pair each feature name with its fitted coefficient.
list(zip(X_train_collinear.columns, lr_model_ridge.coef_))

In [116]:
# Numeric-only model: area, room count and bathroom count.
# Rows are restricted to the training split (X.index) and the training target y is
# reused. The original rebound y to the full price column, which silently replaced the
# training target for every later cell and fitted this model on rows that include the
# held-out test set.
X2 = df_cleaned1.loc[X.index, ['square_meters', 'rooms_number', 'baths_number']]
lm2 = LinearRegression()
lm2.fit(X2, y)
print(np.mean(cross_val_score(lm2, X2, y, cv=5, scoring='r2')))

In [117]:
# Predictions of lm2 on X2 (the data it was fitted on), plotted against y with a regression fit.
preds2 = lm2.predict(X2) # generate predictions (on training data) using fit model

sns.jointplot(x=preds2,y=y, kind='reg')

# Testing our model

In [121]:
# Evaluate on the held-out data using the model that was trained earlier (`lm`).
# The original fitted a brand-new regression on X_test / y_test and predicted those same
# rows, which measures in-sample fit on the test set rather than generalisation.
test_pred = lm.predict(X_test)
sns.jointplot(x=test_pred, y=y_test, kind='reg')

In [119]:

# R^2 on the held-out split for the estimator currently bound to `lm`.
print(f'Linear Regression test R^2: {lm.score(X_test, y_test):.3f}')

In [None]:
# Intercept and coefficient array of the estimator currently bound to `lm`.
print(lm.intercept_)
print(lm.coef_)