In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import norm, skew

In [None]:
# Load the house-sales CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../input/housesalesprediction/kc_house_data.csv',
)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

The 'id' and 'date' features are not needed for this analysis, so they are dropped.

In [None]:
# Remove the identifier and sale-date columns.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the columns are already gone is a no-op
# rather than a KeyError.
df = df.drop(columns=['id', 'date'], errors='ignore')

In [None]:
# Confirm the columns were dropped by previewing the first rows only.
df.head()

Correlation matrix
Let's look at the correlations among the numerical variables in our dataset. This information is important to know as there are Machine Learning algorithms (for example, linear and logistic regression) that do not handle highly correlated input variables well.

First, we will use the method corr() on a DataFrame that calculates the correlation between each pair of features. Then, we pass the resulting correlation matrix to heatmap() from seaborn, which renders a color-coded matrix for the provided values:

In [None]:
# Pairwise correlations of all remaining columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
plt.figure(figsize=(14, 25))
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Bar charts of the value counts for four discrete features, one per panel.
fig = plt.figure(figsize=(16,5))
for position, column in enumerate(['bedrooms', 'grade', 'waterfront', 'floors'], start=1):
    ax = fig.add_subplot(2, 2, position)
    # Pass the variable as x= explicitly: in current seaborn the first
    # positional argument of countplot is `data`, not `x`.
    sns.countplot(x=df[column], ax=ax)
# Keep axis labels of the stacked panels from overlapping.
plt.tight_layout()

Scatterplot matrix
In some cases, we may want to plot a scatterplot matrix such as the one shown below. Its diagonal contains the distributions of the corresponding variables, and the scatter plots for each pair of variables fill the rest of the matrix.

In [None]:
# Price plotted against four continuous features, one scatter plot per panel.
fig = plt.figure(figsize=(16,5))
for position, column in enumerate(['sqft_living15', 'sqft_lot15', 'sqft_above', 'yr_built'], start=1):
    ax = fig.add_subplot(2, 2, position)
    # x/y are keyword-only in current seaborn; passing the two vectors
    # positionally raises a TypeError there.
    sns.scatterplot(x=df[column], y=df['price'], ax=ax)
# Keep axis labels of the stacked panels from overlapping.
plt.tight_layout()

Scatter plot
The scatter plot displays values of two numerical variables as Cartesian coordinates in 2D space. Scatter plots in 3D are also possible.

In [None]:
# Number of houses per construction year, most frequent first.
# Uses the Series method; the top-level pd.value_counts helper is deprecated.
df['yr_built'].value_counts()

In [None]:
# Fit a normal distribution to the prices (norm.fit returns location and scale).
(mu, sigma) = norm.fit(df['price'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Histogram + KDE of the observed prices on a density scale, so that the fitted
# normal PDF can be overlaid on the same axis. This replaces the deprecated
# sns.distplot(..., fit=norm).
fig, ax = plt.subplots()
sns.histplot(df['price'], stat='density', kde=True, ax=ax)
price_grid = np.linspace(df['price'].min(), df['price'].max(), 200)
# Raw string: '\m' and '\s' are not valid Python escapes; the label text is unchanged.
# The label is attached to the fitted curve itself so the legend cannot end up
# describing a different artist.
ax.plot(price_grid, norm.pdf(price_grid, mu, sigma), color='black',
        label=r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma))
ax.legend(loc='best')
# The y-axis is a probability density, not a raw count.
ax.set_ylabel('Density')
ax.set_title('Price distribution')

# Q-Q plot of the prices against the theoretical normal quantiles.
fig = plt.figure()
res = stats.probplot(df['price'], plot=plt)
plt.show()

The price distribution is right-skewed. Right-skewed distributions are also called positive-skew distributions, because there is a long tail in the positive direction on the number line. The mean is also to the right of the peak.

To better understand the data, I will select the features that are most correlated with the price so that I can inspect them.

In [None]:
# Names of the columns whose correlation with price is below 0.05.
cor = df.corr()
weak_mask = cor['price'] < 0.05
cor.loc[weak_mask].index

In [None]:
# Names of the columns whose correlation with price is above 0.3.
cor = df.corr()
strong_mask = cor['price'] > 0.3
cor.loc[strong_mask].index

In [None]:
# Subset used for the closer look below: the target plus nine selected features.
house = df[[
    'price',
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'view',
    'grade',
    'sqft_above',
    'sqft_basement',
    'lat',
    'sqft_living15',
]]

In [None]:
# Preview the first rows of the selected-feature subset.
house.head()

In [None]:
# Scatterplot matrix of the selected columns: pairwise scatter plots off the
# diagonal, per-variable distributions on the diagonal.
sns.pairplot(house)

In [None]:
# Univariate density of every numeric column in `house`, one panel per column.
num_cols = house.select_dtypes(exclude=['object'])

fig = plt.figure(figsize=(20,8))

for position, name in enumerate(num_cols.columns, start=1):
    ax = fig.add_subplot(2, 5, position)
    # kdeplot + rugplot replace the deprecated distplot(hist=False, rug=True);
    # bw_method is the current name of the deprecated `bw` bandwidth option.
    sns.kdeplot(x=num_cols[name], bw_method=0.1, label='UV', ax=ax)
    sns.rugplot(x=num_cols[name], ax=ax)
    ax.set_xlabel(name)

plt.tight_layout()

In [None]:
num_cols = house.select_dtypes(exclude=['object'])

fig = plt.figure(figsize=(20, 8))

# One panel per numeric column, each plotted against the sale price.
for panel, feature in enumerate(num_cols.columns, start=1):
    fig.add_subplot(2, 5, panel)
    sns.scatterplot(x=num_cols[feature], y=house['price'])
    plt.xlabel(feature)

plt.tight_layout()

In [None]:
# Price regressed on each numeric column; points are the mean price per
# distinct x value (x_estimator=np.mean).
num_cols = house.select_dtypes(exclude=['object'])

fig = plt.figure(figsize=(20,8))

for position, name in enumerate(num_cols.columns, start=1):
    fig.add_subplot(2, 5, position)
    feature = num_cols[name]
    # A y ~ log(x) fit is only defined for strictly positive x. Fall back to
    # the ordinary linear fit for any column that contains zero or negative
    # values instead of feeding log(0) / log(negative) into the regression.
    use_logx = bool((feature > 0).all())
    sns.regplot(x=feature, y=house['price'], x_estimator=np.mean, logx=use_logx)
    plt.xlabel(name)

plt.tight_layout()

In [None]:
# Annotated correlation heatmap restricted to the selected columns.
house_corr = house.corr()
plt.figure(figsize=(6, 6))
sns.heatmap(house_corr, annot=True)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from math import sqrt
# No trailing comma after the last imported name: outside parentheses it is a SyntaxError.
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from xgboost import XGBRegressor
import warnings
# Silences every warning from here on, including deprecation notices.
warnings.filterwarnings('ignore')

In [None]:
# Separate the regression target (price) from the predictor columns.
y = df['price']
X = df.drop(columns=['price'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Base regressors compared below. The stochastic estimators get a fixed
# random_state (same seed as the train/test split) so that the reported scores
# are reproducible across kernel restarts.
log_clf=LinearRegression()
rnd_clf = RandomForestRegressor(random_state=42)
gbr_clf=GradientBoostingRegressor(n_estimators=3000, learning_rate=0.1, max_depth=4, max_features='sqrt',
                                               min_samples_leaf=15, min_samples_split=10, loss='huber',
                                               random_state=42)
xgb_clf=XGBRegressor(n_estimators=3000, random_state=42)

In [None]:
# Preview the predictor matrix that is fed to the models.
X.head()

In [None]:
# Ensemble that combines the four base regressors; fitted on the training split.
base_estimators = [('lr', log_clf), ('rnd', rnd_clf), ('gbr', gbr_clf), ('xbg', xgb_clf)]
voting_clf = VotingRegressor(estimators=base_estimators)
voting_clf.fit(X_train, y_train)

In [None]:
# Fit every regressor on the training split and report hold-out metrics.
# n = number of test rows, k = number of predictors (both used by adjusted R2);
# they do not change between models, so they are computed once.
n, k = X_test.shape
for clf in (log_clf, rnd_clf,voting_clf, gbr_clf,xgb_clf):
    clf.fit(X_train, y_train)
    y_predition = clf.predict(X_test)
    # Keep the metrics unrounded and round only when printing: the built-in
    # round() works whether the metric comes back as a NumPy scalar or a plain
    # Python float (which has no .round() method).
    MSE = float(mean_squared_error(y_test, y_predition))
    RMSE = float(np.sqrt(MSE))
    MAE = float(mean_absolute_error(y_test, y_predition))
    r2 = float(r2_score(y_test, y_predition))
    # Adjusted R2 is derived from the exact R2, not from its 3-decimal rounding.
    adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)
    MAPE = np.mean( np.abs((y_test - y_predition) /y_test ) ) * 100
    # R2 is reported once (the original printed it twice under two labels).
    print(clf.__class__.__name__, '\nRMSE =',round(RMSE, 3), '\nMSE =',round(MSE, 3),
          '\nMAE =',round(MAE, 3), '\nR2 =', round(r2, 3), '\nAdjusted R2 =', adj_r2,
          '\nMean Absolute Percentage Error =',MAPE, '%')
    

# **The GradientBoostingRegressor seems to perform best, with an r2_score of 0.907 and an Adjusted R2 of 0.9066110594795539. Next, I will try preprocessing the data (expanding it with polynomial features) to see if the score improves.**

In [None]:
import sklearn.preprocessing as preproc

In [None]:
# Expand the predictors with polynomial terms (library-default degree, no bias column).
poly_expander = preproc.PolynomialFeatures(include_bias=False)
X2 = poly_expander.fit_transform(X)

In [None]:
# Use the same hold-out fraction and seed as the baseline split (0.20 / 42) so
# that both experiments are scored on the same test rows and their metrics are
# directly comparable.
X_trai, X_tes, y_trai, y_tes = train_test_split(X2, y, test_size=0.20, random_state=42)

In [None]:
# Rebuild the ensemble and fit it on the polynomial-feature training split.
poly_estimators = [('lr', log_clf), ('rnd', rnd_clf), ('gbr', gbr_clf), ('xbg', xgb_clf)]
voting_clf = VotingRegressor(estimators=poly_estimators)
voting_clf.fit(X_trai, y_trai)

In [None]:
# Test-set dimensions: n rows, k polynomial features.
n, k = X_tes.shape

In [None]:
# Fit every regressor on the polynomial-feature training split and report
# hold-out metrics.
# n = number of test rows, k = number of predictors (both used by adjusted R2);
# they do not change between models, so they are computed once.
n, k = X_tes.shape
for clf in (log_clf, rnd_clf,voting_clf, gbr_clf,xgb_clf):
    clf.fit(X_trai, y_trai)
    y_pred = clf.predict(X_tes)
    # Keep the metrics unrounded and round only when printing: the built-in
    # round() works whether the metric comes back as a NumPy scalar or a plain
    # Python float (which has no .round() method).
    MSE = float(mean_squared_error(y_tes, y_pred))
    RMSE = float(np.sqrt(MSE))
    MAE = float(mean_absolute_error(y_tes, y_pred))
    r2 = float(r2_score(y_tes, y_pred))
    # Adjusted R2 is derived from the exact R2, not from its 3-decimal rounding.
    adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)
    # Divide by y_tes, the targets of THIS split. Dividing by y_test (the
    # targets of the earlier split) mixes two different hold-out sets.
    MAPE = np.mean( np.abs((y_tes - y_pred) /y_tes ) ) * 100
    # '\nRMSE =' (newline), not '/nRMSE ='; R2 is reported once.
    print(clf.__class__.__name__, '\nRMSE =',round(RMSE, 3), '\nMSE =',round(MSE, 3),
          '\nMAE =',round(MAE, 3), '\nR2 =', round(r2, 3), '\nAdjusted R2 =', adj_r2,
          '\nMean Absolute Percentage Error =',MAPE, '%')

# **This preprocessing does not have a positive influence on the score, so I prefer the first approach, without the preprocessing.**