In [2]:

import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the "../input/" directory.
# Walk /kaggle/input and print the full path of every file found there.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Load the house-sales CSV (kc_house_data.csv) from the Kaggle input directory into `df1`.
df1=pd.read_csv("/kaggle/input/housesalesprediction/kc_house_data.csv")

In [4]:
# Preview the first rows of the freshly loaded table.
df1.head()

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df1.describe()

In [6]:
# One boolean per column: True when that column holds at least one missing value.
df1.isna().any()

In [7]:
# `id` and `date` are removed because they don't seem to contribute to the model.
df1.drop(columns=['id', 'date'], inplace=True)

In [8]:
# Preview the first rows again to confirm the current set of columns.
df1.head()

Understanding all the features of the data

In [9]:
# Column names, non-null counts, dtypes and memory footprint of the table.
df1.info()

In [10]:
# Number of rows per distinct `bedrooms` value, with the column treated as categorical.
df1["bedrooms"].astype("category").value_counts()

In [11]:
# Number of rows per distinct `bathrooms` value, with the column treated as categorical.
df1["bathrooms"].astype("category").value_counts()

In [12]:
# Number of rows per distinct `floors` value, with the column treated as categorical.
df1["floors"].astype("category").value_counts()

In [13]:
# Number of rows per distinct `condition` value, with the column treated as categorical.
df1["condition"].astype("category").value_counts()

In [14]:
# Number of rows per distinct `grade` value, with the column treated as categorical.
df1["grade"].astype("category").value_counts()

In [15]:
# Draw the distribution of every column, one figure per column.
for cols in df1.columns:
    # displot is called for its plotting side effect; wrapping it in print()
    # only echoed the FacetGrid object's repr once per column.
    sns.displot(df1[cols])
plt.show()

In [16]:
# Annotated correlation heatmap of all columns, drawn on a wide 20x10 canvas.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df1.corr(), annot=True, cmap="Spectral", ax=ax)

Price is positively correlated with the **number of bedrooms, bathrooms, floors, grade, sqft_above and sqft_living15**

# **Data Cleaning**

Cleaning the data after analysing it: outlier cleaning, datatype changes, etc.

Since there are no null values, we don't need to fill/impute any values



In [17]:
# Collapse the renovation year into a 0/1 flag: 1 when `yr_renovated` is
# non-zero, 0 otherwise.
renovated_mask = df1["yr_renovated"] != 0
df1["If_Renovated"] = np.where(renovated_mask, 1, 0)
# The raw year column is dropped once the flag has been derived from it.
df1.drop(columns="yr_renovated", inplace=True)


In [17]:
# Show the first 20 rows to inspect the current state of the table.
df1.head(20)

In [18]:
from scipy import stats
sns.set_style("dark")
# For each selected feature draw three panels side by side:
#   1. its distribution, with the skewness shown in the legend,
#   2. price box plots grouped by the feature value,
#   3. the number of rows per feature value.
for col in ["bedrooms",'bathrooms','sqft_living','floors','condition','grade','waterfront','view']:
    plt.figure(figsize=(15,4))
    plt.subplot(131)
    # histplot replaces the deprecated distplot; stat="density" with kde=True
    # gives the same density-scaled histogram plus KDE curve.
    sns.histplot(df1[col],kde=True,stat="density",label="Skew:"+str(np.round(df1[col].skew(),2)))
    # Without a legend the skew label above is never displayed.
    plt.legend()
    plt.subplot(132)
    sns.boxplot(x=df1[col],y=df1['price'])
    plt.subplot(133)
    sns.countplot(x=col,data=df1)
    plt.tight_layout()
    plt.show()

In [19]:
# Side-by-side box plots of price against bedroom count and bathroom count.
fig, (ax_bed, ax_bath) = plt.subplots(1, 2, figsize=(16, 5))

sns.boxplot(x=df1["bedrooms"], y=df1["price"], ax=ax_bed)
ax_bed.set_xlabel("Bedrooms")
ax_bed.set_title("Boxplot of Bedroom count vs price")
# No legend is drawn: neither plot has labelled artists, so a legend call
# would only emit a "no handles with labels" warning.

sns.boxplot(x=df1["bathrooms"], y=df1["price"], ax=ax_bath)
ax_bath.set_xlabel("Bathrooms")
ax_bath.set_title("Boxplot of Bathroom count vs price")

plt.tight_layout()
plt.show()

In [41]:
# Outlier screen: for every column report its skewness, its largest z-score
# and a fixed set of quantiles (with extra detail in the upper tail).
quantile_levels = [0.05, 0.25, 0.5, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0]
for col in df1.columns:
    series = df1[col]
    skewness = np.round(series.skew(), 2)
    max_z = np.max(stats.zscore(series))
    print(f"The Skewness of {col} is {skewness} and the max Z-Score is {max_z}")
    print(f"Quantiles of {col}:", series.quantile(quantile_levels))

In [25]:
# Rows that list more than 11 bedrooms.
# This value of bedrooms as 33 with a sqft_living of 1620 is impossible,
# so the bedrooms value is modified to 3.
many_bedrooms = df1["bedrooms"] > 11
print(df1[many_bedrooms])


In [28]:
# Overwrite every `bedrooms` entry equal to 33 with 3.
is_33_bedrooms = df1["bedrooms"].eq(33)
df1.loc[is_33_bedrooms, "bedrooms"] = 3

In [32]:
# Rows with more than 6 bathrooms. All data seems to be right.
print(df1[df1["bathrooms"] > 6])

In [35]:
# Rows with more than 10,000 sqft of living area.
# sqft_lot seems to be too big compared to the price, so it will be removed as an outlier.
# NOTE(review): the filter here is on sqft_living, not sqft_lot — confirm which
# column the remark above refers to.
print(df1[df1["sqft_living"] > 10000])

In [39]:
# Remove, in place, every row whose living area exceeds 12,500 sqft ...
oversized_rows = df1.index[df1["sqft_living"] > 12500]
df1.drop(index=oversized_rows, inplace=True)
# ... then list what is still above 10,000 sqft.
print(df1[df1["sqft_living"] > 10000])

In [40]:
# Rows with a lot larger than 1,000,000 sqft. No outliers.
print(df1[df1["sqft_lot"] > 1000000])

In [42]:
# Rows priced above 7,000,000. Seems to be fine.
print(df1[df1["price"] > 7000000])

In [18]:
# Number of distinct values in each column.
df1.nunique()

# Data Preparation

In [30]:
# Split off the target: pop() hands back the `price` column as `y` and removes
# it from df1 in place, leaving only the features behind.
y = df1.pop("price")

In [32]:
# `x` names the feature table. NOTE: this is an alias, not a copy — `x` and
# `df1` refer to the same DataFrame object, so mutating one mutates the other.
x=df1
# Preview the first rows of the feature table.
x.head()

# VIF Calculation

In [33]:
# Multicollinearity check: Variance Inflation Factor (VIF) for every feature.
#
# Each feature in turn is regressed on all the remaining features and
#     VIF = 1 / (1 - R^2)
# is reported, where R^2 comes from that auxiliary regression.
Default_indep=x

import statsmodels.api as sm
features=list(Default_indep.columns)
# Start at index 0 so that the first column gets a VIF as well.
for i in range(len(features)):
    X=Default_indep.loc[:,Default_indep.columns!=features[i]]
    Y=Default_indep.loc[:,Default_indep.columns==features[i]]
    # sm.OLS does not add an intercept on its own. Without one, `rsquared`
    # is the uncentred R^2, which overstates the fit and inflates the VIF.
    X_const=sm.add_constant(X)
    model=sm.OLS(Y,X_const)
    results=model.fit()

    rsq=results.rsquared
    # A feature that is an exact linear combination of the others yields
    # R^2 == 1 (up to rounding); report an infinite VIF instead of dividing
    # by zero or by a tiny negative number.
    tolerance=1-rsq
    vif=float("inf") if tolerance<=0 else round(1/tolerance,2)
    print ("R-Sqr value of {} is {}".format(features[i],rsq))
    print ('VIF for {} is {}'.format(features[i],vif))
    
#We can remove the features with high VIF value in an iterative process. Or use PCA to reduce the dimensionality