In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the house data set from the CSV file in the working directory into
# the notebook-wide DataFrame `KC`, then display it as the cell output.
KC = pd.read_csv(filepath_or_buffer="kc_house_data.csv")
KC

In [None]:
# Column names, dtypes and non-null counts (printed by pandas itself).
KC.info()

# Number of missing values in each column, shown as the cell output.
KC.isna().sum()

In [None]:
# Walk over every column and print how often each distinct value occurs.
for column_name in KC.columns:
    column_frequencies = KC[column_name].value_counts()
    print(column_frequencies)

In [None]:
# Descriptive statistics, transposed so that each row is one column of KC
# and each column is one statistic (count, mean, std, ...).
stats = KC.describe().T
pd.DataFrame(stats)

In [None]:
# Annotated heatmap of the pairwise correlations between KC's numeric columns.
#
# The correlation is computed on numeric columns only: at this point KC still
# holds `date` as a string, and DataFrame.corr() raises on non-numeric columns
# in pandas >= 2.0 (earlier releases skipped them silently). Selecting the
# numeric dtypes explicitly works on both old and new pandas.
plt.figure(figsize=(20, 12))
sns.heatmap(KC.select_dtypes(include='number').corr(), annot=True, cmap='RdBu')
plt.title("Correlation kc_house")

In [None]:
# Bar charts of every numeric column's correlation with `sqft_living` (left)
# and with `price` (right), sorted ascending, on the first row of a 2x3 grid.
fig = plt.figure(figsize=(25, 15))

# Build the correlation matrix once and reuse it for both panels.
# Only numeric columns are used: KC still holds `date` as a string here, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (earlier
# releases skipped them silently).
corr_matrix = KC.select_dtypes(include='number').corr()

plt.subplot2grid((2, 3), (0, 0))
# Sort the rows by the target column, drop the target's own row (its
# self-correlation), then plot the remaining values of that column.
corr_matrix.sort_values('sqft_living').drop('sqft_living')['sqft_living'].plot(kind='bar', title='Correlation with house sqft living space')

plt.subplot2grid((2, 3), (0, 1))
corr_matrix.sort_values('price').drop('price')['price'].plot(kind='bar', title='Correlation with house prices')

In [None]:

# Scatter plots of `price` against four features, placed in the first two
# columns of a 2x3 grid. Each panel draws onto an explicit Axes handle.
fig = plt.figure(figsize=(30, 20))

ax_living = plt.subplot2grid((2, 3), (0, 0))
sns.scatterplot(data=KC, x='sqft_living', y='price', ax=ax_living)
ax_living.set(xlabel='Sqft Living Space', ylabel='Price of the house')

ax_grade = plt.subplot2grid((2, 3), (0, 1))
sns.scatterplot(data=KC, x='grade', y='price', ax=ax_grade)
ax_grade.set(xlabel='grade', ylabel='Price of the house')

ax_above = plt.subplot2grid((2, 3), (1, 0), colspan=1)
sns.scatterplot(data=KC, x='sqft_above', y='price', ax=ax_above)
ax_above.set(xlabel='sqft_above', ylabel='Price of the house')

ax_living15 = plt.subplot2grid((2, 3), (1, 1), colspan=1)
sns.scatterplot(data=KC, x='sqft_living15', y='price', ax=ax_living15)
ax_living15.set(xlabel='sqft_living15', ylabel='Price of the house')

In [None]:
# Box plots of `price` grouped by bedrooms, floors and view, placed on a
# 2x3 grid. Each panel draws onto an explicit Axes handle.
fig = plt.figure(figsize=(30, 15))

ax_bedrooms = plt.subplot2grid((2, 3), (0, 0))
sns.boxplot(data=KC, x='bedrooms', y='price', ax=ax_bedrooms)
ax_bedrooms.set(xlabel='Number of Bedrooms', ylabel='Price', title='Comparison of House price and number of bedrooms')

ax_floors = plt.subplot2grid((2, 3), (0, 1))
sns.boxplot(data=KC, x='floors', y='price', ax=ax_floors)
ax_floors.set(xlabel='Number of floors', ylabel='Price', title='Comparison of House price and number of floors')

ax_view = plt.subplot2grid((2, 3), (1, 0), colspan=1)
sns.boxplot(data=KC, x='view', y='price', ax=ax_view)
ax_view.set(xlabel='Number of views', ylabel='Price', title='Comparison of House price and number of views')

In [None]:
# Bar chart of how many rows of KC fall under each `bedrooms` value.
plt.figure(figsize=(8, 7))
ax_bedroom_counts = sns.countplot(data=KC, x='bedrooms')
ax_bedroom_counts.set(ylabel='Count', title='Number of different houses depending on bedrooms', xlabel='Number of bedrooms')

In [None]:
# Waterfront vs. non-waterfront houses: price distribution (left) and number
# of houses (right). Both panels relabel the two x ticks with the same text.
fig = plt.figure(figsize=(25, 15))
waterfront_tick_labels = ['Do not have waterfront', 'Have waterfront']

ax_waterfront_price = plt.subplot2grid((2, 3), (0, 0))
sns.boxplot(data=KC, x='waterfront', y='price', ax=ax_waterfront_price)
ax_waterfront_price.set(
    xlabel='',
    ylabel='Price',
    title='Comparison of prices for houses having a waterfront or not',
    xticklabels=waterfront_tick_labels,
)

ax_waterfront_count = plt.subplot2grid((2, 3), (0, 1))
sns.countplot(data=KC, x='waterfront', ax=ax_waterfront_count)
ax_waterfront_count.set(
    xlabel='',
    ylabel='Number of house',
    title='Number of waterfront and non waterfront houses',
    xticklabels=waterfront_tick_labels,
)

In [None]:
# Plot every row of KC by `long`/`lat`, coloured by `price`.
# The low alpha keeps overlapping points distinguishable.
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=KC,
    x='long',
    y='lat',
    hue='price',
    palette='cividis',
    alpha=0.15,
)

In [None]:
# For each price ceiling, print the percentage of rows in KC whose `price`
# is strictly below it. Messages are paired with their thresholds so the
# computation is written only once.
price_ceiling_messages = [
    ('Percentage of houses priced below 3 million USD: ', 3000000),
    ('Percentage of houses priced below 2.5 million USD: ', 2500000),
    ('Percentage of houses priced below 2 million USD: ', 2000000),
    ('Percentage of houses priced below 1.5 million USD: ', 1500000),
]
for message, ceiling in price_ceiling_messages:
    houses_below = KC[KC['price'] < ceiling]
    print(message, len(houses_below) / len(KC) * 100)

In [None]:

# Same long/lat scatter coloured by `price`, restricted to rows priced
# strictly below 2,000,000.
plt.figure(figsize=(10, 8))
houses_under_2m = KC[KC['price'] < 2000000]
sns.scatterplot(
    data=houses_under_2m,
    x='long',
    y='lat',
    hue='price',
    palette='cividis',
    alpha=0.15,
)

In [None]:
# Feature engineering: derive `age` and `renov_age`, then drop the raw
# identifier / date / year columns.
#
# NOTE: this cell consumes `date`, `yr_built` and `yr_renovated` (they are
# dropped at the end), so re-running it without first reloading KC raises a
# KeyError. Run it once per fresh load.

# Keep only the first four characters of each `date` string.
# NOTE(review): presumably these are the year (YYYY...) - confirm against the CSV.
KC['date'] = KC['date'].str[:4]

# Cast every column to float in a single pass so the arithmetic below works
# on one uniform dtype (this also turns the year strings into numbers).
KC = KC.astype(float)

# Age of the house: year taken from `date` minus the construction year.
KC['age'] = KC['date'] - KC['yr_built']

# Years between construction and renovation, 0.0 for houses never renovated.
# NOTE(review): assumes `yr_renovated == 0` marks "never renovated" - confirm.
# The earlier version detected un-renovated houses by checking that the gap
# had exactly two digits, which also zeroed genuine gaps of 1-9 years and of
# 100 years or more. Masking on `yr_renovated` itself keeps those values.
was_renovated = KC['yr_renovated'] > 0
renovation_gap = np.abs(KC['yr_renovated'] - KC['yr_built'])
KC['renov_age'] = renovation_gap.where(was_renovated, 0.0)

# Remove columns that are no longer needed: id, zipcode, and the raw
# date / yr_built / yr_renovated columns now summarised by the new features.
KC = KC.drop(columns=['id', 'zipcode', 'date', 'yr_built', 'yr_renovated'])
KC.head()