In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 


In [None]:
# Read the house-sales CSV from the Kaggle input mount into the working DataFrame.
DATA_PATH = '/kaggle/input/housesalesprediction/kc_house_data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Show the freshly loaded DataFrame as the cell output.
df

In [None]:
# Preview the first five rows (five is the default for `head`).
df.head()

In [None]:
# Print the column names, non-null counts and dtypes of the DataFrame.
df.info()

In [None]:
# Summary statistics, transposed so each column of the data becomes one row.
df.describe().T

In [None]:
# Number of missing values in each column.
df.isna().sum()

"""
 As we can see, there are no null entries in the data.
 

## The Data (insight)


#### Feature Columns
 
* id - Unique ID for each home sold
* date - Date of the home sale
* price - Price of each home sold
* bedrooms - Number of bedrooms
* bathrooms - Number of bathrooms, where .5 accounts for a room with a toilet but no shower
* sqft_living - Square footage of the apartment's interior living space
* sqft_lot - Square footage of the land space
* floors - Number of floors
* waterfront - A dummy variable for whether the apartment was overlooking the waterfront or not
* view - An index from 0 to 4 of how good the view of the property was
* condition - An index from 1 to 5 on the condition of the apartment
* grade - An index from 1 to 13, where 1-3 falls short of building construction and design, 7 has an average level of construction and design, and 11-13 have a high quality level of construction and design.
* sqft_above - The square footage of the interior housing space that is above ground level
* sqft_basement - The square footage of the interior housing space that is below ground level
* yr_built - The year the house was initially built
* yr_renovated - The year of the house’s last renovation
* zipcode - What zipcode area the house is in
* lat - Latitude
* long - Longitude
* sqft_living15 - The square footage of interior housing living space for the nearest 15 neighbors
* sqft_lot15 - The square footage of the land lots of the nearest 15 neighbors

"""

**Exploratory data analysis**

In [None]:
# Distribution of sale prices: density histogram with a KDE curve on top.
# `sns.distplot` has been deprecated since seaborn 0.11; the documented
# replacement for this figure is `histplot` with `kde=True` and
# `stat='density'` (distplot drew a density-normalised histogram).
plt.figure(figsize=(12,12),edgecolor='yellow')
sns.histplot(df['price'], kde=True, stat='density')

In [None]:
# Number of homes for each bedroom count.
# The column is given by keyword: newer seaborn releases treat the first
# positional argument as `data`, so passing the Series positionally no longer
# means "count the values of x".
sns.countplot(x='bedrooms', data=df)

In [None]:
# Scatter of sale price (x) against interior living area (y) on a 12x8 figure.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x='price',
    y='sqft_living',
)

In [None]:
# Spread of sale prices within each bedroom count.
sns.boxplot(data=df, x='bedrooms', y='price')

**Geographical Properties**

In [None]:
# Scatter of sale price (x) against longitude (y).
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x='price',
    y='long',
)

In [None]:
# Scatter of sale price (x) against latitude (y).
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x='price',
    y='lat',
)

In [None]:
# Longitude/latitude scatter of every sale, with point colour encoding the price.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x='long',
    y='lat',
    hue='price',
)

In [None]:
# The twenty most expensive sales, highest price first.
by_price_desc = df.sort_values(by='price', ascending=False)
by_price_desc.head(20)

In [None]:
# How many rows correspond to 1% of the dataset.
0.01 * len(df)

In [None]:
# Keep every sale except the most expensive 1%.
# The number of rows to skip is derived from the frame's length instead of a
# hard-coded row count, so the cut stays at 1% if the data changes.
# NOTE(review): presumably this trims extreme prices so they do not dominate
# the colour scale of the price-coloured map that uses this frame - confirm.
top_1_perc_count = round(len(df) * 0.01)
non_top_1_perc = df.sort_values('price',ascending=False).iloc[top_1_perc_count:]

In [None]:
# Show the price-sorted frame with the top 1% of sales removed.
non_top_1_perc

In [None]:
# Longitude/latitude scatter of the sales outside the top 1% by price.
# Points are coloured by price on the red-yellow-green palette, drawn without
# marker outlines and mostly transparent so dense areas remain readable.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=non_top_1_perc,
    x='long',
    y='lat',
    hue='price',
    palette='RdYlGn',
    edgecolor=None,
    alpha=0.2,
)

**OTHER FEATURES**

In [None]:
# Price spread for each value of the waterfront indicator.
sns.boxplot(data=df, x='waterfront', y='price')

**WORKING WITH FEATURE DATA**

In [None]:
# Look at the first five rows again before editing the feature columns.
df.head(5)

In [None]:
# Print column names, non-null counts and dtypes before changing any columns.
df.info()

**Here we don't need the `id` column, because the DataFrame's built-in index can act as the primary key of the data.**

In [None]:
# Remove the `id` column from the working frame.
# Reassigning the result (instead of `inplace=True`) together with
# `errors='ignore'` makes this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-missing column.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# Preview the first five rows after removing `id`.
df.head()

**Feature Engineering from date**

In [None]:
# Convert the `date` column to pandas datetimes, overwriting the column in place.
df['date'] = df['date'].pipe(pd.to_datetime)

In [None]:
# Month of sale (1-12) as its own feature column.
# Uses the vectorised `.dt` accessor instead of a per-row Python lambda;
# like the lambda, it needs `date` to already hold datetime values.
df['month'] = df['date'].dt.month

In [None]:
# Year of sale as its own feature column.
# Uses the vectorised `.dt` accessor instead of a per-row Python lambda;
# like the lambda, it needs `date` to already hold datetime values.
df['year'] = df['date'].dt.year

In [None]:
# Price spread for each sale year.
sns.boxplot(data=df, x='year', y='price')

In [None]:
# Price spread for each sale month.
sns.boxplot(data=df, x='month', y='price')

In [None]:
# Line plot of the average sale price per month.
# `price` is selected before aggregating so only that column is averaged;
# averaging the whole frame first does needless work and raises on any
# non-numeric column in recent pandas versions.
df.groupby('month')['price'].mean().plot()

In [None]:
# Line plot of the average sale price per year.
# `price` is selected before aggregating so only that column is averaged;
# averaging the whole frame first does needless work and raises on any
# non-numeric column in recent pandas versions.
df.groupby('year')['price'].mean().plot()

**NOW WE DON'T NEED THE DATE**

In [None]:
# Show the DataFrame after the `month` and `year` columns were added.
# NOTE(review): no cell visible up to this point drops the `date` column - confirm it is removed later.
df

In [None]:
# List the column labels currently in the DataFrame.
df.columns

In [None]:
# How many sales fall in each zipcode, most frequent first.
zipcode_counts = df['zipcode'].value_counts()
zipcode_counts

In [None]:
# Remove the `zipcode` column from the working frame.
df = df.drop(columns=['zipcode'])

In [None]:
# Preview the first five rows after removing `zipcode`.
df.head(5)

In [None]:
# Frequency of each `yr_renovated` value, most frequent first.
yr_renovated_counts = df['yr_renovated'].value_counts()
yr_renovated_counts

In [None]:
# Frequency of each `sqft_basement` value, most frequent first.
sqft_basement_counts = df['sqft_basement'].value_counts()
sqft_basement_counts

**SCALING AND TRAIN_TEST_SPLIT**