## Dragon Real Estate - Price Predictor

In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt

In [2]:
# Load the raw dataset from data.csv (path is relative to the notebook's working directory).
housing = pd.read_csv("data.csv")

In [3]:
# Preview the first five rows to sanity-check the columns and values.
housing.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
# Inspect dtypes and non-null counts. The output below shows `rm` with 501 of
# 506 non-null entries, i.e. it contains missing values that are handled later.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       501 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [None]:
# Frequency of each distinct `chas` value; this column is used further down as
# the stratification key for the train/test split.
housing['chas'].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
housing.describe()

In [None]:
# Histogram of every numeric column for a quick look at each distribution.
# The code used to be disabled inside a string literal, which made the cell
# echo the string instead of plotting. No %matplotlib magic is needed because
# current Jupyter renders figures inline by default.
# The trailing semicolon hides the array-of-Axes repr so only the figure shows.
housing.hist(bins=50, figsize=(20, 15));

## Train-Test Splitting

In [None]:
def split_train_test(data, test_ratio):
    """Randomly split ``data`` into a train set and a test set.

    Hand-rolled reference implementation kept for illustration; the notebook
    uses scikit-learn's ``train_test_split`` for the actual split.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose rows are partitioned.
    test_ratio : float
        Fraction of rows placed in the test set. The row count is truncated
        with ``int(len(data) * test_ratio)``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_set, test_set)``, selected positionally with ``iloc``.
    """
    # A private RandomState seeded with 42 produces the same permutation as
    # np.random.seed(42) + np.random.permutation, but leaves NumPy's global
    # random state untouched so other cells are not silently reseeded.
    rng = np.random.RandomState(42)
    shuffled = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
#train_set ,test_set = split_train_test(housing,0.2)

In [None]:
#print(f"Rows in train set : {len(train_set)}\nRows in test set :{len(test_set)}\n")

In [None]:
from sklearn.model_selection import train_test_split

# Purely random 80/20 split; random_state pins the shuffle so reruns agree.
train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
)
print(f"Rows in train set : {len(train_set)}\nRows in test set :{len(test_set)}\n")

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratify on `chas` so the train and test sets approximately preserve the
# proportion of each `chas` value found in the full dataset.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1 means split() yields exactly one (train, test) pair of index
# arrays, so take it directly instead of looping.
train_index, test_index = next(splitter.split(housing, housing["chas"]))

# split() returns positional indices, so select with iloc. loc only gave the
# same rows while `housing` still carried its default 0..n-1 index.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]


In [None]:
# From here on `housing` refers to a copy of the stratified training set, so
# the exploration below cannot modify strat_train_set.
# NOTE: this rebinds `housing`; re-running the split cells above after this
# cell would split the training copy rather than the full dataset.
housing=strat_train_set.copy()

In [None]:
# Summary statistics of the stratified training set.
strat_train_set.describe()

In [None]:
# Check how the `chas` values are distributed within the stratified training set.
strat_train_set['chas'].value_counts()

## Looking for correlations

In [None]:
# Pairwise correlations between all columns; then rank every column by its
# correlation with `medv`, largest value first.
corr_matrix = housing.corr()
corr_matrix.loc[:, "medv"].sort_values(ascending=False)

In [None]:
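# A value close to 1 means a strong positive correlation, close to -1 a strong negative correlation, and close to 0 little or no linear correlation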

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots restricted to `medv` and three columns of interest.
attributes = ["medv", "rm", "zn", "lstat"]
scatter_matrix(housing.loc[:, attributes], figsize=(12, 8))

In [None]:
# Scatter of `rm` against `medv`; alpha=0.5 makes overlapping points visible.
housing.plot.scatter(x="rm", y="medv", alpha=0.5)

In [None]:
# Scatter of `lstat` against `medv`; alpha=0.5 makes overlapping points visible.
housing.plot.scatter(x="lstat", y="medv", alpha=0.5)

## Trying out Attribute combinations 

In [None]:
# Engineered feature: ratio of `tax` to `rm`. Rows where `rm` is missing get NaN.
housing["TAXRM"] = housing["tax"].div(housing["rm"])

In [None]:
# Scatter of the engineered TAXRM ratio against `medv`.
housing.plot(kind='scatter',x='TAXRM',y='medv',alpha=0.5)  # TODO: remove the outliers visible in this plot

## Handling MISSING values

In [None]:
# Summary statistics again; the `count` row reveals columns with missing values.
housing.describe()

In [None]:
# Some `rm` values are missing. There are three options for handling missing values:
# 1. Get rid of the rows with missing data points
# 2. Get rid of the whole attribute
# 3. Impute the missing values (e.g. mean imputation or median imputation)

In [None]:
#drop values
a=housing.dropna(subset=["rm"]).shape
#a.shape

In [None]:
# Option 2: drop the whole `rm` column. drop returns a new frame, so the
# original `housing` data is not changed.
housing.drop("rm",axis=1).shape

In [None]:
# Median of the `rm` column, used as the fill value for option 3.
median=housing['rm'].median()

In [None]:
#Option 3
housing["rm"].fillna(median)
#originalhousing data will not get changed and if you want to change set implace= true

In [None]:
# Re-check the summary: none of the three options tried above modified `housing` in place.
housing.describe()

In [None]:
from sklearn.impute import SimpleImputer
# Median imputer: fit() computes and stores one median per column of `housing`
# (which at this point also contains the engineered TAXRM column and `medv`).
imputer = SimpleImputer(strategy="median")
imputer.fit(housing)

In [None]:
# The per-column medians learned by fit().
imputer.statistics_

In [None]:
# One learned statistic per column of the frame the imputer was fitted on.
imputer.statistics_.shape

In [None]:
# Replace the missing values with the learned medians. By default transform()
# returns a plain array rather than a DataFrame, hence the re-wrapping below.
x= imputer.transform(housing)

In [None]:
# Rebuild a DataFrame from the imputed array using the original column names.
# index=housing.index keeps the row labels of the (shuffled) training set;
# without it the frame gets a fresh 0..n-1 index and no longer aligns
# row-for-row with `housing` in index-based operations.
housing_tr = pd.DataFrame(x, columns=housing.columns, index=housing.index)

In [None]:
# The medians have been imputed into the data: every column of housing_tr
# should now report the same `count`.
housing_tr.describe()

## Scikit-learn Design

scikit-learn has primarily three types of objects:
1. Estimators - e.g. the imputer, which has both a `fit()` method and a `transform()` method. `fit()` takes the dataset and calculates the estimator's internal parameters.

2. Transformers - the `transform()` method takes an input and returns the transformed output. The convenience method `fit_transform()` fits and then transforms in a single call.

3. Predictors - e.g. Linear Regression, KNN. Two common functions: `fit()` and `predict()`. They also provide a `score()` function to evaluate the predictions.

## Creating a Pipeline

In [None]:
from sklearn.pipeline import Pipeline
