<a href="https://colab.research.google.com/github/Nithinps021/DeepLearning/blob/master/hands__on_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
import os
import tarfile
from six.moves import urllib

In [40]:
# Base URL that the dataset archive is downloaded from.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Dataset directory, appended to DOWNLOAD_ROOT to form the archive URL.
HOUSING_PATH = "datasets/housing"
# Full URL of the compressed housing archive.
HOUSING_URL = "{}{}/housing.tgz".format(DOWNLOAD_ROOT, HOUSING_PATH)

def fetch_data(housing_url=HOUSING_URL,housing_path=HOUSING_PATH):
  """Download the housing archive and unpack it below ``sample_data/``.

  Args:
    housing_url: URL of the ``housing.tgz`` archive to download.
    housing_path: Local directory that receives the downloaded archive.
      The archive contents are extracted to ``'sample_data/' + housing_path``.

  Raises:
    OSError: if the directory cannot be created or the download fails.
    tarfile.TarError: if the downloaded file is not a readable archive.
  """
  # exist_ok replaces the isdir()/makedirs() pair and its check-then-create race.
  os.makedirs(housing_path, exist_ok=True)
  tgz_path = os.path.join(housing_path, 'housing.tgz')
  urllib.request.urlretrieve(housing_url, tgz_path)
  # The context manager closes the archive even when extraction fails.
  # NOTE(review): extractall() trusts the member paths stored in the archive;
  # only point housing_url at a source you trust.
  with tarfile.open(tgz_path) as housing_tgz:
    housing_tgz.extractall(path='sample_data/' + housing_path)
  
# Download and extract the dataset (with the default URL and path) when this cell runs.
fetch_data()


In [41]:
import pandas as pd

def loadHousingData(housing_path='sample_data/'+HOUSING_PATH):
    """Read ``housing.csv`` from *housing_path* and return it as a DataFrame."""
    csv_file = os.path.join(housing_path, 'housing.csv')
    housing_frame = pd.read_csv(csv_file)
    return housing_frame

In [None]:
# Load the raw housing table.
housingData = loadHousingData()
# Drop the rows whose ocean_proximity is 'ISLAND'. The explicit copy makes the
# filtered result an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
housingData = housingData[housingData['ocean_proximity'] != 'ISLAND'].copy()
housingData.describe()

In [None]:
# Show the column labels of the housing table.
housingData.columns
# housingData["median_income"].value_counts()

In [None]:
import matplotlib.pyplot as plt

# Draw the histograms of the housing table with 50 bins each, in green.
housingData.hist(
    bins=50,
    figsize=(20, 15),
    color='green',
)
# plt.show()

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit


# def shuffle_data(data,testSize):
#   shuffled_indices=np.random.permutation(len(data))
#   test_size=int(len(data)*testSize)
#   test_indices=shuffled_indices[:test_size]
#   train_indices=shuffled_indices[test_size:]
#   return data.iloc[test_indices] ,  data.iloc[train_indices]

# testData,trainData = shuffle_data(housingData,0.2)

# using sklearn train_test_split method

# from sklearn.model_selection import train_test_split
# train_data, test_data=train_test_split(housingData,test_size=0.2,random_state=42)

# print(len(train_data),len(test_data))


# StratifiedShuffleSplit

# Income category used only for stratification: ceil(median_income / 1.5),
# with every value of 5 or more collapsed into category 5.
housingData['income_cat'] = np.ceil(housingData['median_income'] / 1.5)
# Assign the capped column back instead of calling where(..., inplace=True) on
# the selected column: under pandas Copy-on-Write the in-place form only
# updates a temporary object and the cap would be silently lost.
housingData['income_cat'] = housingData['income_cat'].where(
    housingData['income_cat'] < 5, 5.0)

splitData = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) index pair.
trainIndex, testIndex = next(
    splitData.split(housingData, housingData['income_cat']))

# drop() returns new frames without the helper column; this avoids the
# SettingWithCopyWarning raised by an in-place drop on an iloc selection.
train_data = housingData.iloc[trainIndex].drop(['income_cat'], axis=1)
test_data = housingData.iloc[testIndex].drop(['income_cat'], axis=1)

# Continue exploring on a copy of the training set only.
housingData = train_data.copy()

In [None]:

# Longitude/latitude scatter plot: marker size follows population / 100 and
# marker colour follows median_house_value (jet colormap).
housingData.plot(
    kind='scatter',
    x='longitude',
    y='latitude',
    alpha=0.4,
    s=housingData['population'] / 100,
    label='population',
    c='median_house_value',
    cmap=plt.get_cmap('jet'),
    colorbar=True,
)
plt.legend()

In [None]:
# correlation matrix
# Restrict to numeric columns first: 'ocean_proximity' holds text, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 instead of
# silently skipping them.
cor_matrix = housingData.select_dtypes(include='number').corr()
# Correlation of every numeric column with median_income, strongest first.
cor_matrix['median_income'].sort_values(ascending=False)

In [45]:
# separating training data and training labels

# Labels: an independent copy of the target column.
housingLabel = train_data['median_house_value'].copy()
# Features: the training set without the target column.
housingData = train_data.drop(['median_house_value'], axis=1)
  

In [None]:
# data cleaning using pipeline
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,LabelBinarizer,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import Pipeline,FeatureUnion

# Positional indices of the columns CombineAttribute reads from its 2-D input.
# NOTE(review): these must match the column order of the array fed to the
# transformer — confirm whenever the numeric attribute list changes.
rooms, bedrooms, population, household = 3, 4, 5, 6


class CombineAttribute(BaseEstimator, TransformerMixin):
    """Append ratio features computed from the room/population/household columns.

    Args:
        add_bedroom_per_room: when true, ``transform`` also appends the
            bedrooms / rooms ratio (placed before the other two new columns).
    """

    def __init__(self, add_bedroom_per_room=True):
        self.add_bedroom_per_room = add_bedroom_per_room

    def fit(self, X, Y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X, Y=None):
        """Return ``X`` with the derived ratio columns appended on the right.

        ``X`` is indexed as a 2-D array (``X[:, i]``). The appended columns
        are, in order: bedrooms/rooms (optional), rooms/household and
        population/household.
        """
        rooms_per_household = X[:, rooms] / X[:, household]
        population_per_household = X[:, population] / X[:, household]
        if self.add_bedroom_per_room:
            bedroom_per_room = X[:, bedrooms] / X[:, rooms]
            # No debug print here: transform() runs on every fit/predict and
            # every cross-validation fold, so it must stay silent.
            return np.c_[X, bedroom_per_room, rooms_per_household,
                         population_per_household]
        return np.c_[X, rooms_per_household, population_per_household]

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that pulls the named columns out of its input as an array."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, Y=None):
        """No fitting is required; return the transformer itself."""
        return self

    def transform(self, X, Y=None):
        """Return ``X[self.attribute_names]`` converted through ``.values``."""
        selected_columns = X[self.attribute_names]
        return selected_columns.values
    
class CatBinarlabelizer(BaseEstimator, TransformerMixin):
    """Wrap a ``LabelBinarizer`` behind ``fit(X, Y)`` / ``transform(X, Y)``.

    NOTE(review): presumably needed because the wrapped encoder's own methods
    do not accept the extra ``Y`` argument a pipeline passes — confirm.
    """

    def __init__(self):
        self.encoder = LabelBinarizer()

    def fit(self, X, Y=None):
        """Fit the wrapped encoder on ``X`` and return ``self``."""
        self.encoder.fit(X)
        return self

    def transform(self, X, Y=None):
        """Return the wrapped encoder's transformation of ``X``."""
        binarized = self.encoder.transform(X)
        return binarized

# Numeric attributes: every column except the categorical 'ocean_proximity'.
housing_num = housingData.drop("ocean_proximity", axis=1)
attributes = housing_num.columns.tolist()
catAttribute = ['ocean_proximity']

# Numeric branch: select columns -> median imputation -> ratio features -> scaling.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(attributes)),
    ('impute', SimpleImputer(strategy='median')),
    ('attribue_adder', CombineAttribute()),
    ('normalise', StandardScaler()),
])

# Categorical branch: select the category column -> binarize its labels.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(catAttribute)),
    ('labelBinarizer', CatBinarlabelizer()),
])

# Run both branches on the same input and join their outputs.
combinePipeline = FeatureUnion(
    transformer_list=[
        ('num_pipeline', num_pipeline),
        ('cat_pipeline', cat_pipeline),
    ],
)

housingTraining = combinePipeline.fit_transform(housingData)
housingTraining.shape

In [None]:
# data cleaning


# imputer=SimpleImputer(strategy='median')
# housing_num=housingData.drop("ocean_proximity",axis=1)
# imputer.fit(housing_num)
# X=imputer.transform(housing_num)
# housing_tr=pd.DataFrame(X,columns=housing_num.columns)
# norm=StandardScaler()
# normalised=norm.fit_transform(housing_tr)
# housing_num[number].values


In [None]:
# text and categorical attributes

# housing_cat=housingData['ocean_proximity']

# # encoder=LabelEncoder()
# # housing_cat_encode=encoder.fit_transform(housing_cat)
# # hotEncoder=OneHotEncoder()
# # housing_cat_1hot=hotEncoder.fit_transform(housing_cat_encode.reshape(-1,1))
# # housing_cat_1hot.toarray()

# encoder=LabelBinarizer()
# housing_cat_1hot=encoder.fit_transform(housing_cat)
# housing_cat_1hot


In [52]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

def DisplayScores(score):
    """Print the given scores followed by their mean and standard deviation."""
    print("score : ", score)
    # The remaining lines share one shape: a label plus a statistic of `score`.
    for label, statistic in (("mean : ", "mean"), ("std : ", "std")):
        print(label, getattr(score, statistic)())


# Cross-validate a linear regression with cv=10. The scorer returns negated
# mean squared errors, hence the sign flip before taking the square root.
lin_reg = LinearRegression()
scores = cross_val_score(
    lin_reg,
    housingTraining,
    housingLabel.values,
    scoring='neg_mean_squared_error',
    cv=10,
)
rms_scores = np.sqrt(-scores)
DisplayScores(rms_scores)

# lin_reg.fit(housingTraining,housingLabel)
# some_data=housingData.iloc[:50]
# some_data_to_predic=combinePipeline.fit_transform(some_data)
# some_label=housingLabel.iloc[:50].values
# some_data_to_predic
# predicted_values=lin_reg.predict(some_data_to_predic)
# print(predicted_values[:5],'\n',some_label[:5])

score :  [68469.09832551 69034.42044851 66377.52837338 68982.05960585
 67421.45808313 68900.13835611 72489.20120817 67821.26099999
 70340.17763808 68693.87187002]
mean :  68852.92149087616
std :  1577.9704665472154
