<a href="https://colab.research.google.com/github/Shreya-i/CSS_miniproject/blob/main/California_Houses_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Name: Shreya Ingle
Roll no.: 21102A0041

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning for the rest of the session: the 'ignore' action is
# installed with no category or module restriction, so nothing is shown.
warnings.filterwarnings('ignore')

In [None]:
# Read housing.csv from the Kaggle input directory into a DataFrame,
# then preview its first ten rows.
housing = pd.read_csv(
    filepath_or_buffer="/kaggle/input/california-housing-prices/housing.csv"
)
housing.head(n=10)

In [None]:
# Print the DataFrame summary produced by pandas' DataFrame.info().
housing.info()

In [None]:
# Show the descriptive statistics table produced by DataFrame.describe().
housing.describe()

In [None]:
# Count the distinct values in each column.
housing.nunique()

In [None]:
# Number of missing entries in each column.
housing.isna().sum()

In [None]:
# One 50-bin histogram per column, drawn on a 20x15 figure.
housing.hist(
    figsize=(20, 15),
    bins=50,
)


In [None]:
# Bucket median_income into five labelled bands (edges 0, 1.5, 3, 4.5, 6, inf)
# and store the result as a new 'income_category' column on housing.
housing['income_category'] = pd.cut(
    x=housing['median_income'],
    labels=[1, 2, 3, 4, 5],
    bins=[0, 1.5, 3, 4.5, 6, np.inf],
)

# Histogram of how many rows fall in each band.
housing['income_category'].hist()

In [None]:
import urllib.request
import io
import matplotlib.image as mpimg

# Download california.png from the handson-ml2 repository and decode it
# into an image array held in california_img.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
filename = "california.png"
print("Downloading", filename)
url = f"{DOWNLOAD_ROOT}images/end_to_end_project/{filename}"

# Wrap the raw response bytes in an in-memory buffer so imread can consume them.
with urllib.request.urlopen(url) as url_request:
    image_data = io.BytesIO(url_request.read())

california_img = mpimg.imread(image_data, format='png')

In [None]:
# Geographic scatter of the rows in housing, drawn over the California image:
# marker size follows population / 100, marker colour follows median_house_value.
ax = housing.plot(kind='scatter', x='longitude', y='latitude', figsize=(10,7), s=housing['population']/100,
                  c='median_house_value', colorbar=False, cmap=plt.get_cmap('jet'), alpha=0.4)

# `extent` pins the image to fixed x/y bounds so it sits under the scatter points.
plt.imshow(california_img, alpha=0.8, extent=[-124.55, -113.80, 32.45, 42.05], cmap=plt.get_cmap('jet'))

# The x axis carries longitude and the y axis carries latitude (see the plot call).
plt.xlabel('Longitude', fontsize=14)
plt.ylabel('Latitude', fontsize=14)

# The built-in colorbar is disabled in the plot call; build one here with
# eleven evenly spaced ticks labelled in thousands of dollars.
prices = housing["median_house_value"]
tick_values = np.linspace(prices.min(), prices.max(), 11)
cbar = plt.colorbar(ticks=tick_values/prices.max())
cbar.ax.set_yticklabels(["$%dk"%(round(v/1000)) for v in tick_values], fontsize=14)
cbar.set_label('Median House Value', fontsize=16)

In [None]:
# Pairwise correlations between the numeric columns of housing.
corr_matrix = housing.corr(numeric_only=True)

# Correlation of every numeric column with the target, in descending order.
corr_matrix.loc[:, 'median_house_value'].sort_values(ascending=False)


In [None]:
# Annotated heatmap of the correlation matrix using the 'Blues' colormap.
sns.heatmap(
    data=corr_matrix,
    cmap='Blues',
    annot=True,
)

In [None]:
# Work on a copy so the columns added to housing_eda do not appear on housing.
housing_eda = housing.copy()

In [None]:
# Derived ratio columns on the exploration copy.
# Rooms divided by households.
housing_eda['rooms_per_household'] = housing_eda['total_rooms'].div(housing_eda['households'])
# Bedrooms divided by rooms.
housing_eda['bedrooms_per_room'] = housing_eda['total_bedrooms'].div(housing_eda['total_rooms'])
# Population divided by households (the column key keeps its existing spelling).
housing_eda['population_per_houshold'] = housing_eda['population'].div(housing_eda['households'])

In [None]:
# Recompute the numeric correlations, this time on the frame with the ratio columns.
corr_matrix = housing_eda.corr(numeric_only=True)

# Correlation of every numeric column with the target, in descending order.
corr_matrix.loc[:, 'median_house_value'].sort_values(ascending=False)

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column of housing except the target; the target is median_house_value.
x = housing.drop(columns='median_house_value')
y = housing['median_house_value']

# Hold out 20% of the rows for testing. The fixed random_state makes the shuffle
# deterministic, so a Restart & Run All reproduces the same split and the same scores.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional indices of the columns read by CombinedAttributeAdder.transform.
# NOTE(review): assumes the array handed to transform keeps rooms, bedrooms,
# population and households at positions 3-6 - confirm against the column list
# fed into the numeric pipeline.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributeAdder(BaseEstimator, TransformerMixin):
    """Append log-scaled ratio columns to a 2-D numeric array.

    Each enabled flag appends one column, in this order:
    log1p(bedrooms / rooms), log1p(rooms / households),
    log1p(population / households).

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        Append the bedrooms-per-room column.
    add_rooms_per_household : bool, default True
        Append the rooms-per-household column.
    add_population_per_household : bool, default True
        Append the population-per-household column and, in the same step,
        replace the four source columns with their log1p values.
    """

    def __init__(self, add_bedrooms_per_room=True, add_rooms_per_household=True, add_population_per_household=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
        self.add_rooms_per_household = add_rooms_per_household
        self.add_population_per_household = add_population_per_household

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self."""
        return self

    def transform(self, X, y=None):
        """Return X with the enabled ratio columns appended on the right.

        X must be 2-D and numeric with at least ``households_ix + 1`` columns.
        The caller's array is never modified: every enabled branch rebuilds X
        through ``np.c_`` before anything is written into it.
        """
        X = np.asarray(X, dtype=float)

        if self.add_bedrooms_per_room:
            bedrooms_per_room = np.log1p(X[:, bedrooms_ix] / X[:, rooms_ix])
            X = np.c_[X, bedrooms_per_room]

        if self.add_rooms_per_household:
            rooms_per_household = np.log1p(X[:, rooms_ix] / X[:, households_ix])
            X = np.c_[X, rooms_per_household]

        if self.add_population_per_household:
            population_per_household = np.log1p(X[:, population_ix] / X[:, households_ix])
            X = np.c_[X, population_per_household]

            # Log-scale the source columns only after every ratio has been
            # computed from their raw values. The column slice X[:, ix] is
            # required here: X[ix] would select a row (one whole sample).
            # NOTE(review): this step stays gated on add_population_per_household
            # as before - confirm whether it should run unconditionally.
            for column_ix in (rooms_ix, bedrooms_ix, population_ix, households_ix):
                X[:, column_ix] = np.log1p(X[:, column_ix])

        return X

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing chain: median imputation, then the ratio-feature adder
# (bedrooms-per-room and rooms-per-household on, population-per-household off),
# then standard scaling.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('attrib_adder', CombinedAttributeAdder(
        add_population_per_household=False,
        add_rooms_per_household=True,
        add_bedrooms_per_room=True,
    )),
    ('std_scaler', StandardScaler()),
])

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric columns are picked by dtype; the single categorical column is listed by name.
num_attribs = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_attribs = ['ocean_proximity']

# Route numeric columns through num_pipeline and one-hot encode the categorical one.
# handle_unknown='ignore' encodes a category that was absent from the training split
# as an all-zero row instead of raising when the test split is transformed.
full_pipline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs)
])

# Fit on the training split only, then apply the fitted transformer to the test split.
processed_X_train = full_pipline.fit_transform(X_train)
processed_X_test = full_pipline.transform(X_test)

**Linear regression**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Fit a linear regression on the preprocessed training data.
reg = LinearRegression()
reg.fit(processed_X_train, Y_train)

# 10-fold cross-validation scored with negated MSE.
scores = cross_val_score(
    estimator=reg,
    X=processed_X_train,
    y=Y_train,
    cv=10,
    scoring='neg_mean_squared_error',
)

# Negate and take the square root to get per-fold RMSE, then print the mean.
print('cross validation scores :', np.mean(np.sqrt(-scores)), '\n')

In [None]:
from sklearn.metrics import accuracy_score

# Score the fitted regressor on the training split and on the held-out split.
train_score = reg.score(X=processed_X_train, y=Y_train)
test_score = reg.score(X=processed_X_test, y=Y_test)

# Report both scores as whole-number percentages.
print('Linear regression score: \n')
print('Train score : ', round(100 * train_score), '%')
print('Test score : ', round(100 * test_score), '%')