In [None]:
import pandas as pd 
import numpy as np
import time 
import ast
import seaborn as sns 
import matplotlib as plt
%matplotlib inline

# Raise pandas' display limits so long or wide frames are not elided in cell output.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 5000)

In [None]:
df=pd.read_csv("housing.csv")

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.isna().sum()

In [None]:
df.total_bedrooms.value_counts(dropna= False)

In [None]:
df[df.isna()==True].count()

In [None]:
df['total_bedrooms'].fillna(df['total_bedrooms'].median())

In [None]:
df.total_bedrooms.value_counts (dropna= False)

In [None]:
df[df.isna() ==True].count()

In [None]:
df['total_bedrooms'].fillna (df['total_bedrooms'].median())

In [None]:
df[df.isna() ==True].count()

In [None]:
#plt.figure(figsize=(10, 12))

#g=sns.pairplot(df.loc[:,df.columns != 'ocean_proximity'], kind='bar') #g.fig.set_size_inches (30, 30)

In [None]:
df.hist(bins=30, figsize=(20,20))

In [None]:
# Per-household ratios: each block-level total divided by the household count.
for ratio_col, total_col in (
    ('rooms_per_household', 'total_rooms'),
    ('population_per_household', 'population'),
    ('bedrooms_per_household', 'total_bedrooms'),
):
    df[ratio_col] = df[total_col] / df['households']

#List (df.columns)

In [None]:
print([(i,df['median_house_value'].corr(df[i])) for i in list (df.columns) if i != 'ocean_proximity']) #population Lowering the prices while income has 0.6 correlation which is increasing the prices

In [None]:
df[['median_income', 'median_house_value']]

In [None]:
# Joint distribution of median income vs. median house value with a fitted
# regression line. jointplot is a figure-level function: it creates its own
# figure and is sized through `height`, so no Figure is constructed beforehand.
# (A standalone Figure object built here would never be drawn on and would not
# affect the size of this plot.)
sns.jointplot(data=df, x='median_income', y='median_house_value', kind="reg",
              height=6.27, marginal_ticks=False, marker="o")


In [None]:
import matplotlib.image as mpimg
california_img = mpimg.imread("california.png")

In [None]:
import matplotlib.pyplot as plt

df.plot(kind="scatter", x="longitude", y="latitude",
                       s=df.population/100, label="Population", figsize = (15, 10),
                       c="median_house_value", cmap="coolwarm",
                       colorbar=True, alpha=0.4, fontsize = 20, sharex = False)
                      
plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5,
           cmap=plt.get_cmap("jet"))

plt.ylabel("Latitude", fontsize=14)
plt.xlabel("Longitude", fontsize=14)
plt.title("House Prices in California", fontsize = 20)
plt.legend(fontsize=16)
plt.show()

In [None]:
# Bucket every row by median income, using the 25th / 50th / 75th / 90th
# percentiles of the column as cut points:
#   (-inf, p25] -> "low", (p25, p50] -> "Below Average", (p50, p75] -> "Above Average",
#   (p75, p90]  -> "High", (p90, inf) -> "Very High"
#
# pd.cut is used instead of np.select(..., default=np.nan): string choices and a
# float NaN default have no common dtype, which raises a TypeError on recent
# NumPy and silently produced the *string* 'nan' on older releases. pd.cut
# intervals are right-closed by default, matching the original `<=` bounds, and
# rows with a missing income come out as a real NaN.
# The result is an ordered categorical, so plots keep the buckets in rank order.
# pd.cut raises ValueError if two percentiles coincide (bin edges must be unique).
income_edges = np.percentile(df['median_income'], [25, 50, 75, 90])
choices = ["low", "Below Average", "Above Average", "High", "Very High"]

df['income_cat'] = pd.cut(
    df['median_income'],
    bins=[-np.inf, *income_edges, np.inf],
    labels=choices,
)

In [None]:
#pd.qcut() alternative to map

#sns.countplot(data=df,x='ocean_proximity', hue='median_income')

plt.figure(figsize=(15,15))

sns.countplot(data=df, x = "income_cat", hue="ocean_proximity")

In [None]:
plt.figure(figsize=(15,15))

sns.barplot(data=df, x='income_cat', y='median_house_value')

In [None]:
plt.figure(figsize= (15,15))

sns.barplot(data=df, x='ocean_proximity',y='median_house_value')

In [None]:
sns.heatmap(data=df.corr(), center=True)

In [None]:
df.columns

In [None]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

In [None]:
df=pd.get_dummies(df, columns=['ocean_proximity'])

In [None]:
df.rename(columns={"ocean_proximity_<1H OCEAN": 'ocean_proximity less than 1H OCEAN'}, inplace=True)

In [None]:
df.columns

In [None]:
X=df[['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
        'rooms_per_household', 'population_per_household',
       'bedrooms_per_household', 
       'ocean_proximity less than 1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN']] 
y=df['median_house_value']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
X_train[X_train.isna() ==True].count()

In [None]:
df = df.reset_index()

In [None]:
model=LinearRegression()

In [None]:
model.fit(X_train,y_train)

In [None]:
X_test = np.nan_to_num(X_test)

In [None]:
pred=model.predict(X_test)

In [None]:
y_test = np.nan_to_num(y_test)

In [None]:
from sklearn.metrics import classification_report, accuracy_score, mean_absolute_error,mean_squared_error

# Hold-out error of the linear baseline: MAE first, then RMSE.
linreg_mae = mean_absolute_error(y_test, pred)
linreg_rmse = np.sqrt(mean_squared_error(y_test, pred))

print(linreg_mae)
print(linreg_rmse)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest regressor with a fixed seed; used as the base estimator for the
# hyper-parameter searches in the following cells.
rfg = RandomForestRegressor(random_state=42)

In [None]:
# Names of the hyper-parameters the estimator exposes.
rfg.get_params().keys()

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [None]:
# Search space sampled by RandomizedSearchCV for the random forest.
n_estimators = list(range(200, 1000, 50))        # 200, 250, ..., 950 trees
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For regressors
# it meant "consider all features", which the fraction 1.0 expresses on every
# scikit-learn version.
max_features = [1.0, 'sqrt']
max_depth = list(range(10, 100, 5))              # 10, 15, ..., 95
max_depth.append(None)                           # None = no depth limit
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
random_grid

In [None]:
randomisedsearchcv =RandomizedSearchCV(estimator=rfg,param_distributions=random_grid,
                                      n_iter=100,verbose=3, cv=2, random_state=100,n_jobs=-1)

In [None]:

try: 
     randomisedsearchcv.fit(X_train,y_train) 
except KeyError as Key_error: 
    print(Key_error)

In [None]:
randomisedsearchcv.best_params_

In [None]:
randomisedsearchcv.best_params_['min_samples_leaf']

In [None]:
param_grid = {
              'max_depth': [randomisedsearchcv.best_params_['max_depth']], 
              'max_features': [randomisedsearchcv.best_params_['max_features']],
              'min_samples_leaf': [randomisedsearchcv.best_params_['min_samples_leaf'], 
                                   randomisedsearchcv.best_params_['min_samples_leaf']+2, 
                                   randomisedsearchcv.best_params_['min_samples_leaf']+1], 
              'min_samples_split': [randomisedsearchcv.best_params_['min_samples_split'],
                                    randomisedsearchcv.best_params_['min_samples_split']+1,
                                   randomisedsearchcv.best_params_['min_samples_split']-1],
              'n_estimators': [randomisedsearchcv.best_params_['n_estimators']+100, 
                               randomisedsearchcv.best_params_['n_estimators']-100,
                               randomisedsearchcv.best_params_['n_estimators']]
                    }

In [None]:
param_grid

In [None]:
gridsearch=GridSearchCV(estimator=rfg, param_grid=param_grid, cv=2,n_jobs=1, verbose=2)

In [None]:
gridsearch.fit(X_train,y_train)

In [None]:
best_grid=gridsearch.best_estimator_

In [None]:
best_grid

In [None]:
y_predict=best_grid.predict(X_test)

In [None]:
finaldf=pd.DataFrame(data={"Prediction": y_predict, "y_test":y_test})

In [None]:
print(np.sqrt(mean_squared_error(y_test,y_predict)))

In [None]:
finaldf