In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:
# Load the California housing CSV from the read-only Kaggle input directory.
HOUSING_CSV_PATH = '../input/california-housing-prices/housing.csv'
housing_data = pd.read_csv(HOUSING_CSV_PATH)

In [None]:
# Preview the first few rows to sanity-check the load.
housing_data.head()

In [None]:
# Summary statistics of the loaded frame.
housing_data.describe()

In [None]:
# Column names available in the dataset.
housing_data.columns

In [None]:
# (rows, columns) of the frame as loaded, before any cleaning.
housing_data.shape

# Visualizing distributions of data


In [None]:
# One 50-bin histogram per column to inspect each distribution.
housing_data.hist(figsize=(20, 15), bins=50)
plt.show()

# Visualizing boxplot to find data outliers

In [None]:
# Box plot of each numeric feature to spot outliers.
# The panels are drawn in a loop (instead of eight copy-pasted calls) onto a
# 3x3 grid, filled row by row in the order listed here.
BOXPLOT_FEATURES = [
    'longitude', 'latitude', 'housing_median_age',
    'total_rooms', 'total_bedrooms', 'population',
    'households', 'median_income',
]

fig, axes = plt.subplots(3, 3, figsize=(18, 10))
box_panels = list(axes.flat)

for ax, feature in zip(box_panels, BOXPLOT_FEATURES):
    sns.boxplot(ax=ax, data=housing_data, x=feature)

# The grid has nine slots but only eight features: hide the empty panel(s).
for spare_ax in box_panels[len(BOXPLOT_FEATURES):]:
    spare_ax.set_visible(False)

# Better understanding the relation between the features and the output price

In [None]:
# Scatter plot of each numeric feature against the target (median_house_value),
# coloured by ocean_proximity.
# The panels are drawn in a loop (instead of eight copy-pasted calls) onto a
# 3x3 grid, filled row by row in the order listed here.
SCATTER_FEATURES = [
    'longitude', 'latitude', 'housing_median_age',
    'total_rooms', 'total_bedrooms', 'population',
    'households', 'median_income',
]

fig, axes = plt.subplots(3, 3, figsize=(18, 10))
scatter_panels = list(axes.flat)

for ax, feature in zip(scatter_panels, SCATTER_FEATURES):
    sns.scatterplot(ax=ax, data=housing_data, x=feature,
                    y='median_house_value', hue='ocean_proximity')

# The grid has nine slots but only eight features: hide the empty panel(s).
for spare_ax in scatter_panels[len(SCATTER_FEATURES):]:
    spare_ax.set_visible(False)

In [None]:
# Pearson correlation between the numeric columns.
# 'ocean_proximity' still holds text at this point, and recent pandas versions
# raise on non-numeric columns instead of silently skipping them, so restrict
# the frame to numeric dtypes before calling corr().
housing_data.select_dtypes(include='number').corr(method='pearson')

# Data cleaning

In [None]:
# Report how many missing values each column contains.
null_counts = housing_data.isnull().sum()
for column in null_counts.index:
    print("null data in " , column , "= ", null_counts[column])

In [None]:
# Drop every row with at least one missing value, then confirm none remain.
# The check covers all columns: the previous version indexed with `column`,
# a loop variable leaked from the null-count cell, so it inspected only the
# last column iterated there.
housing_data = housing_data.dropna()
housing_data.isnull().sum()

In [None]:
# Distinct categories present in the ocean_proximity column.
housing_data["ocean_proximity"].unique()

In [None]:
# Number of rows per ocean_proximity category.
housing_data["ocean_proximity"].value_counts()

In [None]:
# Count plot of the ocean_proximity categories.
sns.catplot(x='ocean_proximity',data=housing_data,kind="count")

In [None]:
# Replace each ocean_proximity label with an integer code in a single pass.
# NOTE(review): "NEAR BAY" and "NEAR OCEAN" both map to 0 — confirm this merge
# of the two categories is intended.
proximity_codes = {
    "NEAR BAY": 0,
    "NEAR OCEAN": 0,
    "<1H OCEAN": 1,
    "INLAND": 2,
    "ISLAND": 3,
}
housing_data["ocean_proximity"] = housing_data["ocean_proximity"].replace(proximity_codes)

In [None]:
# Remove the ocean_proximity column from the frame.
# Author's recorded observation: without dropping this column the error is
# too big (67774.38988264417).
housing_data = housing_data.drop(columns=['ocean_proximity'])

# dropping outliers

In [None]:
# Keep only rows with fewer than 6000 households; larger values are treated as outliers.
filt = housing_data["households"].lt(6000)
housing_data = housing_data[filt]
housing_data.shape

In [None]:
# Keep only rows with population below 20000; larger values are treated as outliers.
filt = housing_data["population"].lt(20000)
housing_data = housing_data[filt]
housing_data.shape

In [None]:
# Keep only rows with fewer than 6000 total bedrooms; larger values are treated as outliers.
filt = housing_data["total_bedrooms"].lt(6000)
housing_data = housing_data[filt]
housing_data.shape

In [None]:
# Keep only rows with fewer than 35000 total rooms; larger values are treated as outliers.
# Author's recorded observation: dropping the outliers gave better results.
filt = housing_data["total_rooms"].lt(35000)
housing_data = housing_data[filt]
housing_data.shape

# split the input and the output

In [None]:

# Separate the features from the target by column NAME, not by position.
# With ocean_proximity dropped the frame has nine columns (assuming the
# standard housing.csv schema), so the previous `iloc[:, :9]` selected every
# column — including median_house_value — and leaked the target into X. That
# leak is what produced the near-zero errors for every model.
TARGET_COLUMN = 'median_house_value'
X = housing_data.loc[:, housing_data.columns != TARGET_COLUMN]
Y = housing_data[TARGET_COLUMN]
print(X.shape)
print(Y.shape)


# Splitting the train_set and test_set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state is fixed so the split
# is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=20)

# A bare `X_train.shape` in the middle of a cell is evaluated and discarded,
# so print it to actually show the size of the training split.
print(X_train.shape)
X_train.head()

# scale the input 

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()  # defaults: copy=True, with_mean=True, with_std=True
scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)
X_test_scale

# using Ridge

In [None]:
from sklearn.linear_model import Ridge

# Fit a Ridge regressor (alpha=0.01) on the scaled training split.
ridge_reg = Ridge(alpha=0.01).fit(X_train_scale, Y_train)
ridge_reg

In [None]:
# Ridge predictions for the scaled test split.
Y_predict=ridge_reg.predict(X_test_scale)

In [None]:
# R^2 of the Ridge model on the held-out split. score() must be given the
# TRUE targets (Y_test): passing the model's own predictions compares them
# with themselves and always reports a perfect 100%.
score = ridge_reg.score(X_test_scale, Y_test)
print("accuracy is : ",score*100 , "%")

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the Ridge predictions on the test split.
MSE = mean_squared_error(y_true=Y_test, y_pred=Y_predict)
rmse_test = np.sqrt(MSE)
print(rmse_test)
# Values recorded by the author for other alpha settings:
#   alpha=1  => 9.037097028992218e-10
#   alpha=5  => 4.551486323293706e-09  (marked "the best")
#   alpha=10 => 9.032141529746062e-09
# NOTE(review): errors this close to zero are suspicious for a house-price
# target — verify that X does not contain the target column.

In [None]:
# Ridge predictions for the scaled training split (used for the train-side metrics).
Y_predict_train=ridge_reg.predict(X_train_scale)

In [None]:
# Training-split RMSE for Ridge, to compare against the test-split RMSE.
MSE = mean_squared_error(y_true=Y_train, y_pred=Y_predict_train)
rmse_train = np.sqrt(MSE)
print(rmse_train)  # author's note: the model doesn't suffer from overfitting or underfitting

In [None]:
from sklearn.metrics import r2_score

# R^2 of the current test-split predictions against the true targets.
r2_test = r2_score(Y_test, Y_predict)
print(r2_test)

# using Lasso 

In [None]:
from sklearn.linear_model import Lasso

# Fit a Lasso regressor (alpha=4.7) on the scaled training split.
lasso_reg = Lasso(alpha=4.7).fit(X_train_scale, Y_train)
lasso_reg

In [None]:
# Lasso predictions for the scaled test split (overwrites the previous Y_predict).
Y_predict=lasso_reg.predict(X_test_scale)

In [None]:
# R^2 of the Lasso model on the held-out split. score() must be given the
# TRUE targets (Y_test): passing the model's own predictions compares them
# with themselves and always reports a perfect 100%.
score = lasso_reg.score(X_test_scale, Y_test)
print("accuracy is : ",score*100 , "%")

In [None]:
# Root-mean-squared error of the Lasso predictions on the test split.
MSE = mean_squared_error(y_true=Y_test, y_pred=Y_predict)
rmse_test = np.sqrt(MSE)
print(rmse_test)
# Values recorded by the author for other alpha settings:
#   alpha=1  => 0.1681449647887946
#   alpha=5  => 0.09263927846218097
#   alpha=9  => 0.09731598695208972
#   alpha=10 => 0.08287387008903957  (marked "the best")
#   alpha=15 => 0.11404621947351543
#   alpha=20 => 0.14574983885366918

In [None]:
# Training-split RMSE for Lasso, to compare against the test-split RMSE.
Y_predict_train = lasso_reg.predict(X_train_scale)
MSE = mean_squared_error(y_true=Y_train, y_pred=Y_predict_train)
rmse_train = np.sqrt(MSE)
print(rmse_train)  # author's note: the model doesn't suffer from overfitting or underfitting

In [None]:
# R^2 of the Lasso predictions on the test split.
print(r2_score(Y_test, Y_predict))

In [None]:
# R^2 of the Lasso predictions on the training split.
print(r2_score(Y_train, Y_predict_train))

# using elastic

In [None]:
from sklearn.linear_model import ElasticNet

# Fit an ElasticNet regressor on the scaled training split.
# NOTE(review): per scikit-learn's ElasticNet documentation, l1_ratio=1 is a
# pure L1 penalty, i.e. equivalent to Lasso — confirm this is the intended mix.
elastic_reg = ElasticNet(l1_ratio=1, alpha=1).fit(X_train_scale, Y_train)
elastic_reg

In [None]:
# ElasticNet predictions for the scaled test split (overwrites the previous Y_predict).
Y_predict=elastic_reg.predict(X_test_scale)

In [None]:
# R^2 of the ElasticNet model on the held-out split. score() must be given
# the TRUE targets (Y_test): passing the model's own predictions compares
# them with themselves and always reports a perfect 100%.
score = elastic_reg.score(X_test_scale, Y_test)
print("accuracy is : ",score*100 , "%")

In [None]:
# Root-mean-squared error of the ElasticNet predictions on the test split.
MSE = mean_squared_error(y_true=Y_test, y_pred=Y_predict)
rmse_test = np.sqrt(MSE)
print(rmse_test)
# Values recorded by the author for other (alpha, l1_ratio) settings:
#   alpha=1  => 0.18001525491166995 , ratio=0.5
#   alpha=5  => 0.1333989080984809  , ratio=0.5
#   alpha=9  => 0.09738099019410122 , ratio=0.5
#   alpha=10 => 0.0924007783214709  , ratio=0.5
#   alpha=15 => 0.08677372255100453 , ratio=0.5
#   alpha=20 => 0.08504492603835609 , ratio=0.5
#   alpha=21 => 0.08034563217845028 , ratio=0.4
#   alpha=22 => 0.08032991435000437 , ratio=0.3  (marked "the best")
#   alpha=23 => 0.09430418539961244 , ratio=0.2

In [None]:
# Training-split RMSE for ElasticNet, to compare against the test-split RMSE.
Y_predict_train = elastic_reg.predict(X_train_scale)
MSE = mean_squared_error(y_true=Y_train, y_pred=Y_predict_train)
rmse_train = np.sqrt(MSE)
print(rmse_train)  # author's note: the model doesn't suffer from overfitting or underfitting

In [None]:
# R^2 of the ElasticNet predictions on the test split.
print(r2_score(Y_test, Y_predict))

In [None]:
# R^2 of the ElasticNet predictions on the training split.
print(r2_score(Y_train, Y_predict_train))

# XGBoost

In [None]:
import xgboost as xgb

In [None]:
# Wrap the full feature frame and target in an XGBoost DMatrix.
# NOTE(review): data_dmatrix is not referenced again in the visible notebook;
# the XGBRegressor below is fitted on the scaled arrays instead.
data_dmatrix = xgb.DMatrix(data=X,label=Y)

In [None]:
# XGBoost regressor configuration.
# Earlier attempt, kept for reference: colsample_bytree=1, learning_rate=1,
# max_depth=13, alpha=10, n_estimators=17.
# 'reg:squarederror' is the current name of the squared-error objective; the
# old 'reg:linear' spelling used before is deprecated.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=1,
    learning_rate=0.9,
    max_depth=7,
    alpha=10,
    n_estimators=20,
)

In [None]:
# Train the XGBoost regressor on the scaled training split.
xg_reg.fit(X_train_scale,Y_train)

# Predict the scaled test split (overwrites the previous Y_predict).
Y_predict = xg_reg.predict(X_test_scale)

In [None]:
# R^2 of the XGBoost model on the held-out split. score() must be given the
# TRUE targets (Y_test): passing the model's own predictions compares them
# with themselves and always reports a perfect 100%.
score = xg_reg.score(X_test_scale, Y_test)
print("accuracy is : ",score*100,"%")

In [None]:
# Root-mean-squared error of the XGBoost predictions on the test split.
MSE = mean_squared_error(y_true=Y_test, y_pred=Y_predict)
rmse_test = np.sqrt(MSE)
print(rmse_test)

In [None]:
# Training-split RMSE for XGBoost, to compare against the test-split RMSE.
Y_predict_train = xg_reg.predict(X_train_scale)
MSE = mean_squared_error(y_true=Y_train, y_pred=Y_predict_train)
rmse_train = np.sqrt(MSE)
print(rmse_train)  # author's note: the model doesn't suffer from overfitting or underfitting

In [None]:
# R^2 of the XGBoost predictions on the test split.
print(r2_score(Y_test, Y_predict))

In [None]:
# R^2 of the XGBoost predictions on the training split.
print(r2_score(Y_train, Y_predict_train))

### . Plain linear regression is the normal regression that uses gradient descent with no regularizing term; it can also be used for univariate linear regression.
### . Ridge is linear regression that uses gradient descent with the regularizing term $\frac{1}{2}\sum_i \theta_i^2$.
### . Lasso is linear regression that uses gradient descent with the regularizing term equal to the $\ell_1$ norm of the weights, $\sum_i |\theta_i|$.
### . ElasticNet is linear regression that uses gradient descent with a regularizing term that is a mix of the Ridge and Lasso terms; a parameter (`l1_ratio`) controls the mix.
## . Ridge is a good default, but if we suspect that only a few features are useful, we should prefer Lasso or Elastic Net because they tend to reduce the useless features’ weights down to zero.