<a href="https://colab.research.google.com/github/ShohruhShokulov/mohirdev_datascience_AI/blob/main/Machine_learning_predicting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 4: Machine Learning

In [38]:
import numpy as np
import pandas as pd
import sklearn
URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv"
df = pd.read_csv(URL)

In [39]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)

X_train = train_set.drop("median_house_value", axis = 1)
y = train_set['median_house_value'].copy()

X_num = X_train.drop("ocean_proximity", axis = 1)

- Pipeline

In [40]:
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, households_ix = 3,4,5,6

class CombinedAttributesAdder(BaseEstimator,TransformerMixin):
  def __init__(self, add_badrooms_per_room = True):
    self.add_badrooms_per_room = add_badrooms_per_room
  def fit(self, X, y = None ):
    return self
  def transform(self, X):
    rooms_per_household = X[:, rooms_ix]/X[:, households_ix]
    population_per_household = X[:, population_ix]/ X[:, households_ix]
    if self.add_badrooms_per_room:
      bedrooms_per_room = X[:, bedrooms_ix]/X[:, rooms_ix]
      return np.c_[X,rooms_per_household,population_per_household, bedrooms_per_room]
    else:
      return np.c_[X,rooms_per_household,population_per_household]

- Pipeline for numerical columns

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('attribs_adder', CombinedAttributesAdder(add_badrooms_per_room= True)),
    ('std_scaler', StandardScaler())
])

- Pipeline for text columns

In [42]:
from sklearn.compose import ColumnTransformer

num_attribs = list(X_num)
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

In [43]:
X_prepaired = full_pipeline.fit_transform(X_train)

In [44]:
X_prepaired

array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813, ...,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.        ,
         0.        ,  0.        ],
       [-1.41489815,  0.99543676,  1.85617335, ...,  0.        ,
         1.        ,  0.        ]])

- Linear Regression

In [45]:
## Machine Learning (Linear Regression)
from sklearn.linear_model import LinearRegression

LR_model = LinearRegression()

In [46]:
LR_model.fit(X_prepaired, y)

In [47]:
test_data = X_train.sample(10)
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
4008,-118.62,34.15,26.0,5661.0,791.0,2493.0,780.0,7.9814,<1H OCEAN
13264,-117.66,34.1,26.0,1855.0,553.0,1109.0,536.0,2.2429,INLAND
9782,-121.02,36.24,12.0,2198.0,507.0,1971.0,502.0,2.6801,<1H OCEAN
19778,-122.18,39.93,35.0,1387.0,272.0,610.0,237.0,2.1759,INLAND
15339,-117.31,33.18,16.0,1835.0,430.0,599.0,399.0,2.0147,NEAR OCEAN
5404,-118.44,34.03,37.0,1193.0,205.0,488.0,224.0,3.625,<1H OCEAN
17939,-121.94,37.34,41.0,2151.0,473.0,1092.0,469.0,3.7321,<1H OCEAN
18638,-121.98,36.99,14.0,6787.0,1454.0,3416.0,1357.0,3.5943,NEAR OCEAN
11522,-118.08,33.72,14.0,2021.0,396.0,696.0,367.0,7.1673,NEAR OCEAN
614,-122.15,37.74,49.0,1494.0,316.0,611.0,288.0,2.2,NEAR BAY


In [48]:
test_label = y.loc[test_data.index]
test_label

4008     409900.0
13264    150000.0
9782     100000.0
19778     59500.0
15339     87700.0
5404     357600.0
17939    250000.0
18638    262400.0
11522    340700.0
614      187500.0
Name: median_house_value, dtype: float64

In [49]:
test_data_prepaired = full_pipeline.transform(test_data)
predicted_labels = LR_model.predict(test_data_prepaired)

In [50]:
predicted_labels

array([380206.50977237, 132753.97870915, 152852.29558179,  74452.37881868,
       154453.0071191 , 216613.57095854, 254809.01476234, 256024.45670191,
       357575.01899516, 186344.56495804])

In [52]:
pd.DataFrame({'Prediction': predicted_labels, 'Original_value': test_label})

Unnamed: 0,Prediction,Original_value
4008,380206.509772,409900.0
13264,132753.978709,150000.0
9782,152852.295582,100000.0
19778,74452.378819,59500.0
15339,154453.007119,87700.0
5404,216613.570959,357600.0
17939,254809.014762,250000.0
18638,256024.456702,262400.0
11522,357575.018995,340700.0
614,186344.564958,187500.0


# Step 5 (Testing Model)

In [53]:
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,INLAND


In [54]:
X_test = test_set.drop("median_house_value", axis = 1)
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,INLAND


In [55]:
y_test = test_set['median_house_value'].copy()
y_test

20046     47700.0
3024      45800.0
15663    500001.0
20484    218600.0
9814     278000.0
           ...   
15362    263300.0
16623    266800.0
18086    500001.0
2144      72300.0
3665     151500.0
Name: median_house_value, Length: 4128, dtype: float64

In [56]:
X_test_prepaired = full_pipeline.transform(X_test)

In [60]:
y_predicted = LR_model.predict (X_test_prepaired)

In [61]:
y_predicted

array([ 61874.25460143, 121853.52511139, 267770.94368091, ...,
       447837.04647878, 117275.9214608 , 185597.46125194])

In [62]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_predicted)
print("MAE = ",mae)

MAE =  50898.7395349408


In [63]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_predicted)
print("RMSE = ", np.sqrt(mse))

RMSE =  72701.32600762138


# Random Forest

In [64]:
from sklearn.ensemble import RandomForestRegressor
RF_model = RandomForestRegressor()
RF_model.fit(X_prepaired, y)

In [65]:
y_predicted = RF_model.predict(X_test_prepaired)

In [66]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_predicted)
print("RMSE = ", np.sqrt(mse))

RMSE =  50395.49737473833


# Cross-Validation

In [67]:
X = df.drop("median_house_value", axis = 1)
y = df['median_house_value'].copy()

X_prepaired = full_pipeline.transform(X)

In [75]:
from sklearn.model_selection import cross_val_score

mse_scores = cross_val_score(RF_model, X_prepaired, y, scoring = "neg_mean_squared_error", cv = 5)

In [73]:
def display_scores(scores):
  print("Scores: ", scores)
  print("Mean: ", scores.mean())
  print("Std.dev: ", scores.std())

In [76]:
display_scores(np.sqrt(-mse_scores))

Scores:  [76964.06944884 64356.4555575  61356.34790095 82063.69549454
 62142.93821639]
Mean:  69376.70132364318
Std.dev:  8489.808495234385


# Step 6 (Presenting Model)

- Pickle

In [77]:
## Pickle
import pickle

filename = 'RF_model.pkl'
with open(filename, 'wb') as file:
  pickle.dump(RF_model, file)

In [80]:
with open(filename, 'rb') as file:
  model = pickle.load(file)

- Joblib

In [81]:
import joblib

filename = 'LR_model.jlb'
joblib.dump(LR_model, filename)

['LR_model.jlb']

In [82]:
model = joblib.load(filename)

In [83]:
mse_scores = cross_val_score(model, X_prepaired, y, scoring = "neg_mean_squared_error", cv = 5)
display_scores(np.sqrt(-mse_scores))

Scores:  [73394.92502922 74814.24096819 75431.93119241 76608.78768825
 66196.48128669]
Mean:  73289.27323295092
Std.dev:  3694.713678722368
