# Machine Learning


Data is ready for ML

In [None]:
import pandas as pd
import numpy as np
import sklearn 

### Reading the data

In [None]:
# Raw CSV of the housing dataset, downloaded directly from GitHub
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(filepath_or_buffer=URL)

### Dividing data into **test** and **train**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Features and target of the training split
y = train_set["median_house_value"].copy()
X_train = train_set.drop(columns="median_house_value")

# Numeric-only view of the features (everything except the string column)
X_num = X_train.drop(columns="ocean_proximity")

## Making Pipeline

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
# Column positions inside the numeric feature array. With the column order of
# X_num these are total_rooms, total_bedrooms, population and households.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D numeric array.

    Always adds rooms-per-household and population-per-household; adds
    bedrooms-per-room as well when ``add_bedrooms_per_room`` is true.
    Columns are located through the module-level ``*_ix`` indices.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` must support ``X[:, i]`` indexing (e.g. a NumPy array).
        """
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            # optional third ratio
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # np.c_ stacks the original array and the new 1-D columns side by side
        return np.c_[tuple([X] + extra_columns)]

#### Pipeline for numeric columns

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric preprocessing: fill missing values with the column median,
# append the ratio features, then standardise every column
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder(add_bedrooms_per_room=True)),
    ('std_scaler', StandardScaler()),
])

#### Full pipeline: numeric and categorical (string) columns

In [None]:
from sklearn.compose import ColumnTransformer

# Column names handled by each branch of the transformer
cat_attribs = ['ocean_proximity']
num_attribs = list(X_num)  # iterating a DataFrame yields its column names

# Route numeric columns through num_pipeline and one-hot encode the string column
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
])

The combined transformer is stored in `full_pipeline`.

Call its `.fit_transform()` method to fit it on the training features and transform them in one step.

In [None]:
# Fit the preprocessing on the training features and transform them in one call
X_prepared = full_pipeline.fit_transform(X_train)

In [None]:
# Peek at the first five prepared rows
X_prepared[0:5,:]

array([[ 1.27258656, -1.3728112 ,  0.34849025,  0.22256942,  0.21122752,
         0.76827628,  0.32290591, -0.326196  , -0.17491646,  0.05137609,
        -0.2117846 ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.70916212, -0.87669601,  1.61811813,  0.34029326,  0.59309419,
        -0.09890135,  0.6720272 , -0.03584338, -0.40283542, -0.11736222,
         0.34218528,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.44760309, -0.46014647, -1.95271028, -0.34259695, -0.49522582,
        -0.44981806, -0.43046109,  0.14470145,  0.08821601, -0.03227969,
        -0.66165785,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 1.23269811, -1.38217186,  0.58654547, -0.56148971, -0.40930582,
        -0.00743434, -0.38058662, -1.01786438, -0.60001532,  0.07750687,
         0.78303162,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.10855122,  0.5320839 ,  1

Data is ready for ML

### Machine Learning

We will try several scikit-learn algorithms.

#### Linear Regression 
Creating a new model using `LinearRegression` in `sklearn`

In [None]:
from sklearn.linear_model import LinearRegression

# Unfitted linear regression estimator with default settings
LR_model = LinearRegression()

In [None]:
# Train on the prepared training features and their target values
LR_model.fit(X_prepared, y)

LinearRegression()

**That's it!** The model is trained.

Let's check

In [None]:
# Draw 5 training rows at random. The fixed seed keeps the sample, and the
# outputs of the cells that depend on it, identical on every run.
test_data = X_train.sample(5, random_state=42)
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
6211,-117.89,34.07,35.0,834.0,137.0,392.0,123.0,4.5179,<1H OCEAN
14295,-117.13,32.72,52.0,1560.0,307.0,757.0,315.0,2.7083,NEAR OCEAN
12320,-116.51,33.89,21.0,1284.0,306.0,537.0,233.0,1.95,INLAND
19065,-122.46,38.29,21.0,2423.0,560.0,1098.0,503.0,2.364,NEAR BAY
16727,-120.76,35.52,7.0,9613.0,1666.0,4487.0,1653.0,3.6667,<1H OCEAN


In [None]:
# True target values for the sampled rows, looked up by their index labels
test_label = y.loc[test_data.index]
test_label

6211     218800.0
14295    199100.0
12320     61000.0
19065    173300.0
16727    250600.0
Name: median_house_value, dtype: float64

In [None]:
# transform only (no refit): reuse the preprocessing already fitted on X_train
test_data_prepared = full_pipeline.transform(test_data)
test_data_prepared

array([[ 0.84378566, -0.73628606,  0.50719373, -0.83142678, -0.95823916,
        -0.90979159, -0.98958015,  0.33459385,  0.56350326,  0.00777575,
        -0.83784003,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.22272599, -1.36813087,  1.85617335, -0.49756934, -0.55250582,
        -0.58877756, -0.4855855 , -0.6157084 , -0.20225927, -0.0599208 ,
        -0.27698044,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 1.53186153, -0.82053203, -0.60373066, -0.62449035, -0.55489249,
        -0.78226547, -0.70083322, -1.01392579,  0.03162339, -0.06842401,
         0.43900995,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [-1.43484237,  1.23881402, -0.60373066, -0.10071126,  0.05132085,
        -0.2888713 ,  0.00790925, -0.7965158 , -0.25892721, -0.0789452 ,
         0.31487031,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [-0.58721269, -0.05763792, -1

Predict:

In [None]:
# Linear-regression predictions for the five sampled rows
predicted_data = LR_model.predict(test_data_prepared)
predicted_data

array([228852.74349408, 203387.28874123,  64236.23414498, 168843.70532715,
       241650.14389617])

In [None]:
# Predictions next to the true values; the frame takes its index from test_label
pd.DataFrame({
    'Prognoz': predicted_data,
    'Real baxosi': test_label,
})

Unnamed: 0,Prognoz,Real baxosi
6211,228852.743494,218800.0
14295,203387.288741,199100.0
12320,64236.234145,61000.0
19065,168843.705327,173300.0
16727,241650.143896,250600.0


### Step 5. Model Test

In [None]:
# Show the held-out test split (features and target together)
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,INLAND


In [None]:
# Test features: every column except the target
X_test = test_set.drop(columns='median_house_value')
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,INLAND


In [None]:
# Test target, copied so later edits cannot touch test_set
y_test = test_set['median_house_value'].copy()
y_test

20046     47700.0
3024      45800.0
15663    500001.0
20484    218600.0
9814     278000.0
           ...   
15362    263300.0
16623    266800.0
18086    500001.0
2144      72300.0
3665     151500.0
Name: median_house_value, Length: 4128, dtype: float64

In [None]:
# transform only (no refit): apply the preprocessing fitted on the training data
X_test_prepared = full_pipeline.transform(X_test)

Predicting

In [None]:
# Linear-regression predictions for the whole test set
y_predicted = LR_model.predict(X_test_prepared)

In [None]:
from sklearn.metrics import mean_squared_error
# Test-set error of the linear model: compute the MSE, then take its root (RMSE)
lin_mse = mean_squared_error(y_true=y_test, y_pred=y_predicted)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

72701.32600762139



### DecisionTree

In [None]:
from sklearn.tree import DecisionTreeRegressor
# A fixed random_state makes the fitted tree, and every score derived from it,
# reproducible across runs
Tree_model = DecisionTreeRegressor(random_state=42)
Tree_model.fit(X_prepared, y)

DecisionTreeRegressor()

Model Test

In [None]:
# Decision-tree predictions for the test set (rebinds y_predicted)
y_predicted = Tree_model.predict(X_test_prepared)

In [None]:
# Test-set RMSE of the decision tree. Stored under its own names so the
# linear model's lin_mse / lin_rmse are not overwritten.
tree_mse = mean_squared_error(y_test, y_predicted)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)

72562.00852508846


No big difference

### RandomForest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# A fixed random_state makes the fitted forest, and every score derived from it,
# reproducible across runs
RF_model = RandomForestRegressor(random_state=42)
RF_model.fit(X_prepared, y)

RandomForestRegressor()

Model Test

In [None]:
# Test-set RMSE of the random forest. Stored under its own names so the
# linear model's lin_mse / lin_rmse are not overwritten.
y_predicted = RF_model.predict(X_test_prepared)
forest_mse = mean_squared_error(y_test, y_predicted)
forest_rmse = np.sqrt(forest_mse)
print(forest_rmse)

50331.07889872079


Better than before

## Cross-Validation 



In [None]:
# NOTE(review): from this cell on, `y` and `X_prepared` hold the FULL dataset
# (train + test rows) and replace the training-only versions created earlier.
# Re-running an earlier fit cell after this one would train on all rows.
# NOTE(review): full_pipeline is only applied here, not refit; it was fitted
# on X_train.
X = df.drop(columns="median_house_value")
y = df["median_house_value"].copy()

X_prepared = full_pipeline.transform(X)

In [None]:
def display_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    ``scores`` must provide ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    """
    print("Scores:", scores)
    # Each summary statistic is computed just before its line is printed
    for label, stat in (("Mean:", "mean"), ("Std.dev:", "std")):
        print(label, getattr(scores, stat)())

#### Cross-validation

In [None]:
from sklearn.model_selection import cross_val_score

#### Linear Regression

In [None]:
# 10-fold cross-validation of the linear model. The scorer returns negated MSE,
# so flip the sign before taking the root to get per-fold RMSE.
scores = cross_val_score(
    LR_model,
    X_prepared,
    y,
    scoring="neg_mean_squared_error",
    cv=10,
)
LR_rmse_scores = np.sqrt(-scores)

In [None]:
# Per-fold RMSE of the linear model with its mean and standard deviation
display_scores(LR_rmse_scores)

Scores: [73394.92502922 74814.24096819 75431.93119241 76608.78768825
 66196.48128669]
Mean: 73289.27323295096
Std.dev: 3694.713678722354


#### Decision Tree

In [None]:
# 10-fold cross-validation of the decision tree. The scorer returns negated MSE,
# so flip the sign before taking the root. The result gets its own name so the
# linear model's LR_rmse_scores is not overwritten.
scores = cross_val_score(Tree_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
display_scores(tree_rmse_scores)

Scores: [117426.24650262  71236.93347566  84784.96821511  74123.27730087
  89750.15887077  76806.63059624  69589.05516706 102133.30525079
  95246.44610184  75049.59397392]
Mean: 85614.66154548783
Std.dev: 14765.492299059766


#### Random Forest

In [None]:
# 10-fold cross-validation of the random forest. The scorer returns negated MSE,
# so flip the sign before taking the root. The result gets its own name so the
# linear model's LR_rmse_scores is not overwritten.
scores = cross_val_score(RF_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)
display_scores(forest_rmse_scores)

Scores: [96963.10192753 47862.65729636 64981.93435835 56938.85521984
 60802.46090989 60535.50319764 46989.4400147  79484.26918866
 74040.3414757  49323.01936423]
Mean: 63792.158295289555
Std.dev: 15049.727081731291


## Saving the Model

To save a model we can use `pickle` or `joblib` in Python.


### `pickle` 

In [None]:
import pickle

filename = 'RF_model.pkl' # name the file
# Binary write mode is required for pickle; the with-block closes the file
with open(filename, 'wb') as file:
    pickle.dump(RF_model, file)

In [None]:
# Read back the file written in the previous cell.
# SECURITY: pickle.load can execute arbitrary code; only unpickle trusted files.
with open(filename, 'rb') as file:
    model = pickle.load(file)

In [None]:
# 5-fold cross-validation with the estimator loaded from the pickle file.
# NOTE(review): cross_val_score fits fresh clones of the estimator on each fold,
# so this shows the loaded object is a usable estimator but does not check that
# its fitted state survived the round trip; compare model.predict(...) with
# RF_model.predict(...) for that.
# The result gets its own name so the linear model's LR_rmse_scores is not overwritten.
scores = cross_val_score(model, X_prepared, y, scoring="neg_mean_squared_error", cv=5)
reloaded_rmse_scores = np.sqrt(-scores)
display_scores(reloaded_rmse_scores)

Scores: [76978.93742397 64329.3404101  61109.46641995 81775.30419816
 62027.72651085]
Mean: 69244.15499260693
Std.dev: 8476.579030000677


### `joblib` 


In [None]:
import joblib

filename = 'RF_model.jbl' # name the file
# joblib.dump takes the target path directly, so no explicit open() is needed
joblib.dump(RF_model, filename)

['RF_model.jbl']

In [None]:
# Load the forest back from the joblib file (rebinds `model`)
model = joblib.load(filename)

In [None]:
# 5-fold cross-validation with the estimator loaded from the joblib file.
# NOTE(review): cross_val_score fits fresh clones of the estimator on each fold,
# so this shows the loaded object is a usable estimator but does not check that
# its fitted state survived the round trip; compare model.predict(...) with
# RF_model.predict(...) for that.
# The result gets its own name so the linear model's LR_rmse_scores is not overwritten.
scores = cross_val_score(model, X_prepared, y, scoring="neg_mean_squared_error", cv=5)
reloaded_rmse_scores = np.sqrt(-scores)
display_scores(reloaded_rmse_scores)

Scores: [77460.56831558 63838.3153173  61358.53227924 80371.60644565
 62380.21986169]
Mean: 69081.84844389423
Std.dev: 8120.562299609714


Saving `pipeline` 

In [None]:
# Persist the fitted preprocessing so new raw data can be transformed the same way.
# NOTE(review): loading this file needs CombinedAttributesAdder to be importable or
# defined in the loading session. TODO confirm where the pipeline will be loaded.
filename = 'pipeline.jbl'
joblib.dump(full_pipeline, filename)

['pipeline.jbl']