# Simple Machine Learning Code Tutorial for Beginners with Sklearn

https://www.youtube.com/watch?v=-IvNzmrcyUM

In [1]:
# Environment setup -- shell commands kept here for reference (this cell executes nothing):
# uv init ml_proj
# uv add scikit-learn
# uv run --with jupyter jupyter lab

In [2]:
from  sklearn import datasets 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
import joblib

In [3]:
# Fetch the California housing dataset through scikit-learn's dataset helpers.
housing = datasets.fetch_california_housing()

In [3]:
# Names of the feature columns.
print(housing.feature_names)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [4]:
# Feature matrix: one row per sample, one column per entry in housing.feature_names.
X = housing.data

In [5]:
# Regression target that the models below are fitted to predict.
y = housing.target

In [7]:
# Peek at the first sample: column names, its feature values, and its target.
for first_look in (housing.feature_names, X[0], y[0]):
    print(first_look)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
[   8.3252       41.            6.98412698    1.02380952  322.
    2.55555556   37.88       -122.23      ]
4.526


In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=432,
    test_size=0.2,
)

In [9]:
# Show the column names next to the first training row and its target value.
for shown in (housing.feature_names, X_train[0], y_train[0]):
    print(shown)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
[   2.1442       52.            3.94886364    1.03693182  921.
    2.61647727   37.34       -121.88      ]
1.889


In [10]:
# Baseline: linear regression fitted on the raw training features.
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows; they are scored in the next cell.
y_pred = model.predict(X_test)

In [11]:
# R^2 of the baseline predictions on the test split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

0.6080229586580346


Baseline: plain linear regression reaches an R² of about 0.61 on the test split.

In [12]:
# Expand the raw features with PolynomialFeatures (default settings).
# The result is stored under a new name so that X keeps the raw features and
# this cell prints the same shapes every time it is re-run; reassigning X here
# would expand the already-expanded matrix on a second run.
#
# NOTE: X_train / X_test were split from the raw X in an earlier cell, so the
# models fitted below do not see these polynomial columns unless X_poly is
# split again.
print(X.shape)
poly = PolynomialFeatures()
X_poly = poly.fit_transform(X)
print(X_poly.shape)

(20640, 8)
(20640, 45)


In [13]:
# Three candidate regressors to compare, each with default hyper-parameters
# apart from n_jobs on the random forest.
LR = LinearRegression()
GBR = GradientBoostingRegressor()
RFR = RandomForestRegressor(n_jobs=-1)
# n_jobs controls parallelism: -1 uses all cores, 1 uses a single core,
# 5 uses five cores.
# No random_state is set, so the random forest's score shifts slightly
# between runs (compare the two runs of the comparison loop below).

In [14]:
# Fit each candidate on the training split and report its test-set R^2.
for estimator in (LR, GBR, RFR):
    y_pred = estimator.fit(X_train, y_train).predict(X_test)
    r2 = r2_score(y_test, y_pred)
    print(estimator, r2)

LinearRegression() 0.6080229586580346
GradientBoostingRegressor() 0.7878126561044645
RandomForestRegressor(n_jobs=-1) 0.8117447228289669


In [15]:
# Second pass over the same candidates: refit each one and score it again.
for regressor in (LR, GBR, RFR):
    fitted = regressor.fit(X_train, y_train)
    y_pred = fitted.predict(X_test)
    r2 = r2_score(y_true=y_test, y_pred=y_pred)
    print(regressor, r2)

LinearRegression() 0.6080229586580346
GradientBoostingRegressor() 0.7878126561044645
RandomForestRegressor(n_jobs=-1) 0.8133952446545532


In [16]:
# HistGradientBoostingRegressor: the histogram-based gradient boosting estimator
# from sklearn.ensemble, tried here as an alternative to GradientBoostingRegressor.
# NOTE(review): HGBR is not referenced by the cells that follow; they build
# their own HistGradientBoostingRegressor instances.
HGBR = HistGradientBoostingRegressor()

In [17]:
# Sweep the number of boosting iterations from 100 to 500 in steps of 50
# and print the test-set R^2 for each setting.
for n_iter in range(100, 501, 50):
    model = HistGradientBoostingRegressor(max_iter=n_iter).fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    print(n_iter, r2)

100 0.8377250415185991
150 0.841552196622102
200 0.841840586516328
250 0.8423016652861712
300 0.8457792652269542
350 0.8398773661488558
400 0.8438909492019762
450 0.8429458747317552
500 0.839448972998493


### Hyperparameter tuning

In [18]:
# Small grid over learning rate (0.1, 0.05, 0.001) and max_iter (200 to 400 in steps of 50).
# Each printed line is: max_iter, learning_rate, test-set R^2.
for rate in (0.1, 0.05, 0.001):
    for n_iter in range(200, 401, 50):
        model = HistGradientBoostingRegressor(learning_rate=rate, max_iter=n_iter)
        y_pred = model.fit(X_train, y_train).predict(X_test)
        r2 = r2_score(y_test, y_pred)
        print(n_iter, rate, r2)

200 0.1 0.8425153560862981
250 0.1 0.8379465186565593
300 0.1 0.841476208300911
350 0.1 0.8458201153057721
400 0.1 0.8396594993331298
200 0.05 0.8368901167924542
250 0.05 0.8378268945766411
300 0.05 0.8404408352733663
350 0.05 0.8458422073017402
400 0.05 0.8432682449507686
200 0.001 0.2210989798675259
250 0.001 0.26111687873952794
300 0.001 0.30067698311225166
350 0.001 0.3358296999587095
400 0.001 0.3706033092326828


In [19]:
# Train the final model with the setting that scored best in the grid above
# (max_iter=350, learning_rate=0.05) and save it to disk.
model = HistGradientBoostingRegressor(learning_rate=0.05, max_iter=350).fit(X_train, y_train)
joblib.dump(model, "my_model.joblib")

# Score the in-memory model so the reloaded copy can be checked against it.
y_pred = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

0.8420068818219365


In [20]:
# Reload the saved model and score it on the test split; the result should
# match the score printed right after training.
# joblib files are pickle-based: only load files from a source you trust.
local_model = joblib.load("my_model.joblib")
y_pred = local_model.predict(X_test)

r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

0.8420068818219365
