In [23]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [24]:
# Load the diabetes dataset; the returned object exposes (at least) the
# 'data' feature matrix and the 'target' vector, as the output below shows.
# NOTE(review): echoing the whole object prints every array in full —
# consider displaying `data.keys()` instead to keep the output short.
data = load_diabetes()
data

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [25]:
# Names of the ten feature columns (age, sex, bmi, bp and the s1-s6 measurements).
data.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [26]:
# Inspect the target vector. The values are continuous (151., 75., 141., ...),
# so this is a regression target rather than a yes/no label.
data.target

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

In [27]:
# Build one frame holding the ten features plus the target as the last column.
# NOTE(review): the target values are continuous (e.g. 151.0, 75.0), so the
# column name "Diabetes Exist" is misleading — consider renaming it once all
# downstream references have been checked.
df = pd.DataFrame(
    data.data,
    columns=data.feature_names,
).assign(**{"Diabetes Exist": data.target})
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,Diabetes Exist
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [28]:
# Summarise dtypes and non-null counts to confirm every column is numeric
# and that no values are missing before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             442 non-null    float64
 1   sex             442 non-null    float64
 2   bmi             442 non-null    float64
 3   bp              442 non-null    float64
 4   s1              442 non-null    float64
 5   s2              442 non-null    float64
 6   s3              442 non-null    float64
 7   s4              442 non-null    float64
 8   s5              442 non-null    float64
 9   s6              442 non-null    float64
 10  Diabetes Exist  442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB


In [29]:
# Separate predictors from the response: every column except the last one
# is a feature, and the last column is the regression target.
x = df.loc[:, df.columns[:-1]]
y = df.loc[:, df.columns[-1]]


In [30]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the
# partition identical across re-runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=43,
)

In [31]:
# Fit a random forest of 1000 trees on the training split.
# random_state is pinned (same seed as the train/test split) because the
# forest draws bootstrap samples at random; without a seed every re-run
# produces different predictions and a different error score.
# n_estimators is passed by keyword so the meaning of 1000 is explicit.
rf_regressor = RandomForestRegressor(n_estimators=1000, random_state=43)
rf_regressor.fit(xtrain, ytrain)

In [32]:
# Predict the target for the held-out test features and display the result.
# NOTE(review): this echoes the full prediction array — `ypred[:10]` would
# keep the output readable.
ypred = rf_regressor.predict(xtest)
ypred

array([152.985,  94.278, 171.263, 255.982, 159.365,  95.91 , 230.946,
       129.994, 123.741, 228.45 ,  68.925, 237.328, 103.074,  81.345,
       127.212, 196.627, 198.639, 205.697, 212.192, 159.705, 204.762,
       115.624, 263.601, 244.343, 119.533,  82.695, 213.933,  90.178,
        86.541, 168.839, 212.244,  81.188, 151.096, 118.475, 160.754,
       130.018,  94.952, 115.654,  84.159, 104.2  , 142.443, 175.268,
       126.103, 166.012, 269.794, 116.889,  90.611, 264.772,  97.28 ,
       115.081, 226.464, 235.317, 180.35 ,  65.676,  94.252, 171.827,
       137.65 , 176.704, 271.372, 247.571, 213.755, 178.35 , 107.594,
       160.284, 246.423, 144.068, 123.23 ,  95.475,  73.621, 286.718,
        96.041, 210.493, 107.096, 155.095,  90.151, 128.273,  86.   ,
       222.288,  80.076, 232.082,  89.606, 163.074, 201.508, 188.083,
       169.234, 164.343, 185.756, 140.513, 115.057, 217.449, 141.124,
       165.677,  93.199, 111.497, 203.686, 170.07 ,  95.255, 152.086,
       231.545, 186.

In [33]:
# Mean squared error of the forest on the held-out split.
# scikit-learn metrics take (y_true, y_pred) in that order. MSE happens to be
# symmetric, so the value is unchanged, but keeping the documented order avoids
# a silent error if this is later swapped for an asymmetric metric (e.g. r2_score).
cost = mean_squared_error(ytest, ypred)
cost

3727.9452480225564