In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn import datasets

In [4]:
data = datasets.load_diabetes()
data

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [5]:
data.keys()

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])

In [6]:
for line in data['DESCR'].split('\n'):
    if len(line)!=0:
        print(line)

.. _diabetes_dataset:
Diabetes dataset
----------------
Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.
**Data Set Characteristics:**
:Number of Instances: 442
:Number of Attributes: First 10 columns are numeric predictive values
:Target: Column 11 is a quantitative measure of disease progression one year after baseline
:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level
Note: Each of these 10 feature variables have been mean c

In [7]:
x = data.data #independent variable

In [8]:
y = data['target'] #dependent variable

In [9]:
feature = data['feature_names']

In [10]:
data.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [11]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler((0,1))
x = scaler.fit_transform(x)
x

array([[0.66666667, 1.        , 0.58264463, ..., 0.28208745, 0.562217  ,
        0.43939394],
       [0.48333333, 0.        , 0.14876033, ..., 0.14104372, 0.22243673,
        0.16666667],
       [0.88333333, 1.        , 0.51652893, ..., 0.28208745, 0.49657763,
        0.40909091],
       ...,
       [0.68333333, 1.        , 0.28512397, ..., 0.24964739, 0.30503001,
        0.56060606],
       [0.28333333, 0.        , 0.49586777, ..., 0.39351199, 0.65702552,
        0.40909091],
       [0.28333333, 0.        , 0.0661157 , ..., 0.14104372, 0.46930394,
        0.51515152]])

In [12]:
df = pd.DataFrame(x,columns=feature)
df['output']= y
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,output
0,0.666667,1.0,0.582645,0.549296,0.294118,0.256972,0.207792,0.282087,0.562217,0.439394,151.0
1,0.483333,0.0,0.14876,0.352113,0.421569,0.306773,0.623377,0.141044,0.222437,0.166667,75.0
2,0.883333,1.0,0.516529,0.43662,0.289216,0.258964,0.246753,0.282087,0.496578,0.409091,141.0
3,0.083333,0.0,0.301653,0.309859,0.495098,0.447211,0.233766,0.423131,0.572923,0.469697,206.0
4,0.516667,0.0,0.206612,0.549296,0.465686,0.417331,0.38961,0.282087,0.362385,0.333333,135.0


In [13]:
df.tail(2)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,output
440,0.283333,0.0,0.495868,0.464789,0.509804,0.416335,0.25974,0.393512,0.657026,0.409091,220.0
441,0.283333,0.0,0.066116,0.126761,0.75,0.456175,0.974026,0.141044,0.469304,0.515152,57.0


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
 10  output  442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB


In [15]:
df.shape

(442, 11)

## Data preprocessing

In [16]:
#missing values
df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,output
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,0.491968,0.468326,0.346107,0.459817,0.451668,0.367725,0.360889,0.291996,0.48556,0.503942,152.133484
std,0.218484,0.499561,0.182567,0.194807,0.169647,0.15146,0.167977,0.18201,0.183366,0.174187,77.093005
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0
25%,0.320833,0.0,0.214876,0.309859,0.329657,0.271165,0.237013,0.141044,0.357542,0.382576,87.0
50%,0.516667,0.0,0.318182,0.43662,0.436275,0.355578,0.337662,0.282087,0.478062,0.5,140.5
75%,0.666667,1.0,0.465909,0.605634,0.552696,0.462649,0.464286,0.423131,0.610446,0.606061,211.5
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,346.0


In [17]:
df.columns

Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6',
       'output'],
      dtype='object')

In [18]:
q1 = np.quantile(df['age'],0.25)
q3 = np.quantile(df['age'],0.75)

iqr = q3-q1

lb = q1 -  iqr*1.5
ub = q3 +  iqr*1.5

outlair1 = np.where(df['age']<lb)
outlair2 = np.where(df['age']>ub)

In [19]:
for i in df.columns:
    q1 = np.quantile(df[i],0.25)
    q3 = np.quantile(df[i],0.75)

    iqr = q3-q1

    lb = q1 -  iqr*1.5
    ub = q3 +  iqr*1.5

    outlair = np.where(((df[i]<lb)|(df[i]>ub)))
    for index in outlair[0]:
        df[i].iloc[index]=None

df.head()


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df[i].iloc[index]=None
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because 

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,output
0,0.666667,1.0,0.582645,0.549296,0.294118,0.256972,0.207792,0.282087,0.562217,0.439394,151.0
1,0.483333,0.0,0.14876,0.352113,0.421569,0.306773,0.623377,0.141044,0.222437,0.166667,75.0
2,0.883333,1.0,0.516529,0.43662,0.289216,0.258964,0.246753,0.282087,0.496578,0.409091,141.0
3,0.083333,0.0,0.301653,0.309859,0.495098,0.447211,0.233766,0.423131,0.572923,0.469697,206.0
4,0.516667,0.0,0.206612,0.549296,0.465686,0.417331,0.38961,0.282087,0.362385,0.333333,135.0


In [20]:
df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,output
count,442.0,442.0,439.0,442.0,434.0,435.0,435.0,440.0,438.0,433.0,442.0
mean,0.491968,0.468326,0.342015,0.459817,0.442871,0.360435,0.351829,0.289037,0.480874,0.50056,152.133484
std,0.218484,0.499561,0.176257,0.194807,0.1581,0.140874,0.153022,0.176988,0.17748,0.161411,77.093005
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121212,25.0
25%,0.320833,0.0,0.21281,0.309859,0.328431,0.269422,0.233766,0.141044,0.357542,0.393939,87.0
50%,0.516667,0.0,0.318182,0.43662,0.436275,0.353586,0.337662,0.282087,0.472849,0.5,140.5
75%,0.666667,1.0,0.46281,0.605634,0.542892,0.452689,0.454545,0.423131,0.605672,0.606061,211.5
max,1.0,1.0,0.838843,1.0,0.877451,0.74004,0.792208,0.846262,0.986451,0.939394,346.0


In [21]:
df.dropna(inplace=True)

In [22]:
df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,output
count,411.0,411.0,411.0,411.0,411.0,411.0,411.0,411.0,411.0,411.0,411.0
mean,0.491809,0.472019,0.340442,0.45878,0.436942,0.360258,0.355342,0.283567,0.474301,0.497272,149.970803
std,0.221149,0.499825,0.174802,0.190778,0.156389,0.140514,0.15305,0.16917,0.175137,0.161952,75.772954
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121212,25.0
25%,0.333333,0.0,0.214876,0.323944,0.323529,0.270916,0.246753,0.141044,0.352627,0.378788,85.5
50%,0.516667,0.0,0.318182,0.43662,0.431373,0.355578,0.337662,0.282087,0.465759,0.5,139.0
75%,0.666667,1.0,0.454545,0.593873,0.534314,0.450697,0.454545,0.423131,0.594686,0.606061,202.0
max,1.0,1.0,0.838843,1.0,0.843137,0.74004,0.792208,0.846262,0.986451,0.939394,341.0


In [23]:
x = df.iloc[:,0:-1]
y = df.iloc[:,-1]
y.shape

(411,)

In [24]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler((0,1))

x = scaler.fit_transform(np.array(x))
y = scaler.fit_transform(np.array(y).reshape(-1,1))

In [25]:
df.loc[:,'bp']

0      0.549296
1      0.352113
2      0.436620
3      0.309859
4      0.549296
         ...   
436    0.253521
437    0.704225
438    0.183099
439    0.530563
440    0.464789
Name: bp, Length: 411, dtype: float64

Cross validation

In [36]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

In [37]:
from sklearn.model_selection import KFold,cross_val_score

vali = KFold(8)

result = cross_val_score(model,x,y,cv = vali)

In [39]:
result.mean()

0.4422671397321718

In [27]:
from sklearn.model_selection import train_test_split as tts

x_train,x_test,y_train,y_test = tts(x,y,train_size=0.8)

In [28]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(x_train,y_train)


In [29]:
model.coef_

array([[-0.0241494 , -0.07463856,  0.33146311,  0.28430756, -0.15743588,
        -0.00799602, -0.08804404,  0.19153545,  0.4098723 ,  0.02123294]])

In [30]:
y_test_pred = model.predict(x_test)
y_train_pred = model.predict(x_train)

In [31]:
x_train.shape[1]

10

In [32]:
from sklearn.metrics import r2_score as r2

#for calculating adjusted R2 score
def adj_r2(act,pred,r=x_train.shape[1]):

    return 1 - ((1-r2(act,pred))*(act.shape[0]-1))/(act.shape[0]-r-1)

print("Adj. R2 Score:",adj_r2(y_train,y_train_pred))

Adj. R2 Score: 0.5025608807722602


In [33]:
print("Adj. R2 Score:",adj_r2(y_test,y_test_pred))

Adj. R2 Score: 0.29083000717302476


In [34]:
# multiple linear regression model is not good