In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [4]:
columns = 'age sex bmi map tc ldl hdl tch ltg glu'.split() 
diabetes = datasets.load_diabetes()
X = pd.DataFrame(diabetes.data, columns=columns) 
y = diabetes.target 

**Task #1**: cross_val_predict without sklearn

In [6]:
lr = linear_model.LinearRegression()
y_all = []
kf = KFold(n_splits=2)
kf.get_n_splits(X)
i=1
for train_index, test_index in kf.split(X):
    #print('TRAIN:', train_index, 'TEST:', test_index)
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = y[train_index], y[test_index]
    lr.fit(X_train,y_train)
    y_pred = lr.predict(X_test)
    y_all.append(y_pred)
    i=i+1
#print(y_all)

**Task #2**: cross validation without sklearn

In [7]:
def cros_split(X,k):
    k=2
    folds = np.array_split(X,k)
    for i in range(k):
        train = folds.copy()
        test = folds[i]
        del train[i]
        train = pd.concat(train,sort=False)
    return train.index.values, test.index.values

train_index = cros_split(X,2)[0]
test_index = cros_split(X,2)[1]
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y[train_index], y[test_index]
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
y_all.append(y_pred)

#print(y_all)

**Task #3**: using scalers and checking the regression

In [8]:
df = pd.read_csv('Life Expectancy Data.csv')
df.head(2)

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0


In [9]:
start = df.size
print('before dropping null values:',start)
df.isnull().sum().sum()
df.dropna(inplace=True)
end = df.size
print('after dropping null values',end)

before dropping null values: 64636
after dropping null values 36278


In [14]:
def training_model(X_train, X_test, y_train, y_test):
    lr = linear_model.LinearRegression()
    model = lr.fit(X_train, y_train)
    predictions = lr.predict(X_test)
    
    print('Model score:',model.score(X_test,y_test))
    print ('Test RMSE:', np.sqrt(mean_squared_error(predictions, y_test)))
    print ('Train RMSE:', np.sqrt(mean_squared_error(lr.predict(X_train), y_train)))

In [15]:
y = df['Life expectancy ']
X = df.drop(['Country','Year','Status','Life expectancy '],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

training_model(X_train, X_test, y_train, y_test)

Model score: 0.840768623575649
Test RMSE: 3.599370345674451
Train RMSE: 3.57547700329713


In [16]:
mmscaler = MinMaxScaler().fit(X_train)
X_train_scaled = mmscaler.transform(X_train)
X_test_scaled = mmscaler.transform(X_test)

training_model(X_train_scaled, X_test_scaled, y_train, y_test)

Model score: 0.8407686235759935
Test RMSE: 3.599370345670557
Train RMSE: 3.575477003297131


In [17]:
sscaler = StandardScaler().fit(X_train)
X_train_sscaled = sscaler.transform(X_train)
X_test_sscaled = sscaler.transform(X_test)

training_model(X_train_sscaled, X_test_sscaled, y_train, y_test)

Model score: 0.8407686235759935
Test RMSE: 3.599370345670558
Train RMSE: 3.57547700329713


Applying get_dummies

In [19]:
df[['Developed','Developing']] = pd.get_dummies(df[['Status']])

In [20]:
df.drop('Status',inplace=True,axis=1)

In [21]:
df.head(2)

Unnamed: 0,Country,Year,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,...,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling,Developed,Developing
0,Afghanistan,2015,65.0,263.0,62,0.01,71.279624,65.0,1154,19.1,...,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1,0,1
1,Afghanistan,2014,59.9,271.0,64,0.01,73.523582,62.0,492,18.6,...,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,0,1


In [22]:
y = df['Life expectancy ']
X = df.drop(['Country','Year','Life expectancy '],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

training_model(X_train_sscaled, X_test_sscaled, y_train, y_test)

Model score: -0.024750997246964612
Test RMSE: 8.875502053773126
Train RMSE: 8.756076470131374
