In [53]:
import pandas as pd
import json
import sqlite3
import numpy as np
import math
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [54]:
#Get data table for training, this is the output of the pipeline, hence the name 'regression_table'
dbName = "../../Project1/data/data_new1.db"
tableName = "data_clean1"

dbConnection = sqlite3.connect(dbName)

# # #We doen een query en maken hier een pandas dataframe van
df = pd.read_sql_query(f"SELECT * FROM {tableName}", dbConnection)

# # #We sluiten de connectie
dbConnection.close()

# test = pd.read_sql_query(f".tables", dbConnection)

# print(test)
                         

In [55]:
#Selecting subset of complete dataframe containing only "relevant" variables:

df_updated = df[['genetic', 'exercise', 'smoking', 'alcohol', 'sugar', 'BMI', 'lifespan']]

In [56]:
#Making variations of the dataframe for comparison of different models:

#All parameters
v0 = ['genetic', 'exercise', 'smoking', 'alcohol', 'sugar', 'BMI']

#
v1 = ['genetic', 'exercise', 'smoking', 'BMI']

#
v2 = ['genetic', 'smoking']

#
v3 = ['genetic', 'exercise', 'smoking']

#
v4 = ['genetic', 'exercise', 'smoking', 'alcohol', 'sugar']

version_list = [v0,v1,v2,v3,v4]

In [39]:
print(version_list)

[['genetic', 'exercise', 'smoking', 'alcohol', 'sugar', 'BMI'], ['genetic', 'exercise', 'smoking', 'BMI'], ['genetic', 'smoking'], ['genetic', 'exercise', 'smoking'], ['genetic', 'exercise', 'smoking', 'alcohol', 'sugar']]


In [57]:
models = []

def train_model(dataframe, version):
    
    x = dataframe[version]
    y = dataframe.loc[:, 'lifespan']
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)
    
    model = LinearRegression()
    model.fit(x_train, y_train)
    
    predictions = model.predict(x_test)
    
    model_m_sqe = mean_squared_error(y_test, predictions)
    model_m_abse = mean_absolute_error(y_test, predictions)
    rmse = math.sqrt(model_m_sqe)
    r2 = r2_score(y_test, predictions)
    coefs = dict(list(zip(x.columns, model.coef_)))
    intercept = model.intercept_
    
    print(f'Mean squared error: {model_m_sqe}')
    print(f'Mean absolute error: {model_m_abse}')
    print(f'R-squared value: {r2}', '\n')
    print(f'Model coefficients: ')
    for c in coefs:
        print(f'{c[0]}: {c[1]}')
    print('\n')
    print(f'Model intercept: {intercept}')
    
    return {
            'model version': ', '.join(version),
            'mean squared error': model_m_sqe,
            'mean absolute error': model_m_abse,
            'r squared': r2,
            'root mean squared error': rmse,
            'coefficients': coefs,
            'intercept': intercept
    }


    
    
    

In [42]:
print(df_updated)

      genetic  exercise  smoking  alcohol  sugar   BMI  lifespan
0        73.9       0.9      0.0      2.4    6.9  29.1      73.1
1        86.0       1.8      8.1      0.4    4.2  35.6      85.0
2        83.3       1.1      0.8      4.6    7.5  36.0      81.6
3        82.8       4.7     11.8      1.0    2.9  41.4      81.0
4        78.7       1.5      8.3      4.9    5.5  22.6      75.0
...       ...       ...      ...      ...    ...   ...       ...
4079     80.3       3.7      0.1      5.6    5.8  37.4      80.7
4080     75.3       3.6      6.2      5.6    7.6  49.4      71.1
4081     93.1       2.7      9.4      4.9    6.6  32.3      90.0
4082     77.9       2.3     13.2      0.8    6.2  31.4      75.9
4083    101.3       1.2      8.2      6.0    6.2  36.6      96.9

[4084 rows x 7 columns]


In [48]:
#Train model for different datasets
for v in version_list:
    print(train_model(df_updated, v))
    
    models.append(       
        train_model(df_updated, v)
    )
    

Mean squared error: 1.1915421266947106
Mean absolute error: 0.8196017154751907
R-squared value: 0.9808404760702363 

Model coefficients: 
g: e
e: x
s: m
a: l
s: u
B: M


Model intercept: 1.3527339453648892
{'model version': 'genetic, exercise, smoking, alcohol, sugar, BMI', 'mean squared error': 1.1915421266947106, 'mean absolute error': 0.8196017154751907, 'r squared': 0.9808404760702363, 'root mean squared error': 1.0915778152265236, 'coefficients': {'genetic': 1.0023175161386837, 'exercise': 0.8159422239679683, 'smoking': -0.263259639397799, 'alcohol': -0.23536178137796426, 'sugar': -0.07330642550711564, 'BMI': -0.07230117820131232}, 'intercept': 1.3527339453648892}
Mean squared error: 1.1915421266947106
Mean absolute error: 0.8196017154751907
R-squared value: 0.9808404760702363 

Model coefficients: 
g: e
e: x
s: m
a: l
s: u
B: M


Model intercept: 1.3527339453648892
Mean squared error: 1.338032043951273
Mean absolute error: 0.8886669414760978
R-squared value: 0.9784849764095304 



In [67]:
#Printing all versions of our model
for d in models:
    print(f"Variables included in model: {d['model version']}.")
    print(f"Mean squared error of model: {d['mean squared error']}.")
    print(f"Mean absolute error of model: {d['mean absolute error']}.")
    print(f"R-squared of model: {d['r squared']}.")
    print(f"Root mean squared error: {d['root mean squared error']}.")
    print(f"Model coefficients: ")
    
coefs = dict(list(zip(x.columns, model.coef_)))    

for v in coefs:
        print (v[0], v[1])
print(f"Model intercept: {d['intercept']}")
print("\n")

Variables included in model: genetic, exercise, smoking, alcohol, sugar, BMI.
Mean squared error of model: 1.1915421266947106.
Mean absolute error of model: 0.8196017154751907.
R-squared of model: 0.9808404760702363.
Root mean squared error: 1.0915778152265236.
Model coefficients: 
Variables included in model: genetic, exercise, smoking, BMI.
Mean squared error of model: 1.338032043951273.
Mean absolute error of model: 0.8886669414760978.
R-squared of model: 0.9784849764095304.
Root mean squared error: 1.1567333504102286.
Model coefficients: 
Variables included in model: genetic, smoking.
Mean squared error of model: 2.633203189435365.
Mean absolute error of model: 1.2630220744385878.
R-squared of model: 0.9576591390353393.
Root mean squared error: 1.6227147591105977.
Model coefficients: 
Variables included in model: genetic, exercise, smoking.
Mean squared error of model: 1.8059692529649345.
Mean absolute error of model: 1.0264418601752425.
R-squared of model: 0.9709607320266701.
Root

NameError: name 'x' is not defined

In [44]:
#modellen exporteren naar pickle bestand (?)

with open('trained models.json', 'w') as file:
    json.dump(models, file)

In [47]:
#print(models.coef_)