In [1]:
import pickle

import pandas as pd
from sqlalchemy import create_engine

from utils import db_utils

In [2]:
# Obtain the two database handles used throughout the notebook: `conn` is handed to the
# db_utils helpers, `engine` to DataFrame.to_sql.
conn, engine = db_utils.connect()

***
### diabetes_diagnosis_resultsテーブル初期データ

In [43]:
# Remove any existing diabetes_diagnosis_results table so the setup can be re-run from scratch.
sql = "DROP TABLE IF EXISTS diabetes_diagnosis_results;"
db_utils.execute(conn, sql)

In [44]:
# Create diabetes_diagnosis_results: one row per record keyed by `id`, holding the eight
# measurement columns, the `outcome` value and a BOOL `is_trained` column.
# NOTE(review): MySQL stores FLOAT(10) as single precision, so values read back from this
# table carry fewer digits than the CSV source -- confirm that this is acceptable.
sql = '''
    create table if not exists diabetes_diagnosis_results(
        id INT(10),
        pregnancies INT(10),
        glucose INT(10),
        blood_pressure FLOAT(10),
        skin_thickness FLOAT(10),
        insulin FLOAT(10),
        bmi FLOAT(10),
        diabetes_pedigree_function FLOAT(10),
        age INT(10),
        outcome INT(10),
        is_trained BOOL,
        primary key (id)
    ) DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
'''
db_utils.execute(conn, sql)

In [3]:
# Column names applied to the CSV files loaded in this notebook.
columns = (
    'id',
    'pregnancies',
    'glucose',
    'blood_pressure',
    'skin_thickness',
    'insulin',
    'bmi',
    'diabetes_pedigree_function',
    'age',
    'outcome',
    'is_trained',
)
# Names are supplied explicitly and no header row is consumed: every line of the
# file is read as data.
df1 = pd.read_csv('./data/init_train_data.csv', names=columns, header=None)
print(df1.shape)
df1.head()

(1500, 11)


Unnamed: 0,id,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree_function,age,outcome,is_trained
0,200,9,125,74.0,,,28.53691,0.444902,45,1,True
1,3832,4,109,80.0,,,28.047673,0.238243,22,0,True
2,4927,4,88,78.0,39.0,,52.371341,0.279471,26,0,True
3,4088,9,125,74.0,,,40.062688,0.203922,45,0,True
4,3644,5,107,78.0,44.0,284.0,52.935068,0.284959,45,1,True


In [46]:
# Append the rows of df1 to diabetes_diagnosis_results; the DataFrame index is not written.
df1.to_sql('diabetes_diagnosis_results', con=engine, if_exists="append", index=False)

In [4]:
# Load the second CSV with the same explicit column names; no header row is consumed,
# so the first line of the file is read as data.
df2 = pd.read_csv('./data/init_input_data.csv', names=columns, header=None)
print(df2.shape)
df2.head()

(1000, 11)


Unnamed: 0,id,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree_function,age,outcome,is_trained
0,907,2,134,80.0,,,27.259514,0.249494,29,0,False
1,3679,3,111,64.0,19.0,,39.285922,0.24292,24,0,False
2,3167,2,126,82.0,17.0,,27.969615,0.882624,27,0,False
3,1628,4,95,62.0,,,33.606792,0.198839,23,1,False
4,102,4,125,70.0,,,39.882894,0.268787,36,1,False


In [48]:
# Append the rows of df2 to diabetes_diagnosis_results; the DataFrame index is not written.
df2.to_sql('diabetes_diagnosis_results', con=engine, if_exists="append", index=False)

In [49]:
# Verify: count the stored rows for each is_trained value.
sql = 'select is_trained, count(1) from diabetes_diagnosis_results group by is_trained;'
db_utils.fetch_all(conn, sql)

((0, 1000), (1, 1500))

In [50]:
# Load the is_trained = False rows into a DataFrame to verify them.
# pandas documents SQLAlchemy connectables (or sqlite3 connections) as the supported
# `con` values for read_sql, so the query goes through `engine` -- the same handle the
# to_sql calls use -- rather than `conn` (presumably a plain DBAPI connection).
df = pd.read_sql(sql="SELECT * FROM diabetes_diagnosis_results where is_trained = False;", con=engine)
df.head()

Unnamed: 0,id,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree_function,age,outcome,is_trained
0,7,3,112,82.0,,,39.4627,0.491202,25,0,0
1,16,0,91,80.0,,,39.2735,0.191756,21,0,0
2,19,3,100,60.0,20.0,,34.0854,0.441405,22,0,0
3,28,0,100,82.0,28.0,,33.3631,0.213466,26,0,0
4,35,2,96,64.0,20.0,,38.0198,0.273569,26,0,0


※ MySQLではBOOL型はTINYINT(1)の別名のため、Trueは1、Falseは0として格納される

***
### results_tempテーブル作成

In [36]:
# Remove any existing results_temp table so the setup can be re-run from scratch.
sql = "DROP TABLE IF EXISTS results_temp;"
db_utils.execute(conn, sql)

In [38]:
# Create results_temp: the measurement columns plus predict_result and
# predict_probability, keyed by `id`.
# NOTE(review): blood_pressure, skin_thickness and insulin are INT(10) here but FLOAT(10)
# in diabetes_diagnosis_results -- confirm that fractional values never need to be stored.
sql = """
    CREATE TABLE results_temp(
        id INT(10),
        pregnancies INT(10),
        glucose INT(10),
        blood_pressure INT(10),
        skin_thickness INT(10),
        insulin INT(10),
        bmi FLOAT(10),
        diabetes_pedigree_function FLOAT(10),
        age INT(10),
        predict_result INTEGER,
        predict_probability FLOAT(10),
        primary key (id)
    );
    """
db_utils.execute(conn, sql)

***
### predict_resultsテーブルに初期データを入れる

In [51]:
# Remove any existing predict_results table so the setup can be re-run from scratch.
sql = "DROP TABLE IF EXISTS predict_results;"
db_utils.execute(conn, sql)

In [52]:
# Create predict_results: one row per scored record, keyed by `id`.
# model_id is VARCHAR(32) because the model ids used in this notebook are 32-character
# hex strings (e.g. "7d28bf6a69a6398a187e5c2c47cfdcba"), which an integer column
# cannot hold.
sql = """
    CREATE TABLE predict_results(
        id INT(10),
        predict_result INT(10),
        predict_probability FLOAT(10),
        true_result INT(10),
        model_id VARCHAR(32),
        primary key (id)
    );
    """
db_utils.execute(conn, sql)

In [5]:
# Build the sample data for predict_results: score the rows of df2 with the pickled
# imputer and model stored under ./resources/deploy/model/.
imputer_id = "a959d262dfc95a584d17637673c23395"
model_id = "7d28bf6a69a6398a187e5c2c47cfdcba"
columns = ['id', 'pregnancies', 'glucose', 'blood_pressure', 'skin_thickness',
           'insulin', 'bmi', 'diabetes_pedigree_function', 'age', 'outcome']
df = df2[columns]

# Inputs are every selected column except the row id and the outcome; the outcome is the
# target. A non-mutating drop is used so that no in-place edit is applied to a slice of
# `df` (pandas may warn about that), and re-running the cell gives the same result.
test_X = df.drop(columns=["id", "outcome"])
test_y = df["outcome"]

# NOTE: pickle.load can execute arbitrary code -- only load artifacts from a trusted source.
# The `with` blocks close the files as soon as the objects are loaded.
with open(f"./resources/deploy/model/imputer_{imputer_id}.pkl", "rb") as imputer_file:
    imputer = pickle.load(imputer_file)
test_X = pd.DataFrame(imputer.transform(test_X), columns=test_X.columns)

with open(f"./resources/deploy/model/model_{model_id}.pkl", "rb") as model_file:
    model = pickle.load(model_file)
test_y_pred = model.predict(test_X)
# Keep column 1 of the predict_proba output as the stored probability.
test_y_proba = model.predict_proba(test_X)[:, 1]

In [6]:
# Display the frame selected for scoring (id, the measurement columns and outcome).
df

Unnamed: 0,id,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree_function,age,outcome
0,907,2,134,80.0,,,27.259514,0.249494,29,0
1,3679,3,111,64.0,19.0,,39.285922,0.242920,24,0
2,3167,2,126,82.0,17.0,,27.969615,0.882624,27,0
3,1628,4,95,62.0,,,33.606792,0.198839,23,1
4,102,4,125,70.0,,,39.882894,0.268787,36,1
...,...,...,...,...,...,...,...,...,...,...
995,753,2,112,76.0,,,40.265982,0.222552,24,0
996,1938,4,118,70.0,,,33.062790,0.299347,25,0
997,673,10,125,62.0,36.0,,39.083672,0.534258,21,1
998,2712,5,103,80.0,29.0,,33.848375,0.534160,28,0


In [7]:
# Assemble the rows for predict_results: start from the id column and attach the
# predicted label, the predicted probability, the true outcome and the model id.
predict_result_df = df[['id']].assign(
    predict_result=test_y_pred,
    predict_probability=test_y_proba,
    true_result=test_y,
    model_id=model_id,
)
predict_result_df.head()

Unnamed: 0,id,predict_result,predict_probability,true_result,model_id
0,907,0,0.077556,0,7d28bf6a69a6398a187e5c2c47cfdcba
1,3679,0,0.181595,0,7d28bf6a69a6398a187e5c2c47cfdcba
2,3167,0,0.120691,0,7d28bf6a69a6398a187e5c2c47cfdcba
3,1628,0,0.111972,1,7d28bf6a69a6398a187e5c2c47cfdcba
4,102,0,0.340563,1,7d28bf6a69a6398a187e5c2c47cfdcba


In [8]:
# Save the prediction rows as CSV, writing neither the index nor a header row.
predict_result_df.to_csv('data/init_predict_result_data.csv', index=False, header=False)

In [57]:
# Write the prediction rows to predict_results. if_exists='replace' drops any existing
# predict_results table and recreates it with column types inferred from the DataFrame,
# so a previously hand-written schema (including its primary key) does not survive this call.
predict_result_df.to_sql('predict_results', con=engine, if_exists='replace', index=False)

In [58]:
# Verify: fetch the first three stored prediction rows.
sql = 'select * from predict_results limit 3;'
db_utils.fetch_all(conn, sql)

((907, 0, 0.0775557461009495, 0, '7d28bf6a69a6398a187e5c2c47cfdcba'),
 (3679, 0, 0.18159507396420693, 0, '7d28bf6a69a6398a187e5c2c47cfdcba'),
 (3167, 0, 0.12069084796470375, 0, '7d28bf6a69a6398a187e5c2c47cfdcba'))