In [None]:
import pickle
import warnings

import pandas as pd
import psycopg2

from utils import ml_workflow, db

warnings.filterwarnings("ignore")

In [4]:
conn, engine = db.connect('dev')

In [None]:
conn.close()

In [5]:
# diabetes_diagnosis_resultsテーブル作成

In [6]:
sql = """
    DROP TABLE IF EXISTS diabetes_diagnosis_results;
    CREATE TABLE diabetes_diagnosis_results(
        index INTEGER,
        pregnancies INTEGER,
        glucose INTEGER,
        blood_pressure INTEGER,
        skin_thickness INTEGER,
        insulin INTEGER,
        bmi NUMERIC,
        diabetes_pedigree_function NUMERIC,
        age INTEGER,
        outcome INTEGER,
        is_trained BOOLEAN,
        primary key (index)
    );
    """
db.execute(conn, sql)

In [5]:
# diabetes_diagnosis_resultsテーブルにデータロード
table_name = 'diabetes_diagnosis_results'
file_path = './data/train1.csv'
columns = (
    'index', 'pregnancies', 'glucose', 'blood_pressure', 'skin_thickness', 
    'insulin', 'bmi', 'diabetes_pedigree_function', 'age', 'outcome'
)
src_df = pd.read_csv(file_path, names=columns)
src_df['is_trained'] = True

In [9]:
src_df.to_sql(table_name, con=engine, if_exists="append", index=False)

In [6]:
src_df2 = pd.read_csv('./data/train2.csv', names=columns)
src_df2['is_trained'] = False
src_df2.to_sql(table_name, con=engine, if_exists="append", index=False)

In [17]:
# diabetes_diagnosis_resultsテーブルの件数確認
sql = """
    select count(1) from diabetes_diagnosis_results;
"""
db.fetch_all(conn, sql)

[(2001,)]

In [16]:
# predict_resultsテーブル作成
sql = """
    DROP TABLE IF EXISTS predict_results;
    CREATE TABLE predict_results(
        index INTEGER,
        predict_result INTEGER,
        predict_probability NUMERIC,
        true_result INTEGER,
        model_id INTEGER,
        primary key (index)
    );
    """
db.execute(conn, sql)

In [7]:
# predict_resultにサンプルデータを挿入
id = '20220109065731'
df = pd.read_csv('./data/train2.csv', header=None)
columns = ['index', 'pregnancies', 'glucose', 'blood_pressure', 'skin_thickness',
           'insulin', 'bmi', 'diabetes_pedigree_function', 'age', 'outcome']
df.columns = columns
test_X, test_y = df.iloc[:, :-1], df.iloc[:, -1]
test_X.drop("index", axis=1, inplace=True)
test_X = ml_workflow.transrate_abnormal_value(test_X)
imputer = pickle.load(open(f"./deploy/{id}/models/imputer.pkl", "rb"))
test_X = pd.DataFrame(imputer.transform(test_X), columns=test_X.columns)
model = pickle.load(open(f"./deploy/{id}/models/mlmodel.pkl", "rb"))
test_y_pred = model.predict(test_X)
test_y_proba = model.predict_proba(test_X)[:, 1]

In [8]:
predict_result_df = pd.DataFrame()
predict_result_df['index'] = df['index']
predict_result_df['predict_result'] = test_y_pred
predict_result_df['predict_probability'] = test_y_proba
predict_result_df['true_result'] = test_y
predict_result_df['model_id'] = id
predict_result_df.head()

Unnamed: 0,index,predict_result,predict_probability,true_result,model_id
0,2181,1,0.51774,1,20220109065731
1,1035,0,0.188494,0,20220109065731
2,3537,0,0.178858,0,20220109065731
3,602,0,0.293649,0,20220109065731
4,3626,0,0.290876,1,20220109065731


In [None]:
predict_result_df.to_sql('predict_results', con=engine, if_exists='replace', index=False)

In [18]:
# 確認
sql = """
    select count(1) from predict_results;
"""
db.fetch_all(conn, sql)

[(1001,)]

In [8]:
# results_tempテーブル作成
sql = """
    DROP TABLE IF EXISTS results_temp;
    CREATE TABLE results_temp(
        index INTEGER,
        pregnancies INTEGER,
        glucose INTEGER,
        blood_pressure INTEGER,
        skin_thickness INTEGER,
        insulin INTEGER,
        bmi NUMERIC,
        diabetes_pedigree_function NUMERIC,
        age INTEGER,
        predict_result INTEGER,
        predict_probability NUMERIC,
        primary key (index)
    );
    """
db.execute(conn, sql)

In [4]:
# 確認
sql = """
    select count(1) from results_temp;
"""
db.fetch_all(conn, sql)

[(0,)]

In [23]:
conn.close()

In [9]:
temp_df = pd.read_sql(
            sql=f"SELECT * FROM results_temp;", con=conn
        )

In [10]:
temp_df

Unnamed: 0,index,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,predict_result,predict_probability
0,1,1,1,1,1,1,1.0,1.0,1,1,1.0
1,2,3,4,5,6,7,8.0,9.0,10,11,12.0
2,11,12,13,14,15,16,17.0,18.0,19,1,0.999885


In [8]:
input_dict = {
    "subject_id": [2],
    "pregnancies": [3],
    "glucose": [4],
    "bloodpressure": [5],
    "skinthickness": [6],
    "insulin": [7],
    "bmi": [8],
    "diabetespedigreefunction": [9],
    "age": [10],
}
input_df = pd.DataFrame(input_dict)
input_df["predict_result"] = 11
input_df["predict_probability"] = 12

In [9]:
input_df

Unnamed: 0,subject_id,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,predict_result,predict_probability
0,2,3,4,5,6,7,8,9,10,11,12


In [12]:
input_df.loc[0, 'subject_id'] = 8

In [13]:
input_df

Unnamed: 0,subject_id,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,predict_result,predict_probability
0,8,3,4,5,6,7,8,9,10,11,12


In [10]:
input_df.to_sql("results_temp", con=engine, if_exists="append", index=False)

ProgrammingError: (psycopg2.ProgrammingError) column "subject_id" of relation "results_temp" does not exist
LINE 1: INSERT INTO results_temp (subject_id, pregnancies, glucose, ...
                                  ^

[SQL: INSERT INTO results_temp (subject_id, pregnancies, glucose, bloodpressure, skinthickness, insulin, bmi, diabetespedigreefunction, age, predict_result, predict_probability) VALUES (%(subject_id)s, %(pregnancies)s, %(glucose)s, %(bloodpressure)s, %(skinthickness)s, %(insulin)s, %(bmi)s, %(diabetespedigreefunction)s, %(age)s, %(predict_result)s, %(predict_probability)s)]
[parameters: {'subject_id': 2, 'pregnancies': 3, 'glucose': 4, 'bloodpressure': 5, 'skinthickness': 6, 'insulin': 7, 'bmi': 8, 'diabetespedigreefunction': 9, 'age': 10, 'predict_result': 11, 'predict_probability': 12}]
(Background on this error at: http://sqlalche.me/e/f405)