In [None]:
import datetime
import os

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Load variables from .env into the process environment
load_dotenv()

# Read the source-database connection settings from the environment
user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')
database = 'fininsurance'

# Build the connection URL from its components instead of interpolating them into a
# string: URL.create escapes characters such as '@', ':' or '/' in the credentials,
# which would otherwise corrupt the URL.
source_url = URL.create(
    'mariadb+mariadbconnector',
    username=user,
    password=password,
    host=host,
    port=int(port) if port else None,  # URL.create expects an integer port (or None)
    database=database,
)
engine = create_engine(source_url)

# SQL query: the all-zero registration timestamp is swapped for a fixed placeholder
# date, and the listed user_login accounts are excluded.
query = """
SELECT cuscode, name,
       CASE 
       WHEN user_registered = '0000-00-00 00:00:00.000' THEN '2000-01-01 00:00:00'
         ELSE user_registered 
       END AS user_registered,
       idcard, card_ins,card_ins_id,card_ins_type,card_ins_life,card_ins_type_life,file_card_ins_life,card_ins_start,card_ins_exp,
       card_ins_life_exp,is_move_card_ins
FROM wp_users WHERE user_login NOT IN ('FINTEST-01', 'FIN-TestApp', 'Admin-VIF', 'adminmag_fin', 'FNG00-00001')


"""


# Load the query result from the database
df_merged = pd.read_sql(query, engine)

# Go through str first so pandas parses the column uniformly; unparsable values become NaT
df_merged['user_registered'] = pd.to_datetime(df_merged['user_registered'].astype(str), errors='coerce')
df_merged


In [None]:
# Drop user_registered. errors='ignore' keeps this cell re-runnable: without it a second
# execution raises KeyError because the column is already gone.
df_merged = df_merged.drop(columns=['user_registered'], errors='ignore')

In [None]:
# List the column names of df_merged, one per line
for column_name in df_merged.columns.tolist():
    print(column_name)


In [None]:
# Source column -> target column. Only cuscode, name and idcard actually change;
# every other entry maps a column to itself.
rename_columns = dict(
    cuscode="agent_id",
    name="agent_name",
    idcard="id_card",
    card_ins="card_ins",
    card_ins_id="card_ins_id",
    card_ins_type="card_ins_type",
    card_ins_life="card_ins_life",
    card_ins_type_life="card_ins_type_life",
    file_card_ins_life="file_card_ins_life",
    card_ins_start="card_ins_start",
    card_ins_exp="card_ins_exp",
    card_ins_life_exp="card_ins_life_exp",
    is_move_card_ins="is_move_card_ins",
)

# rename returns a new frame; df_merged keeps its original column names
df = df_merged.rename(columns=rename_columns)
df

In [None]:
# List the column names of df after the rename, one per line
for column_name in df.columns.tolist():
    print(column_name)

In [None]:
# Show the distinct raw card_ins_exp values before any conversion
raw_expiry_values = df['card_ins_exp']
raw_expiry_values.unique()


In [None]:
import pandas as pd

def convert_thai_to_ad(date_str):
    """Convert a ``DD-MM-YYYY`` date string to an ISO ``YYYY-MM-DD`` string.

    A year greater than 2500 is treated as a Thai Buddhist Era year and reduced
    by 543; smaller years are kept unchanged.

    A string that is already year-first (four-character first field, at most
    two-character last field) is also accepted, so applying the function to its
    own output returns the same value instead of a scrambled date.

    Args:
        date_str: The raw cell value. Anything that is not a ``str`` (NaN, None,
            ``pd.NA``, numbers, timestamps) is treated as missing.

    Returns:
        The date as ``'YYYY-MM-DD'``, or ``None`` when the value is missing,
        empty, not made of three integer fields, or not a real calendar date.
    """
    # Non-string values carry no text to parse.
    if not isinstance(date_str, str):
        return None

    fields = [field.strip() for field in date_str.strip().split('-')]
    if len(fields) != 3:
        return None

    try:
        first, month, last = (int(field) for field in fields)
    except ValueError:
        # Empty or non-numeric field.
        return None

    # A four-character leading field cannot be a day, so read the value as year-first.
    if len(fields[0]) == 4 and len(fields[2]) <= 2:
        year, day = first, last
    else:
        day, year = first, last

    if year > 2500:
        year -= 543

    try:
        # datetime.date rejects impossible dates (e.g. 31 February, month 13).
        return datetime.date(year, month, day).isoformat()
    except ValueError:
        return None

# Run every card_ins_start value through convert_thai_to_ad
raw_start_values = df['card_ins_start']
df['card_ins_start'] = raw_start_values.apply(convert_thai_to_ad)

In [None]:
# Parse the converted strings as datetimes; values that cannot be parsed become NaT
parsed_start = pd.to_datetime(df['card_ins_start'], errors='coerce')
df['card_ins_start'] = parsed_start


In [None]:
import pandas as pd

# convert_thai_to_ad is defined once, in the card_ins_start cell earlier in this
# notebook; it is reused here rather than redefined.

# Run every card_ins_exp value through convert_thai_to_ad
df['card_ins_exp'] = df['card_ins_exp'].apply(convert_thai_to_ad)

# Parse the converted strings as datetimes; values that cannot be parsed become NaT
df['card_ins_exp'] = pd.to_datetime(df['card_ins_exp'], errors='coerce')


In [None]:
def _type_life_code(value):
    """Return 'B' for a string containing 'แทน'; return any other value unchanged."""
    if isinstance(value, str) and 'แทน' in value:
        return 'B'
    return value


df['card_ins_type_life'] = df['card_ins_type_life'].apply(_type_life_code)


In [None]:
# Turn empty / whitespace-only cells into missing values
blank_only = r'^\s*$'
df = df.replace(blank_only, pd.NA, regex=True)

# Keep only the rows that hold more than one non-missing value
has_multiple_values = df.notna().sum(axis=1) > 1
df = df[has_multiple_values]
df

In [None]:
import pandas as pd
import numpy as np

# Treat empty / whitespace-only cells as missing, for counting purposes only
df_temp = df.replace(r'^\s*$', np.nan, regex=True)

# Attach the per-row count of non-missing values to a copy of df. Using assign
# leaves df itself untouched, so the helper column never leaks into it and no
# assignment is made on a filtered slice.
df_scored = df.assign(non_empty_count=df_temp.notna().sum(axis=1))

# Rows whose agent_id is present and not blank
agent_ids = df_scored['agent_id']
valid_agent_id_mask = agent_ids.notna() & agent_ids.astype(str).str.strip().ne('')

# Split into rows with and without a usable agent_id
df_with_id = df_scored[valid_agent_id_mask]
df_without_id = df_scored[~valid_agent_id_mask]

# For duplicated agent_id keep the row with the most data. mergesort is stable, so
# rows with equal counts keep their original relative order and the surviving row
# is the first one encountered, instead of depending on quicksort's tie handling.
df_with_id_cleaned = (
    df_with_id
    .sort_values('non_empty_count', ascending=False, kind='mergesort')
    .drop_duplicates(subset='agent_id', keep='first')
)

# Recombine, drop the helper column, then blank out placeholder text:
# whitespace-only, "none" in any letter case, or a single "-"
df_cleaned = (
    pd.concat([df_with_id_cleaned, df_without_id], ignore_index=True)
    .drop(columns=['non_empty_count'])
    .replace(to_replace=r'^\s*$|(?i:^none$)|^-$', value=np.nan, regex=True)
)

df_cleaned.columns = df_cleaned.columns.str.lower()
df_cleaned


In [None]:
# Per-column count of leftover placeholder strings. The counts are the last
# expression of the cell so that they are displayed; previously the result was
# computed and discarded because a bare df_cleaned followed it.
placeholder_counts = df_cleaned.replace(np.nan, "NaN").isin(["none", "-", "None"]).sum()
placeholder_counts

In [None]:
# df.to_csv('dim_car3.csv', index=False)

In [None]:
# Cells consisting of exactly one "." are treated as missing
lone_dot = r'^\.$'
df_cleaned = df_cleaned.replace(to_replace=lone_dot, value=np.nan, regex=True)
df_cleaned

In [None]:
# List the column names of df_cleaned, one per line
for column_name in df_cleaned.columns.tolist():
    print(column_name)

In [None]:
import numpy as np

# Convert hire_date to Unix epoch seconds, keeping missing dates missing.
# NOTE(review): no visible cell of this notebook creates a hire_date column, so the
# conversion is guarded instead of failing with KeyError — confirm whether this
# cell belongs to this pipeline at all.
if "hire_date" in df_cleaned.columns:
    # assumes naive (timezone-free) timestamps — TODO confirm
    hire_ts = pd.to_datetime(df_cleaned["hire_date"], errors='coerce')

    # Whole seconds since 1970-01-01. Unlike astype('int64') this does not depend on
    # the datetime64 resolution, and NaT comes out as NaN rather than an integer.
    epoch_seconds = (hire_ts - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)

    # Nullable Int64 keeps the values integral and represents missing dates as <NA>
    df_cleaned["hire_date"] = epoch_seconds.astype("Int64")
else:
    print("hire_date column not found in df_cleaned; skipping epoch conversion")


In [None]:
# Missing flags count as 0; then cast to int and finally to bool
move_flag_filled = df_cleaned['is_move_card_ins'].fillna(0)
df_cleaned['is_move_card_ins'] = move_flag_filled.astype(int).astype(bool)

# Check the result before loading: resulting dtype and the distinct values present
move_flag = df_cleaned['is_move_card_ins']
print(move_flag.dtype)
print(move_flag.unique())


In [None]:
# Turn empty / whitespace-only cells into NaN
whitespace_only = r'^\s*$'
df_cleaned = df_cleaned.replace(to_replace=whitespace_only, value=np.nan, regex=True)


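อาจจะมีปัญหา: เซลล์ถัดไปเปลี่ยนชื่อคอลัมน์ `agent_id` เป็น `card_ins_id` แต่ `df_cleaned` มีคอลัมน์ `card_ins_id` อยู่แล้ว จึงจะได้คอลัมน์ชื่อซ้ำกันสองคอลัมน์ (ควรตรวจสอบก่อนเขียนลงฐานข้อมูล)
    |
    V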

In [None]:
# Rename agent_id to card_ins_id.
# NOTE(review): the source query selects card_ins_id and no visible cell drops it, so
# df_cleaned appears to end up with two columns named card_ins_id — confirm which
# one the target table expects.
agent_id_rename = {'agent_id': 'card_ins_id'}
df_cleaned = df_cleaned.rename(columns=agent_id_rename)


In [None]:
import pandas as pd
import os
from sqlalchemy import create_engine
from dotenv import load_dotenv

# Load variables from .env into the process environment
load_dotenv()

# Read the target-database connection settings from the environment
host = os.getenv('DB_HOST_test')
user = os.getenv('DB_USER_test')
password = os.getenv('DB_PASSWORD_test')
port = os.getenv('DB_PORT_test')
database = 'fininsurance'

# Build the connection URL from its components instead of interpolating them into a
# string: URL.create escapes characters such as '@', ':' or '/' in the credentials,
# which would otherwise corrupt the URL.
target_url = URL.create(
    'postgresql+psycopg2',
    username=user,
    password=password,
    host=host,
    port=int(port) if port else None,  # URL.create expects an integer port (or None)
    database=database,
)
engine = create_engine(target_url)

# if_exists='append' adds rows to the existing table, so every execution of this
# cell inserts the whole frame again.
rows_inserted = df_cleaned.to_sql('fact_card_ins', engine, if_exists='append', index=False)


In [None]:
# Report the longest text value of every object-dtype column.
# Iterating with items() yields each column as a Series even when two columns share
# a name, whereas df_cleaned[col] returns a DataFrame (no .dtype) in that case.
for col, values in df_cleaned.items():
    if values.dtype == object:
        # Missing values are dropped first; otherwise astype(str) would measure the
        # text "nan" / "None" instead of real data.
        lengths = values.dropna().astype(str).map(len)
        max_len = int(lengths.max()) if not lengths.empty else 0
        print(f"{col}: max length = {max_len}")
