In [None]:
import IPython
import sys

def clean_notebook():
    """Clear the output of the current cell and print a confirmation line.

    ``clear_output`` is called with ``wait=True`` and is followed
    immediately by the confirmation message.
    """
    done_message = "Notebook cleaned."
    IPython.display.clear_output(wait=True)
    print(done_message)

!pip install --upgrade evidently==0.7.14

# Clean up the notebook
clean_notebook()

# ตรวจสอบคุณภาพข้อมูล (Data Quality Report)

#### 1. Iris Data

In [None]:
import pandas as pd
from sklearn.datasets import load_iris


# Load the Iris dataset from scikit-learn; `as_frame=True` makes the bunch
# expose the data directly as a pandas DataFrame via `.frame`
iris_bunch = load_iris(as_frame=True)
df_iris = iris_bunch.frame
df_iris.head(10)


In [None]:
import pandas as pd
from sklearn.datasets import load_iris

from evidently import Report
from evidently.metrics import *
from evidently.presets import *


# Build the Data Quality report for the Iris data.
# `None` is passed as the second dataset, so only `df_iris` is given to the run.
# The result is named `iris_eval` (not `eval`) so that the Python builtin
# `eval()` is not shadowed for the rest of the kernel session.
report = Report([DataSummaryPreset()])
iris_eval = report.run(df_iris, None)

# Write the rendered report next to the notebook
iris_eval.save_html("iris_data_quality_report.html")




#### 2. Titanic Data

In [30]:
import pandas as pd
from sklearn.datasets import fetch_openml

# Fetch the Titanic dataset (version 1) from OpenML
titanic = fetch_openml("titanic", version=1, as_frame=True)

# Raw pandas DataFrame as delivered by OpenML
df = titanic.frame

# 1) Cast columns to dtypes that suit the metrics used later
titanic_dtypes = {
    "survived": "int64",     # target as a numeric 0/1 column
    "sex": "category",       # categorical, for grouping / counting
    "embarked": "category",
    "ticket": "category",    # important: convert from text to category
    "pclass": "int64",       # kept numeric; still usable for groupby
}
df_titanic = df.astype(titanic_dtypes)


df_titanic.head(20)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
5,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,E12,S,3,,"New York, NY"
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D7,S,10,,"Hudson, NY"
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0,A36,S,,,"Belfast, NI"
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY"
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"


In [31]:
import pandas as pd
from sklearn.datasets import load_iris

from evidently import Report
from evidently.metrics import *
from evidently.presets import *


# Build the Data Quality report for the Titanic data.
# `None` is passed as the second dataset, so only `df_titanic` is given to the run.
# The result is named `titanic_eval` (not `eval`) so that the Python builtin
# `eval()` is not shadowed for the rest of the kernel session.
report = Report([DataSummaryPreset()])
titanic_eval = report.run(df_titanic, None)

# Write the rendered report next to the notebook
titanic_eval.save_html("titanic_data_quality_report.html")




In [None]:
from evidently import Report
from evidently.metrics import *
from evidently.presets import *
import numpy as np

# Build a mock loan-application DataFrame.
# The global NumPy seed is fixed first, and the columns are generated in a
# fixed order, so the random draws are the same on every fresh run.
np.random.seed(42)

df = pd.DataFrame(dict(
    age=np.random.randint(18, 70, size=1000),
    income=np.random.randint(20000, 150000, size=1000),
    credit_score=np.random.randint(300, 850, size=1000),
    loan_amount=np.random.randint(5000, 50000, size=1000),
    employment_years=np.random.randint(0, 30, size=1000),
    debt_ratio=np.random.uniform(low=0, high=1, size=1000),
    num_credit_lines=np.random.randint(1, 10, size=1000),
    category=np.random.choice(list("ABCD"), size=1000),
    approved=np.random.choice([0, 1], size=1000, p=[0.3, 0.7]),
))

# reference = stands in for the training data
reference = df.sample(n=500, random_state=42)

# current = stands in for new data arriving at the model
current = df.sample(n=500, random_state=99)

print(f"DataFrame shape: {df.shape}")
df.head(10)


In [None]:
# Show the column names of `df` as a plain Python list
list(df.columns)

#### Create Data Drift

In [None]:
# =========================
# 2) Inject drift into two columns of `current`
#    - Numeric drift:     income   (shift the mean / scale upward)
#    - Categorical drift: category (skew the class proportions)
# =========================

# Income drift (scale & shift), then clip to a plausible range.
# The drifted values are derived from the untouched rows of `df` (looked up by
# `current`'s index) instead of from `current['income']` itself, so re-running
# this cell does not compound the 1.6x scaling.
current['income'] = (
    (df.loc[current.index, 'income'] * 1.6 + 10_000)
    .clip(lower=15_000, upper=300_000)
    .astype(int)
)

# Category drift: redraw the labels so that 'D' dominates (70%).
# A dedicated seeded generator makes the draw identical on every re-run,
# regardless of how much of the global NumPy random state has been consumed.
drift_rng = np.random.default_rng(42)
new_cats = drift_rng.choice(['A', 'B', 'C', 'D'], size=len(current), p=[0.10, 0.10, 0.10, 0.70])
current['category'] = new_cats



In [None]:

# Build the Data Drift report over all columns.
drift_report = Report([
    DataDriftPreset()
])

# Evidently's `Report.run` takes the current data first and the reference
# data second; keywords are used so the two roles cannot be swapped.
my_eval = drift_report.run(current_data=current, reference_data=reference)

# Write the rendered report next to the notebook
my_eval.save_html("data_drift_report.html")



In [None]:
# Drift report restricted to selected columns.
# - The preset's keyword is `columns` (plural).
# - The run uses this notebook's `current` / `reference` frames.
# - The mock data has no "target"/"prediction" columns; its label column
#   `approved` is used instead.
#   NOTE(review): add a prediction column here once a model output is available.
report = Report([
    DataDriftPreset(columns=["approved"])
])

my_eval = report.run(current_data=current, reference_data=reference)
my_eval
# NOTE(review): a JSON export is presumably available via my_eval.json() — confirm for the installed version.

In [None]:
# Restrict the drift check to a hand-picked subset of columns
drift_columns = ['age', 'income', 'credit_score', 'category']
metrics = [DataDriftPreset(drift_columns)]

drift_report = Report(metrics)
result = drift_report.run(current_data=current, reference_data=reference)

# Display the result as the cell output (nothing is written to disk here)
result