## Data Validation Experiment using evidently

In [1]:
import pandas as pd 
from evidently.dashboard import Dashboard
from evidently.tabs import DataDriftTab , CatTargetDriftTab
from evidently.model_profile import Profile
from evidently.profile_sections import DataDriftProfileSection



In [2]:
# load the data
df = pd.read_csv("laptop_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [3]:
from sklearn.model_selection import train_test_split

train_set , test_set = train_test_split(df , test_size = 0.2 , random_state = 42)

In [4]:
train_set.head()

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
10,10,HP,Notebook,15.6,1366x768,Intel Core i5 7200U 2.5GHz,4GB,500GB HDD,Intel HD Graphics 620,No OS,1.86kg,20986.992
147,147,Asus,Notebook,15.6,Full HD 1920x1080,Intel Celeron Dual Core N3350 1.1GHz,4GB,1TB HDD,Intel HD Graphics 500,Windows 10,2kg,18328.32
1287,1287,HP,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,40705.92
767,767,Dell,Gaming,15.6,IPS Panel Touchscreen / 4K Ultra HD 3840x2160,Intel Core i7 6700HQ 2.6GHz,16GB,128GB SSD + 1TB HDD,Nvidia GeForce GTX 960M,Windows 10,2.72kg,58554.72
816,816,Razer,Ultrabook,12.5,IPS Panel 4K Ultra HD / Touchscreen 3840x2160,Intel Core i7 7500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 620,Windows 10,1.29kg,95850.72


In [5]:
data_drift_dashboard = Dashboard(tabs = [DataDriftTab()])

In [6]:
data_drift_dashboard.calculate(reference_data = train_set , current_data = test_set)

In [7]:
# save as a html file
data_drift_dashboard.save("data_drift_test.html")

In [8]:
data_drift_profile = Profile(sections = [DataDriftProfileSection()])

In [9]:
data_drift_profile.calculate(reference_data = train_set , current_data = test_set)

In [10]:
report = data_drift_profile.json()

In [11]:
import json

json_report = json.loads(report)
json_report

{'data_drift': {'name': 'data_drift',
  'datetime': '2025-10-28 05:24:00.990244',
  'data': {'utility_columns': {'date': None,
    'id': None,
    'target': None,
    'prediction': None},
   'num_feature_names': ['Inches', 'Price', 'Unnamed: 0'],
   'cat_feature_names': ['Company',
    'Cpu',
    'Gpu',
    'Memory',
    'OpSys',
    'Ram',
    'ScreenResolution',
    'TypeName',
    'Weight'],
   'text_feature_names': [],
   'datetime_feature_names': [],
   'target_names': None,
   'options': {'confidence': None,
    'drift_share': 0.5,
    'nbinsx': 10,
    'xbins': None},
   'metrics': {'n_features': 12,
    'n_drifted_features': 7,
    'share_drifted_features': 0.5833333333333334,
    'dataset_drift': True,
    'Inches': {'current_small_hist': {'x': [11.6,
       12.17,
       12.74,
       13.31,
       13.879999999999999,
       14.45,
       15.02,
       15.59,
       16.16,
       16.73,
       17.3],
      'y': [0.04705249714324122,
       0.020165355918531955,
       0.21509

In [12]:
n_features = json_report["data_drift"]["data"]["metrics"]["n_features"]
n_drifted_features = json_report["data_drift"]["data"]["metrics"]["n_drifted_features"]
data_drift_percentage = (n_drifted_features / n_features) * 100

data_drift_percentage

58.333333333333336

In [13]:
drift_status = json_report["data_drift"]["data"]["metrics"]["dataset_drift"]
drift_status

True

### As data drift detected that will make a huge impact on our model's performance. Also it's a validation issue but we will ignore this in component part for now.

## Data Validation using pandera

In [14]:
import pandera as pa
from pandera import Column , DataFrameSchema , Check

In [15]:
df.columns

Index(['Unnamed: 0', 'Company', 'TypeName', 'Inches', 'ScreenResolution',
       'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price'],
      dtype='object')

In [16]:
for col in df.columns.to_list():
    print(f"Column name: {col}. Unique values: {df[col].unique()}")

Column name: Unnamed: 0. Unique values: [   0    1    2 ... 1300 1301 1302]
Column name: Company. Unique values: ['Apple' 'HP' 'Acer' 'Asus' 'Dell' 'Lenovo' 'Chuwi' 'MSI' 'Microsoft'
 'Toshiba' 'Huawei' 'Xiaomi' 'Vero' 'Razer' 'Mediacom' 'Samsung' 'Google'
 'Fujitsu' 'LG']
Column name: TypeName. Unique values: ['Ultrabook' 'Notebook' 'Netbook' 'Gaming' '2 in 1 Convertible'
 'Workstation']
Column name: Inches. Unique values: [13.3 15.6 15.4 14.  12.  11.6 17.3 10.1 13.5 12.5 13.  18.4 13.9 12.3
 17.  15.  14.1 11.3]
Column name: ScreenResolution. Unique values: ['IPS Panel Retina Display 2560x1600' '1440x900' 'Full HD 1920x1080'
 'IPS Panel Retina Display 2880x1800' '1366x768'
 'IPS Panel Full HD 1920x1080' 'IPS Panel Retina Display 2304x1440'
 'IPS Panel Full HD / Touchscreen 1920x1080'
 'Full HD / Touchscreen 1920x1080' 'Touchscreen / Quad HD+ 3200x1800'
 'IPS Panel Touchscreen 1920x1200' 'Touchscreen 2256x1504'
 'Quad HD+ / Touchscreen 3200x1800' 'IPS Panel 1366x768'
 'IPS Panel 4K U

In [19]:
df['Price'].max()

324954.72

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   TypeName          1303 non-null   object 
 3   Inches            1303 non-null   float64
 4   ScreenResolution  1303 non-null   object 
 5   Cpu               1303 non-null   object 
 6   Ram               1303 non-null   object 
 7   Memory            1303 non-null   object 
 8   Gpu               1303 non-null   object 
 9   OpSys             1303 non-null   object 
 10  Weight            1303 non-null   object 
 11  Price             1303 non-null   float64
dtypes: float64(2), int64(1), object(9)
memory usage: 122.3+ KB


In [21]:
data_schema_map = {
    "Company": {
        "type": "category",
        "allowed_values": [
            'Apple', 'HP', 'Acer', 'Asus', 'Dell', 'Lenovo', 'Chuwi', 'MSI', 'Microsoft',
            'Toshiba', 'Huawei', 'Xiaomi', 'Vero', 'Razer', 'Mediacom', 'Samsung', 'Google',
            'Fujitsu', 'LG'
        ]
    },
    "TypeName": {
        "type": "category",
        "allowed_values": [
            'Ultrabook', 'Notebook', 'Netbook', 'Gaming', '2 in 1 Convertible', 'Workstation'
        ]
    },
    "Inches": {
        "type": "float",
        "allowed_range": [10.0, 18.5]
    },
    "ScreenResolution": {
        "type": "string",
        "description": "Text containing resolution info and panel type"
    },
    "Cpu": {
        "type": "string",
        "description": "Full CPU name with frequency (e.g., Intel Core i5 7200U 2.5GHz)"
    },
    "Ram": {
        "type": "string",
        "allowed_values": ['2GB', '4GB', '6GB', '8GB', '12GB', '16GB', '24GB', '32GB', '64GB']
    },
    "Memory": {
        "type": "string",
        "description": "Storage configuration (SSD, HDD, Hybrid, Flash Storage)"
    },
    "Gpu": {
        "type": "string",
        "description": "GPU brand and model (Intel, AMD, Nvidia, ARM)"
    },
    "OpSys": {
        "type": "category",
        "allowed_values": [
            'macOS', 'No OS', 'Windows 10', 'Mac OS X', 'Linux', 'Android',
            'Windows 10 S', 'Chrome OS', 'Windows 7'
        ]
    },
    "Weight": {
        "type": "string",
        "description": "Weight string like '1.37kg', should be converted to float (kg)"
    },
    "Price": {
        "type": "float",
        "allowed_range": [9270, 324955]
    }
}


In [None]:
laptop_data_schema = DataFrameSchema({

    "Company": Column(
        str,
        Check.isin([
            'Apple', 'HP', 'Acer', 'Asus', 'Dell', 'Lenovo', 'Chuwi', 'MSI', 'Microsoft',
            'Toshiba', 'Huawei', 'Xiaomi', 'Vero', 'Razer', 'Mediacom', 'Samsung', 'Google',
            'Fujitsu', 'LG'
        ]),
        nullable=False
    ),

    "TypeName": Column(
        str,
        Check.isin([
            'Ultrabook', 'Notebook', 'Netbook', 'Gaming', '2 in 1 Convertible', 'Workstation'
        ]),
        nullable=False
    ),

    "Inches": Column(
        float,
        Check.in_range(10.0, 18.5),
        nullable=False
    ),

    "ScreenResolution": Column(str, nullable=False),

    "Cpu": Column(str, nullable=False),

    "Ram": Column(
        str,
        Check.isin(['2GB', '4GB', '6GB', '8GB', '12GB', '16GB', '24GB', '32GB', '64GB']),
        nullable=False
    ),

    "Memory": Column(str, nullable=False),

    "Gpu": Column(str, nullable=False),

    "OpSys": Column(
        str,
        Check.isin([
            'macOS', 'No OS', 'Windows 10', 'Mac OS X', 'Linux', 'Android',
            'Windows 10 S', 'Chrome OS', 'Windows 7'
        ]),
        nullable=False
    ),

    "Weight": Column(str, nullable=False),

    "Price": Column(
        float,
        Check.in_range(9270, 324955),
        nullable=False
    )
})

In [24]:
import pandera.pandas as pa
def build_pandera_schema_from_map(schema_map: dict) -> DataFrameSchema:
    schema_columns = {}

    for col, rules in schema_map.items():
        col_type = rules.get("type")
        allowed_values = rules.get("allowed_values")
        allowed_range = rules.get("allowed_range")

        # Infer Pandera dtype
        if col_type in ["float", "int"]:
            dtype = float if col_type == "float" else int
        else:
            dtype = str

        # Build column checks
        checks = []
        if allowed_values:
            checks.append(Check.isin(allowed_values))
        if allowed_range:
            checks.append(Check.in_range(allowed_range[0], allowed_range[1]))

        schema_columns[col] = Column(dtype, checks=checks, nullable=False)

    # Return Pandera schema object
    return DataFrameSchema(schema_columns)


In [25]:
laptop_schema = build_pandera_schema_from_map(data_schema_map)

In [26]:
validated_df = laptop_schema.validate(df)