In [19]:
import pandas as pd
import great_expectations as ge

In [20]:
# Load the cleaned dataset from CSV into a pandas DataFrame.
file_path = "P2M3_Nisrina_data_clean.csv"
df = pd.read_csv(file_path)
# Initialize a Great Expectations dataset from the DataFrame; the expectation
# checks in the cells below are all called on this `dataset` object.
dataset = ge.from_pandas(df)

In [21]:
# 1. To be unique: expect every value in the "index" column to be unique.
dataset.expect_column_values_to_be_unique(column="index")

{
  "success": true,
  "result": {
    "element_count": 1064,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [22]:
# 2.1 To be between min and max: expect "publishing_year" to be between 1300 and 2025.
# NOTE(review): an earlier version of this comment said 1900; the check actually uses 1300 — confirm the intended lower bound.
dataset.expect_column_values_to_be_between(column="publishing_year", min_value=1300, max_value=2025)


{
  "success": true,
  "result": {
    "element_count": 1064,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [23]:
# 2.2 Expect "units_sold" to be between 0 and 80000.
# NOTE(review): an earlier version of this comment said 100000; the check actually uses 80000 — confirm the intended upper bound.
dataset.expect_column_values_to_be_between(column="units_sold", min_value=0, max_value=80000)


{
  "success": true,
  "result": {
    "element_count": 1064,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [24]:
# 2.3 To be between min and max: expect "sales_rank" to be between 1 and 2000.
dataset.expect_column_values_to_be_between(column="sales_rank", min_value=1, max_value=2000)


{
  "success": true,
  "result": {
    "element_count": 1064,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [25]:
# 3. To be in set: expect every "language_code" value to be one of the
# language codes listed in value_set.
dataset.expect_column_values_to_be_in_set(
    column="language_code",
    value_set=["en-US", "eng", "spa", "en-GB", "fre", "en-CA", "nl","ara"]
)


{
  "success": true,
  "result": {
    "element_count": 1064,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [26]:
# 4. To be of type: expect the "publishing_year" column to have the data type "int64".
# NOTE(review): this step was originally labelled "to be in type list", but the call below
# is expect_column_values_to_be_of_type (a single type, not a list) — confirm which
# expectation is intended.
dataset.expect_column_values_to_be_of_type(column="publishing_year", type_="int64")


{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [27]:
# 5.1 Expect the median of column "units_sold" to be between 10 and 3950.
# This validates the data distribution and makes sure the median stays within
# the expected values, to maintain data quality.
# NOTE(review): the upper bound (3950) sits just above the median recorded for this
# run (3937.5), so a small change in the data could make this check fail.

dataset.expect_column_median_to_be_between(
    column="units_sold",
    min_value=10,
    max_value=3950
)


{
  "success": true,
  "result": {
    "observed_value": 3937.5,
    "element_count": 1064,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [28]:
# 5.2 Expect the median of column "book_average_rating" to be between 3 and 4.5.
dataset.expect_column_median_to_be_between(
    column="book_average_rating",
    min_value=3,
    max_value=4.5
)


{
  "success": true,
  "result": {
    "observed_value": 4.02,
    "element_count": 1064,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [29]:
# 6. Validate that the proportion of unique values in the "author" column
# is between 0.6 and 1.0.
dataset.expect_column_proportion_of_unique_values_to_be_between(
    column="author",  # The column under test is "author"
    min_value=0.6, # Minimum accepted proportion of unique values: 0.6 (60%)
    max_value=1.0 # Maximum accepted proportion of unique values: 1.0 (100%)
)

{
  "success": true,
  "result": {
    "observed_value": 0.6860902255639098,
    "element_count": 1064,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [30]:
# 7. Expect the table to have exactly 15 columns.
dataset.expect_table_column_count_to_equal(
    value=15  # 15 is the expected number of columns
)


{
  "success": true,
  "result": {
    "observed_value": 15
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [31]:
# 8. Expect the length of every value in the "book_name" column to be
# between 3 and 150.
dataset.expect_column_value_lengths_to_be_between(
    column="book_name",  
    min_value=3,
    max_value=150
)

{
  "success": true,
  "result": {
    "element_count": 1064,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [32]:
# Validate the dataset and print the result. The printed result carries an
# overall "success" flag plus one entry per expectation declared above, each
# with its own "success" flag, the kwargs it was called with, and its result.
results = dataset.validate()
print(results)


{
  "success": true,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_be_unique",
        "kwargs": {
          "column": "index",
          "result_format": "BASIC"
        },
        "meta": {}
      },
      "result": {
        "element_count": 1064,
        "missing_count": 0,
        "missing_percent": 0.0,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "unexpected_percent_nonmissing": 0.0,
        "partial_unexpected_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_be_between",
        "kwargs": {
          "column": "publishing_year",
          "min_value": 1300,
          "max_value": 2025,
 