In [22]:
import pandas as pd
import numpy as np



In [55]:
df = pd.read_csv('Data.csv')

print("Rows:", df.shape[0])
print("Columns:", df.shape[1])
df.info()
df.describe(include='all')

Rows: 119
Columns: 25
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Book_ID               119 non-null    int64  
 1   Title                 119 non-null    object 
 2   Author                119 non-null    object 
 3   Year_Published        119 non-null    int64  
 4   Year_Banned           119 non-null    int64  
 5   Decade_Published      119 non-null    int64  
 6   Decade_Banned         119 non-null    int64  
 7   Years_To_Ban          119 non-null    int64  
 8   Ban_Duration_Years    3 non-null      float64
 9   Country_Region        119 non-null    object 
 10  Region_Code           119 non-null    object 
 11  Regime_Type           119 non-null    object 
 12  Ban_Level             119 non-null    object 
 13  Stated_Ban_Reason     119 non-null    object 
 14  Reason_Category       119 non-null    object 
 15  T

Unnamed: 0,Book_ID,Title,Author,Year_Published,Year_Banned,Decade_Published,Decade_Banned,Years_To_Ban,Ban_Duration_Years,Country_Region,...,Theme_Tags,Theme_Count,Repeat_Ban,Author_Ban_Frequency,Reinstated,Legal_Challenge,Conflict_Period,Press_Freedom_Score,Source_Reference,Notes
count,119.0,119,119,119.0,119.0,119.0,119.0,119.0,3.0,119,...,119,119.0,119.0,119.0,119.0,119.0,119.0,0.0,119,8
unique,,119,93,,,,,,,30,...,107,,,,,,,,28,8
top,,Ulysses,Khaled Hosseini,,,,,,,United States,...,youth;identity;language,,,,,,,,ALA archives,Landmark obscenity case
freq,,1,7,,,,,,,61,...,3,,,,,,,,48,1
mean,60.714286,,,1861.865546,1955.571429,1827.394958,1928.739496,93.705882,21.666667,,...,,3.0,0.411765,1.731092,0.87395,0.092437,0.184874,,,
std,34.852508,,,412.520999,89.022238,413.747778,118.261092,372.969403,11.015141,,...,,0.0,0.494234,1.125193,0.33331,0.290867,0.389837,,,
min,1.0,,,-750.0,1559.0,-750.0,1550.0,0.0,11.0,,...,,3.0,0.0,1.0,0.0,0.0,0.0,,,
25%,30.5,,,1930.5,1953.0,1885.0,1930.0,3.0,16.0,,...,,3.0,0.0,1.0,1.0,0.0,0.0,,,
50%,61.0,,,1965.0,1980.0,1950.0,1980.0,8.0,21.0,,...,,3.0,0.0,1.0,1.0,0.0,0.0,,,
75%,90.5,,,1992.0,2004.0,1990.0,2000.0,20.0,27.0,,...,,3.0,1.0,2.0,1.0,0.0,0.0,,,


In [57]:
import pandas as pd
import numpy as np

# -------------------------
# Load dataset
# -------------------------
df = pd.read_csv("data.csv")

print("Rows:", df.shape[0])
print("Columns:", df.shape[1])

# -------------------------
# Required columns
# -------------------------
required_columns = [
    "Book_ID","Title","Author","Year_Published","Year_Banned",
    "Decade_Published","Decade_Banned","Years_To_Ban",
    "Country_Region","Region_Code","Regime_Type","Ban_Level",
    "Reason_Category","Theme_Tags","Theme_Count",
    "Repeat_Ban","Author_Ban_Frequency","Reinstated",
    "Legal_Challenge","Conflict_Period","Source_Reference"
]

missing_cols = set(required_columns) - set(df.columns)
assert not missing_cols, f"Missing columns: {missing_cols}"

# -------------------------
# Duplicate checks
# -------------------------
assert df["Book_ID"].is_unique, "Duplicate Book_ID detected"

# -------------------------
# Temporal logic checks
# -------------------------
bad_years = df[df["Year_Banned"] < df["Year_Published"]]
print("Year logic violations:", len(bad_years))

df["Years_To_Ban_calc"] = df["Year_Banned"] - df["Year_Published"]
year_mismatch = df[df["Years_To_Ban"] != df["Years_To_Ban_calc"]]
print("Years_To_Ban mismatches:", len(year_mismatch))

# -------------------------
# Decade validation
# -------------------------
def decade_from_year(y):
    return f"{int(y // 10) * 10}s"

decade_pub_mismatch = df[
    df["Decade_Published"] != df["Year_Published"].apply(decade_from_year)
]
decade_ban_mismatch = df[
    df["Decade_Banned"] != df["Year_Banned"].apply(decade_from_year)
]

print("Decade_Published mismatches:", len(decade_pub_mismatch))
print("Decade_Banned mismatches:", len(decade_ban_mismatch))

# -------------------------
# Locked vocabularies
# -------------------------
valid_region = {"NA","EU","SA","EA","MENA","LATAM","SSA","SEA","GLOBAL"}
valid_regime = {"Democracy","Hybrid","Authoritarian","Colonial"}
valid_ban_level = {"School","Library","National","Mixed","Informal"}
valid_reason = {
    "Sexuality","Political","Religious","Race","Language",
    "Moral","Ideological","Violence","Social_Order"
}

print("Invalid Region_Code:",
      df.loc[~df["Region_Code"].isin(valid_region), "Region_Code"].unique())

print("Invalid Regime_Type:",
      df.loc[~df["Regime_Type"].isin(valid_regime), "Regime_Type"].unique())

print("Invalid Ban_Level:",
      df.loc[~df["Ban_Level"].isin(valid_ban_level), "Ban_Level"].unique())

print("Invalid Reason_Category:",
      df.loc[~df["Reason_Category"].isin(valid_reason), "Reason_Category"].unique())

# -------------------------
# Theme count validation
# -------------------------
def count_themes(x):
    if pd.isna(x):
        return 0
    return len(x.split(";"))

theme_mismatch = df[
    df["Theme_Count"] != df["Theme_Tags"].apply(count_themes)
]

print("Theme_Count mismatches:", len(theme_mismatch))

# -------------------------
# Repeat ban logic
# -------------------------
repeat_logic_violations = df[
    (df["Repeat_Ban"] == 1) & (df["Author_Ban_Frequency"] < 2)
]

print("Repeat_Ban logic violations:", len(repeat_logic_violations))

# -------------------------
# Binary field validation
# -------------------------
binary_fields = ["Repeat_Ban","Reinstated","Legal_Challenge","Conflict_Period"]

for col in binary_fields:
    bad_vals = df.loc[~df[col].isin([0,1,np.nan]), col].unique()
    print(f"{col} invalid values:", bad_vals)

# -------------------------
# Missing value audit
# -------------------------
missing_summary = df.isna().sum().sort_values(ascending=False)
print("\nMissing values per column:\n", missing_summary)


Rows: 119
Columns: 25
Year logic violations: 0
Years_To_Ban mismatches: 0
Decade_Published mismatches: 119
Decade_Banned mismatches: 119
Invalid Region_Code: ['US' 'CA']
Invalid Regime_Type: []
Invalid Ban_Level: []
Invalid Reason_Category: []
Theme_Count mismatches: 0
Repeat_Ban logic violations: 0
Repeat_Ban invalid values: []
Reinstated invalid values: []
Legal_Challenge invalid values: []
Conflict_Period invalid values: []

Missing values per column:
 Press_Freedom_Score     119
Ban_Duration_Years      116
Notes                   111
Book_ID                   0
Reason_Category           0
Source_Reference          0
Conflict_Period           0
Legal_Challenge           0
Reinstated                0
Author_Ban_Frequency      0
Repeat_Ban                0
Theme_Count               0
Theme_Tags                0
Stated_Ban_Reason         0
Title                     0
Ban_Level                 0
Regime_Type               0
Region_Code               0
Country_Region            0
Years_To