In [55]:
import pandas as pd

In [56]:
# Mount Google Drive at /content/drive so the CSV referenced below is reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [57]:
# Location of the CSV on the mounted Drive; it is read into `data` below and
# written back to by repair_data().
file_path = '/content/drive/MyDrive/ADSDB/exploitation/data.csv'

In [58]:
# Load the dataset into a DataFrame; the constraint checks and repair step
# further down operate on `data`.
data = pd.read_csv(file_path)

(27483, 8)

In [64]:
# Defining denial constraints check functions
def check_unique_records(df):
    """Return True when ``df`` contains no fully duplicated rows.

    Rows are compared across all of ``df``'s own columns. A row counts as a
    duplicate when an identical row appears earlier in the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    bool
        True if every row is unique, False otherwise.
    """
    # Use the columns of the frame that was passed in (the default subset),
    # not those of the notebook-level `data` variable, so the check is valid
    # for any DataFrame and does not depend on global state.
    return not df.duplicated().any()

def check_missings(df):
    """Return True when ``df`` has no missing (null/NaN) cell in any column."""
    # Collapse per-column "has a null" flags into a single frame-wide flag.
    any_null = df.isnull().any().any()
    return not any_null

def check_year_range(df, min_year=2008, max_year=2022):
    """Return True when every ``Year`` value lies in ``[min_year, max_year]``.

    Only the ``Year`` column is inspected. The inflation-rate bound that used
    to be evaluated here is already enforced by
    ``check_activity_inflation_rate``; keeping it in this function made the
    year check fail (or raise ``KeyError``) for reasons unrelated to ``Year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Year`` column.
    min_year, max_year : int, optional
        Inclusive bounds of the accepted range (defaults: 2008 and 2022).

    Returns
    -------
    bool
        True if all years are within the bounds. Missing values fail the check.
    """
    return bool(df['Year'].between(min_year, max_year).all())

def check_quarter_values(df):
    """Return whether every ``Quarter`` value looks like ``<year>Q<roman numeral>``.

    Accepted values are a four-digit year starting with 1 or 2, the letter
    ``Q`` and one of I, II, III or IV (for example ``2015QIII``). Missing
    values are treated as non-matching.
    """
    return (
        df['Quarter']
        .str.match(r'^[12]\d\d\dQ(IV|III|II|I)$', na=False)
        .all()
    )

def check_activity_inflation_rate(df):
    """Return whether activity and inflation rates are within their bounds.

    ``activity_rate`` must lie in [0, 100] and ``inflation_rate`` must be at
    least -100. The inflation column is only examined when the activity
    column passes.
    """
    activity_ok = df['activity_rate'].between(0, 100).all()
    if not activity_ok:
        return activity_ok
    return df['inflation_rate'].ge(-100).all()

def check_house_price_index(df):
    """Return whether every ``house_price_index`` value is at least -100."""
    at_or_above_floor = df['house_price_index'] >= -100
    return at_or_above_floor.all()

def check_provinces(df):
    """Return whether every ``Provinces`` value is one of the recognised names.

    The accepted names are listed explicitly below, one per line.
    """
    recognised = {
        'National Total',
        'Castilla - La Mancha',
        'Comunitat Valenciana',
        'Andalucía',
        'País Vasco',
        'Asturias. Principado de',
        'Castilla y León',
        'Extremadura',
        'Balears. Illes',
        'Cataluña',
        'Cantabria',
        'Galicia',
        'Aragón',
        'Madrid. Comunidad de',
        'Murcia. Región de',
        'Navarra. Comunidad Foral de',
        'Canarias',
        'Rioja. La',
    }
    is_recognised = df['Provinces'].isin(recognised)
    return is_recognised.all()

def check_sex_values(df):
    """Return whether every ``Sex`` value is 'Both sexes', 'Males' or 'Females'."""
    allowed = {'Both sexes', 'Males', 'Females'}
    in_allowed = df['Sex'].isin(allowed)
    return in_allowed.all()

# Applying and checking constraints
# Evaluate each constraint check against `data`, in the listed order, and
# collect the outcomes under a human-readable label.
constraints_results = {
    label: checker(data)
    for label, checker in (
        ("Unique Records", check_unique_records),
        ("No missings", check_missings),
        ("Year Range", check_year_range),
        ("Quarter Values", check_quarter_values),
        ("Activity & Inflation Rate", check_activity_inflation_rate),
        ("House Price Index Positive", check_house_price_index),
        ("Valid Provinces", check_provinces),
        ("Valid Sex Values", check_sex_values),
    )
}

# Last expression of the cell: display the label -> result mapping.
constraints_results

  and should_run_async(code)



{'Unique Records': True,
 'No missings': True,
 'Year Range': True,
 'Quarter Values': True,
 'Activity & Inflation Rate': True,
 'House Price Index Positive': True,
 'Valid Provinces': True,
 'Valid Sex Values': True}

In [65]:
# Data repairing functions in case of corruptions
def fill_missings(df):
    """Return a copy of ``df`` with missing numeric values replaced by the column median.

    Only numeric columns have a median, so missing values in non-numeric
    columns (e.g. text) are left untouched. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain missing values.

    Returns
    -------
    pandas.DataFrame
        New frame with numeric gaps filled.
    """
    # `fillna(..., inplace=True)` returns None, which made this function hand
    # None back to its caller; return the filled copy instead.
    # `numeric_only=True` keeps `median` from raising on string columns.
    return df.fillna(df.median(numeric_only=True))

def remove_duplicates(df):
    """Return a copy of ``df`` without fully duplicated rows.

    The first occurrence of each row is kept and the original index labels of
    the surviving rows are preserved. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain duplicated rows.

    Returns
    -------
    pandas.DataFrame
        New frame with duplicates dropped.
    """
    # `drop_duplicates(inplace=True)` returns None, which made this function
    # hand None back to its caller; return the de-duplicated copy instead.
    return df.drop_duplicates()

def repair_data(df, output_path=None):
    """Repair ``df`` where needed, save it as CSV and return the result.

    Missing values are filled when ``check_missings`` fails, and duplicated
    rows are dropped when ``check_unique_records`` fails. The resulting frame
    is always written to disk, even when no repair was necessary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to repair.
    output_path : str or path-like, optional
        Destination CSV. Defaults to the notebook-level ``file_path``, i.e.
        the file the data was loaded from is overwritten.

    Returns
    -------
    pandas.DataFrame
        The repaired frame (``df`` itself when nothing needed repairing).
    """
    if not check_missings(df):
        df = fill_missings(df)
    if not check_unique_records(df):
        df = remove_duplicates(df)

    # Save the frame that was actually repaired. The previous version wrote
    # the notebook-level `data` variable, ignoring the `df` argument.
    target = file_path if output_path is None else output_path
    df.to_csv(target, index=False)
    return df

In [66]:
# Run the repair step on the loaded data; this also writes a CSV to file_path.
repair_data(data)