In [1]:
import pandas as pd

In [7]:
# Step 1: Load the dataset
# Replace 'cattle_growth_data.csv' with the actual file name if it differs
data = pd.read_csv('cattle_growth_data.csv')

In [9]:
# Step 2: Define the data quality check functions

def check_missing_values(df):
    """Print how many null entries each column of ``df`` contains.

    Args:
        df: The DataFrame to inspect.

    Returns:
        None. The per-column null counts are written to stdout.
    """
    null_counts = df.isna().sum()  # isna is the alias of isnull
    print("Missing Values:\n", null_counts)


def check_duplicates(df):
    """Print the number of rows in ``df`` that repeat an earlier row.

    Args:
        df: The DataFrame to inspect.

    Returns:
        None. The duplicate count is written to stdout.
    """
    repeated_rows = df.duplicated()  # True for every repeat after the first occurrence
    n_repeated = repeated_rows.sum()
    print(f"Number of duplicate rows: {n_repeated}")

In [11]:
def check_negative_values(df, columns):
    """Print, for each listed column, how many entries are below zero.

    Args:
        df: The DataFrame to inspect.
        columns: Iterable of column names in ``df`` to check.

    Returns:
        None. One line per column is written to stdout.
    """
    for name in columns:
        below_zero = df[name].lt(0)
        n_below_zero = below_zero.sum()
        print(f"Negative values in {name}: {n_below_zero}")


def check_value_ranges(df, age_range=(0, 240), weight_range=(0, 200)):
    """Report how many rows fall outside the accepted age and weight ranges.

    Both ranges are inclusive at each end. Values that are missing (NaN)
    do not satisfy ``Series.between`` and are therefore counted as invalid.

    Args:
        df: DataFrame containing 'Age_Months' and 'Weight_kg' columns.
        age_range: ``(low, high)`` bounds for 'Age_Months'.
            Defaults to (0, 240).
        weight_range: ``(low, high)`` bounds for 'Weight_kg'.
            Defaults to (0, 200).
            NOTE(review): the recorded run flags 643 rows with the default
            weight bound -- confirm 200 kg is the intended upper limit or
            pass a wider range here.

    Returns:
        None. The two counts are written to stdout.
    """
    age_low, age_high = age_range
    weight_low, weight_high = weight_range

    # Count straight from the boolean masks; no need to build filtered frames.
    invalid_age = ~df['Age_Months'].between(age_low, age_high)
    invalid_weight = ~df['Weight_kg'].between(weight_low, weight_high)

    print(f"Rows with invalid age: {int(invalid_age.sum())}")
    print(f"Rows with invalid weight: {int(invalid_weight.sum())}")

In [13]:
def check_outliers(df, columns):
    """Print, per column, the number of values outside the 1.5 * IQR fences.

    A value counts as an outlier when it lies strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``.

    Args:
        df: The DataFrame to inspect.
        columns: Iterable of column names in ``df`` to check.

    Returns:
        None. One line per column is written to stdout.
    """
    for name in columns:
        series = df[name]
        first_quartile = series.quantile(0.25)
        third_quartile = series.quantile(0.75)
        spread = third_quartile - first_quartile

        fence_low = first_quartile - 1.5 * spread
        fence_high = third_quartile + 1.5 * spread

        # Explicit comparisons (not ~between) so NaN is never flagged.
        is_outlier = (series < fence_low) | (series > fence_high)
        flagged_rows = df[is_outlier]
        print(f"Outliers in {name}: {len(flagged_rows)}")

In [15]:
# Step 3: Run every quality check against the loaded dataset
numeric_columns = ['Age_Months', 'Weight_kg']  # columns shared by the numeric checks

print("Starting data quality checks...\n")

check_missing_values(data)
check_duplicates(data)
check_negative_values(data, numeric_columns)
check_value_ranges(data)
check_outliers(data, numeric_columns)

print("\nData quality checks completed.")

Starting data quality checks...

Missing Values:
 ID             0
Age_Months     0
breed_group    0
Weight_kg      0
dtype: int64
Number of duplicate rows: 0
Negative values in Age_Months: 0
Negative values in Weight_kg: 0
Rows with invalid age: 0
Rows with invalid weight: 643
Outliers in Age_Months: 3
Outliers in Weight_kg: 11

Data quality checks completed.
