In [1]:
import pandas as pd
import numpy as np

# Sample dataset
data = {
    'Name': ['Alice', 'Bob', np.nan, 'Diane'],
    'Age': [25, 30, 35, np.nan],
    'Email': ['alice@example.com', 'bob@example.com', 'charlie@example', 'diane@example.com']
}

In [2]:
df = pd.DataFrame(data)

# Completeness Metric
completeness_score = df.notna().sum().sum() / df.size

In [3]:
# Consistency Metric (e.g., valid email format)
valid_email_pattern = df['Email'].str.contains(r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$')
consistency_score = valid_email_pattern.mean()

# Average the scores (assuming equal weights)
data_quality_score = (completeness_score + consistency_score) / 2

# Convert to percentage
data_quality_percentage = data_quality_score * 100

In [4]:

print(f"Data Quality Score: {data_quality_percentage:.2f}%")

Data Quality Score: 79.17%


In [8]:
df2 = df.dropna()

In [9]:
# Completeness Metric
completeness_score = df2.notna().sum().sum() / df2.size

# Consistency Metric (e.g., valid email format)
valid_email_pattern = df2['Email'].str.contains(r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$')
consistency_score = valid_email_pattern.mean()

# Average the scores (assuming equal weights)
data_quality_score = (completeness_score + consistency_score) / 2

# Convert to percentage
data_quality_percentage = data_quality_score * 100

print(f"Data Quality Score: {data_quality_percentage:.2f}%")

Data Quality Score: 100.00%


In [63]:
class DataQualityEvaluator:
    def __init__(self, dataframe, weights=None):
        self.df = dataframe
        self.weights = weights or {col: 1 for col in dataframe.columns}

    def completeness(self):
        return self.df.notnull().sum().sum() / self.df.size

    def weighted_completeness(self):
        weighted_scores = list(map(lambda col: (self.df[col].notnull().sum() / len(self.df[col])) * self.weights[col], self.df.columns))
        total_weight = sum(self.weights.values())
        return sum(weighted_scores) / total_weight

    def timeliness(self, column):
        if self.df[column].dtype == 'datetime64[ns]':
            now = datetime.now()
            return self.df[column].apply(lambda x: (now - x).days).mean()
        else:
            return None

    def uniqueness(self):
        unique_scores = list(map(lambda col: self.df[col].nunique() / len(self.df[col]) if self.df[col].dtype in ['int64', 'float64', 'object', 'bool'] else 1, self.df.columns))
        return sum(unique_scores) / len(unique_scores)

    def range_validity(self, column, min_val, max_val):
        if self.df[column].dtype in ['int64', 'float64']:
            return self.df[column].apply(lambda x: min_val <= x <= max_val).mean()
        else:
            return None

    def format_conformity(self, column, regex_pattern):
        if self.df[column].dtype == 'object':
            return self.df[column].str.match(regex_pattern).mean()
        else:
            return None

    def calculate_overall_quality(self):
        completeness_score = self.weighted_completeness()
        timeliness_scores = list(filter(None, [self.timeliness(col) for col in self.df.columns if self.df[col].dtype == 'datetime64[ns]']))
        uniqueness_score = self.uniqueness()
        range_validity_scores = list(filter(None, [self.range_validity(col, 0, 100) for col in self.df.columns if self.df[col].dtype in ['int64', 'float64']]))
        format_conformity_scores = list(filter(None, [self.format_conformity(col, r'^[A-Za-z]+$') for col in self.df.columns if self.df[col].dtype == 'object']))  # Example for string format

        average_timeliness = sum(timeliness_scores) / len(timeliness_scores) if timeliness_scores else 1
        average_range_validity = sum(range_validity_scores) / len(range_validity_scores) if range_validity_scores else 1
        average_format_conformity = sum(format_conformity_scores) / len(format_conformity_scores) if format_conformity_scores else 1

        overall_score = (completeness_score + average_timeliness + uniqueness_score + average_range_validity + average_format_conformity) / 5
        return overall_score

    def quality_color_indicator(self, quality_score):
        if quality_score >= 0.7:
            return "Green"
        elif 0.5 <= quality_score < 0.7:
            return "Orange"
        else:
            return "Red"

In [61]:
import pandas as pd

df = pd.read_csv('animes.csv')
df2=df.dropna()

In [67]:
quality_evaluator = DataQualityEvaluator(df2)
quality_score = quality_evaluator.calculate_overall_quality()
color_indicator = quality_evaluator.quality_color_indicator(quality_score)
quality_score = 100*quality_score

print(f"Overall Data Quality Score: {quality_score:.2f}%")
print(f"Data Quality Color Indicator: {color_indicator}")

Overall Data Quality Score: 61.17%
Data Quality Color Indicator: Orange
