In [22]:
from datetime import datetime
import re
import pandas as pd
import numpy as np
from functools import reduce
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt


class AdvancedDataQualityEvaluator:
    """Score a DataFrame on completeness, uniqueness, type consistency and readability.

    Completeness, uniqueness and consistency are computed per column (each a
    value in [0, 1]) and combined into one number with a weighted average over
    the columns. Readability is computed per text column and averaged
    unweighted. ``calculate_overall_quality`` averages the four metrics and
    scales the result to a percentage.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to evaluate. It is stored by reference, not copied.
    weights : mapping, optional
        Column name -> weight. Weights are looked up by column name, so the
        mapping's key order does not matter. Columns absent from the mapping
        use their default weight (1.5 for text columns, 1 otherwise). When the
        mapping is omitted or empty, every column uses its default weight.
    preprocessing_complexity : optional
        Stored on the instance (1 when omitted or falsy). None of the methods
        of this class read it.
    """

    def __init__(self, dataframe, weights=None, preprocessing_complexity=None):
        self.df = dataframe
        defaults = self.default_weights()
        # Overlay the caller's weights on the defaults: a partial mapping is
        # valid, and the resulting dict keeps the DataFrame's column order.
        self.weights = {**defaults, **weights} if weights else defaults
        self.preprocessing_complexity = preprocessing_complexity or 1

    def _is_text_column(self, column):
        """Return True for object columns and for pandas' dedicated string dtype."""
        dtype = self.df[column].dtype
        return pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)

    def _default_weight(self, column):
        """Return the default weight of a column: 1.5 for text, 1 otherwise."""
        return 1.5 if self._is_text_column(column) else 1

    def _require_data(self):
        """Raise ValueError when there is nothing to score (no rows or no columns)."""
        n_rows, n_cols = self.df.shape
        if n_rows == 0 or n_cols == 0:
            raise ValueError(
                "Cannot score an empty DataFrame: at least one row and one column are required."
            )

    def _weighted_column_average(self, column_scores):
        """Average per-column scores (given in column order) using the column weights."""
        # Look weights up by column name rather than relying on dict order.
        column_weights = [
            self.weights.get(col, self._default_weight(col)) for col in self.df.columns
        ]
        return np.average(column_scores, weights=column_weights)

    def default_weights(self):
        """Return ``{column: weight}`` with 1.5 for text columns and 1 for all others."""
        return {col: self._default_weight(col) for col in self.df.columns}

    def completeness(self):
        """Return the weighted average, over columns, of the fraction of non-null values.

        Raises
        ------
        ValueError
            If the DataFrame has no rows or no columns.
        """
        self._require_data()
        return self._weighted_column_average(self.df.notnull().mean().to_numpy())

    def uniqueness(self):
        """Return the weighted average, over columns, of distinct values / row count.

        Missing values are not counted as distinct values.

        Raises
        ------
        ValueError
            If the DataFrame has no rows or no columns.
        """
        self._require_data()
        distinct_ratio = self.df.nunique() / len(self.df)
        return self._weighted_column_average(distinct_ratio.to_numpy())

    def consistency(self):
        """Return the weighted average, over columns, of the share of the dominant Python type.

        For each column the Python type of every cell is taken; the column's
        score is the fraction of cells whose type is the most frequent one.

        Raises
        ------
        ValueError
            If the DataFrame has no rows or no columns.
        """
        self._require_data()
        column_scores = []
        for col in self.df.columns:
            value_types = self.df[col].map(type)
            # value_counts() is sorted by frequency, so the first entry is the
            # count of the most common type; ties give the same fraction.
            dominant_count = value_types.value_counts().iloc[0]
            column_scores.append(dominant_count / len(value_types))
        return self._weighted_column_average(column_scores)

    def readability(self, column):
        """Return the mean TF-IDF self-similarity of the rows of a text column.

        Each row's cosine similarity with itself is 1 when the row produced at
        least one TF-IDF token and 0 when it produced none, so the result is
        (up to floating-point rounding) the fraction of rows that contain at
        least one token.

        Returns 1 when ``column`` does not exist or is not a text column, and
        0.0 when no row of the column yields any token.
        """
        if column not in self.df.columns or not self._is_text_column(column):
            return 1

        documents = self.df[column].astype(str)
        try:
            tfidf_matrix = TfidfVectorizer().fit_transform(documents)
        except ValueError as exc:
            # scikit-learn refuses to build an empty vocabulary (e.g. a column
            # of one-character codes). No row has a token, so the score is 0.
            if "empty vocabulary" in str(exc):
                return 0.0
            raise

        # Only the diagonal of the similarity matrix is needed. The TF-IDF
        # rows are already L2-normalised, so the diagonal equals each row's
        # squared norm. This avoids materialising the dense n x n matrix.
        squared_norms = tfidf_matrix.multiply(tfidf_matrix).sum(axis=1)
        return float(np.mean(np.asarray(squared_norms).ravel()))

    def calculate_overall_quality(self):
        """Return the unweighted mean of the four metrics as a percentage (0-100).

        Readability is averaged over the text columns and counts as 1 when the
        DataFrame has no text column.

        Raises
        ------
        ValueError
            If the DataFrame has no rows or no columns.
        """
        readability_scores = [
            self.readability(col) for col in self.df.columns if self._is_text_column(col)
        ]
        average_readability = np.mean(readability_scores) if readability_scores else 1

        metrics = [
            self.completeness(),
            self.uniqueness(),
            self.consistency(),
            average_readability,
        ]
        return sum(metrics) / len(metrics) * 100

    def quality_color_indicator(self, quality_score):
        """Map a percentage score to "Green" (>= 80), "Yellow" (>= 60), "Orange" (>= 40) or "Red"."""
        # Thresholds are tested from highest to lowest, so each branch only
        # needs its lower bound. NaN fails every comparison and yields "Red".
        if quality_score >= 80:
            return "Green"
        if quality_score >= 60:
            return "Yellow"
        if quality_score >= 40:
            return "Orange"
        return "Red"


In [5]:
# NOTE(review): pandas is also imported in the class-definition cell; this cell
# carries the lower execution count (In [5] vs In [22]), so the saved notebook
# was not executed top to bottom.
import pandas as pd

# Read 'animes.csv'; the path is relative, so it is resolved against the
# kernel's current working directory.
df = pd.read_csv('animes.csv')
# `df2` is `df` with every row that contains at least one missing value removed.
# NOTE(review): the evaluation cell visible in this notebook is run on `df`,
# not `df2` — confirm whether `df2` is still needed.
df2=df.dropna()

In [23]:
# Score the unfiltered table (`df`, missing values included) with the default
# column weights, then report the percentage and its colour band.
# NOTE(review): the output saved under this cell does not match this code — it
# shows a list of four metric values and an unrounded
# "Overall Data Quality Score: ..." line, neither of which these prints produce.
# It looks like it came from an earlier version; re-run the cell to refresh it.
evaluator = AdvancedDataQualityEvaluator(df)
overall_quality = evaluator.calculate_overall_quality()
print(f"Overall Data Quality: {overall_quality:.2f}%")
print("Quality Color Indicator:", evaluator.quality_color_indicator(overall_quality))

[0.9784941225208432, 0.6009752645297153, 0.9940189529283827, 0.9990765194276147]
Overall Data Quality Score: 89.3141214851639
Quality Color Indicator: Green
