In [3]:
import pandas as pd
import os

# Read 'winequality-red.csv' (path is relative to the current working directory)
# into a DataFrame bound to `wine`.
wine = pd.read_csv('winequality-red.csv')


In [6]:
# Bare last-line expression: the notebook renders the DataFrame as the cell output.
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [5]:
# Class to encapsulate data cleaning functionalities

class DataCleaning:
    """Load a CSV file into a DataFrame and fill missing values in its columns.

    The working frame is stored on ``self.dataframe``. It is ``None`` when no
    ``file_path`` was given to the constructor.
    """

    def __init__(self, file_path=None):
        """Optionally load ``file_path`` as CSV.

        Args:
            file_path: Path to a CSV file. When falsy (``None`` or ``''``),
                nothing is loaded and ``self.dataframe`` is set to ``None``.

        Raises:
            FileNotFoundError, ValueError: propagated from ``load_dataframe``.
        """
        # Load a dataframe only if a file path is provided
        self.dataframe = self.load_dataframe(file_path) if file_path else None

    def load_dataframe(self, file_path):
        """Read ``file_path`` as a CSV file and return the resulting DataFrame.

        Args:
            file_path: Path to the CSV file to read.

        Returns:
            The parsed, non-empty ``pandas.DataFrame``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file has no data rows, is completely empty, or
                cannot be parsed as CSV.

        Any other error raised by ``pd.read_csv`` (e.g. permission or decoding
        errors) propagates with its original type.
        """
        # Check that the path exists before handing it to pandas
        if not os.path.exists(file_path):
            raise FileNotFoundError(f'File {file_path} not found')

        try:
            dataframe = pd.read_csv(file_path)
        except pd.errors.EmptyDataError as e:
            # A zero-byte / column-less file: report it the same way as a
            # header-only file instead of leaking a generic exception.
            raise ValueError(f'File {file_path} is empty or not in the correct format') from e
        except pd.errors.ParserError as e:
            # Handle CSV parsing errors, keeping the original cause chained
            raise ValueError(f'Failed to parse {file_path} as a CSV file: {e}') from e

        # Raised outside the try block so the ValueError keeps its type
        if dataframe.empty:
            raise ValueError(f'File {file_path} is empty or not in the correct format')
        return dataframe

    def fill_missing_values(self, column_name, method='mode'):
        """Fill missing values of one column of ``self.dataframe``.

        Args:
            column_name: Name of the column to impute.
            method: One of ``'mode'``, ``'mean'``, ``'median'`` or
                ``'interpolate'``. Defaults to ``'mode'``.

        Returns:
            ``self.dataframe`` with ``column_name`` replaced by the filled
            column. If the column has no missing values, a message is printed
            and the dataframe is returned unchanged.

        Raises:
            ValueError: If no dataframe is loaded, the column does not exist,
                ``method`` is not supported, or ``'mode'`` is requested for a
                column that has no non-missing values.
        """
        # Check if a dataframe is loaded or raise error
        if self.dataframe is None:
            raise ValueError('Dataframe is not loaded. Provide a file_path to load dataframe.')

        # Check if the specified column exists in the dataframe or raise error
        if column_name not in self.dataframe.columns:
            raise ValueError(f'Column name {column_name} not found in dataframe')

        column = self.dataframe[column_name]

        # Nothing to do when the column is already complete
        if column.isnull().sum() == 0:
            print(f'No missing values found in column {column_name}. Returning original dataframe.')
            return self.dataframe

        if method == 'interpolate':
            filled = column.interpolate()
        elif method == 'mode':
            modes = column.mode()
            if modes.empty:
                # mode() drops NaN, so an all-missing column yields no candidates
                raise ValueError(
                    f'Column {column_name} has no non-missing values to compute a mode from'
                )
            filled = column.fillna(modes.iloc[0])
        elif method == 'mean':
            filled = column.fillna(column.mean())
        elif method == 'median':
            filled = column.fillna(column.median())
        else:
            # Handle invalid method argument
            raise ValueError(f'Invalid method: {method}. Supported methods are "mode", "mean", "median", "interpolate".')

        # Assign the filled column back explicitly. Calling fillna/interpolate
        # with inplace=True on `self.dataframe[column_name]` is a chained
        # in-place operation that does not update the parent frame under
        # pandas Copy-on-Write.
        self.dataframe[column_name] = filled
        return self.dataframe

In [9]:
# Usage: load the CSV, fill missing values in one column, and print the result.
if __name__ == "__main__":
    file_path = 'winequality-red.csv'  # Replace with the path to your CSV file

    try:
        # Construct the cleaner inside the try block so that load failures
        # (missing, empty or unparseable file) are reported through the same
        # 'Error: ...' message as imputation failures instead of a raw traceback.
        imputer = DataCleaning(file_path)
        imputed_df = imputer.fill_missing_values('density')
        print(imputed_df)
    except Exception as e:
        print(f'Error: {e}')

No missing values found in column density. Returning original dataframe.
      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0               7.4             0.700         0.00             1.9      0.076   
1               7.8             0.880         0.00             2.6      0.098   
2               7.8             0.760         0.04             2.3      0.092   
3              11.2             0.280         0.56             1.9      0.075   
4               7.4             0.700         0.00             1.9      0.076   
...             ...               ...          ...             ...        ...   
1594            6.2             0.600         0.08             2.0      0.090   
1595            5.9             0.550         0.10             2.2      0.062   
1596            6.3             0.510         0.13             2.3      0.076   
1597            5.9             0.645         0.12             2.0      0.075   
1598            6.0             0.31