In [3]:
import pandas as pd

class DataPreprocessor:
    def __init__(self, data=None):
        self.data = data
    
    def load_data(self):
        file_path = input("Enter the path to the data file: ")
        file_format = file_path.split(".")[-1]  # Extract file format from the file path

        if file_format == 'csv':
            self.data = pd.read_csv(file_path)
        elif file_format == 'xlsx':
            self.data = pd.read_excel(file_path)
        elif file_format == 'json':
            self.data = pd.read_json(file_path)
        else:
            raise ValueError("Unsupported file format. Please provide a CSV, Excel, or JSON file.")

    def summary_statistics(self):
        return self.data.describe()
    
    def data_type_distribution(self):
        return self.data.dtypes.value_counts()
    
    def unique_value_counts(self):
        return {col: self.data[col].nunique() for col in self.data.columns}
    
    def handle_missing_values(self, method='imputation'):
        if method == 'imputation':
            return self.data.fillna(self.data.mean())  # Example: Impute with mean
        elif method == 'removal':
            return self.data.dropna()
        elif method == 'flagging':
            return self.data.fillna('missing', inplace=True)
        else:
            raise ValueError("Unsupported missing value handling method. Please choose from 'imputation', 'removal', or 'flagging'.")
    
    def encode_categorical_variables(self, method='one-hot'):
        if method == 'one-hot':
            return pd.get_dummies(self.data, columns=self.data.select_dtypes(include='object').columns)
        elif method == 'label':
            # Implement label encoding
            pass
        elif method == 'target':
            # Implement target encoding
            pass
        else:
            raise ValueError("Unsupported categorical variable encoding method. Please choose from 'one-hot', 'label', or 'target'.")
    
    def save_processed_data(self, file_path):
        self.data.to_csv(file_path, index=False)

# Create an instance of DataPreprocessor
preprocessor = DataPreprocessor()

# Load the dataset
preprocessor.load_data()

# Display summary statistics
summary_stats = preprocessor.summary_statistics()
print("Summary Statistics:")
print(summary_stats)
print()

# Display data type distribution
data_type_dist = preprocessor.data_type_distribution()
print("Data Type Distribution:")
print(data_type_dist)
print()

# Display unique value counts
unique_counts = preprocessor.unique_value_counts()
print("Unique Value Counts:")
for col, count in unique_counts.items():
    print(f"{col}: {count}")
print()

# Handle missing values by imputation
clean_data = preprocessor.handle_missing_values(method='imputation')
print("Cleaned Data after Imputation:")
print(clean_data.head())
print()

# Encode categorical variables using one-hot encoding
encoded_data = preprocessor.encode_categorical_variables(method='one-hot')
print("Encoded Data after One-Hot Encoding:")
print(encoded_data.head())
print()

# Save the processed data
processed_file_path = input("Enter the path to save the processed data (include file name with extension): ")
preprocessor.save_processed_data(processed_file_path)
print("Processed data saved successfully.")


Enter the path to the data file: world_population.csv


  return self.data.fillna(self.data.mean())  # Example: Impute with mean


Summary Statistics:
             Rank  2022 Population  2020 Population  2015 Population  \
count  234.000000     2.340000e+02     2.340000e+02     2.340000e+02   
mean   117.500000     3.407441e+07     3.350107e+07     3.172996e+07   
std     67.694165     1.367664e+08     1.355899e+08     1.304050e+08   
min      1.000000     5.100000e+02     5.200000e+02     5.640000e+02   
25%     59.250000     4.197385e+05     4.152845e+05     4.046760e+05   
50%    117.500000     5.559944e+06     5.493074e+06     5.307400e+06   
75%    175.750000     2.247650e+07     2.144798e+07     1.973085e+07   
max    234.000000     1.425887e+09     1.424930e+09     1.393715e+09   

       2010 Population  2000 Population  1990 Population  1980 Population  \
count     2.340000e+02     2.340000e+02     2.340000e+02     2.340000e+02   
mean      2.984524e+07     2.626947e+07     2.271022e+07     1.898462e+07   
std       1.242185e+08     1.116982e+08     9.783217e+07     8.178519e+07   
min       5.960000e+02 