In [1]:
import pandas as pd
import numpy as np

In [4]:
class Read_File:
    """Load a tabular file with pandas and run basic preparation steps.

    The class keeps no state; every method takes its input explicitly and
    returns a new object, so a single instance can be reused freely.
    """

    #Read file
    def read_file(self, file_format, file_path):
        """Read ``file_path`` into a DataFrame.

        Args:
            file_format: One of ``'csv'``, ``'excel'`` or ``'json'``.
            file_path: Location of the file, passed straight to pandas.

        Returns:
            The DataFrame produced by the matching pandas reader.

        Raises:
            ValueError: If ``file_format`` is not one of the supported names.
        """
        if file_format == 'csv':
            return pd.read_csv(file_path)
        if file_format == 'excel':
            return pd.read_excel(file_path, engine='openpyxl')
        if file_format == 'json':
            return pd.read_json(file_path)
        # Fail loudly here instead of falling through to an unbound local.
        raise ValueError(
            f"Invalid file format: {file_format!r} "
            "(expected 'csv', 'excel' or 'json')"
        )

    #Data Summary
    def data_summary(self, df):
        """Build a dictionary of descriptive statistics for ``df``.

        Returns:
            A dict with three entries:

            * ``'Basic Summary'`` - the result of ``df.describe()``.
            * ``'Most Frequent Values'`` - the first row of ``df.mode()``.
            * ``'Average Values'`` - the mean of the numeric columns.
        """
        summary = {}
        summary['Basic Summary'] = df.describe()
        summary['Most Frequent Values'] = df.mode().iloc[0]
        summary['Average Values'] = df.mean(numeric_only=True)

        return summary

    #Handling missing values
    def handle_missing_values(self, data, strategy='drop'):
        """Return a copy of ``data`` with missing values handled.

        Args:
            data: The DataFrame to clean; it is not modified.
            strategy: How to treat missing values:

                * ``'drop'`` - remove every row containing a missing value.
                * ``'impute'`` - fill numeric columns with their mean.
                * ``'fill_zero'`` - fill every missing value with ``0``.
                * ``'fill_median'`` - fill numeric columns with their median.

        Returns:
            The cleaned DataFrame. With ``'impute'`` and ``'fill_median'``
            only numeric columns are filled, so missing values in other
            columns remain.

        Raises:
            ValueError: If ``strategy`` is not one of the names above.
        """
        if strategy == 'drop':
            cleaned_data = data.dropna()
        elif strategy == 'impute':
            # numeric_only keeps text columns from breaking the mean.
            cleaned_data = data.fillna(data.mean(numeric_only=True))
        elif strategy == 'fill_zero':
            cleaned_data = data.fillna(0)
        elif strategy == 'fill_median':
            cleaned_data = data.fillna(data.median(numeric_only=True))
        else:
            raise ValueError(
                f"Invalid strategy: {strategy!r} (expected 'drop', "
                "'impute', 'fill_zero' or 'fill_median')"
            )

        # Check missing values
        missing_values = cleaned_data.isnull().sum()
        print("Number of missing values for each column:\n", missing_values)
        return cleaned_data

    #Categorical Data Encoding
    def encode_categorical_data(self, data):
        """Return ``data`` encoded with ``pandas.get_dummies``."""
        encoded_data = pd.get_dummies(data)
        return encoded_data

def main():
    """Interactively load a file, then print its summary, cleaned and encoded views."""
    divider = '_____________________________________________________________________'
    reader = Read_File()

    # Prompts are asked in this order: path first, then format.
    path = input("Enter your file path: ")
    fmt = input("Enter file format (csv, excel, json): ")
    frame = reader.read_file(fmt, path)
    print("Data:\n", frame.head())
    print(divider)

    print_summary = reader.data_summary(frame)
    print("Data Summary:\n")
    for title in print_summary:
        print(f"{title}:")
        print(print_summary[title])
        print()
    print(divider)

    # Show the effect of each fill strategy on the same original frame.
    for strategy_name in ('fill_zero', 'fill_median'):
        cleaned = reader.handle_missing_values(frame, strategy=strategy_name)
        print(f"Cleaned Data ({strategy_name} strategy):\n", cleaned.head())

    print(divider)
    dummies = reader.encode_categorical_data(frame)
    print("Encoded Data:\n", dummies.head())

# Run the interactive demo only when this file is executed as a script.
if __name__ == "__main__":
    main()




Enter your file path: /content/insurance.csv
Enter file format (csv, excel, json): csv
Data:
    age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
_____________________________________________________________________
Data Summary:

Basic Summary:
               age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      