In [39]:
import pandas as pd
import numpy as np

In [50]:
class DataPrepKit:
    """Small toolkit for loading, summarising, cleaning and encoding tabular data.

    The instance keeps no state: every method receives a pandas object and
    returns a freshly computed result.
    """

    def __init__(self):
        pass

    # Read file
    def read_file(self, file_format, file_path):
        """Load a file into a DataFrame using the reader for ``file_format``.

        Args:
            file_format: One of 'csv', 'excel' or 'json'. Surrounding
                whitespace and letter case are ignored so that values typed
                at a prompt (e.g. ' CSV') are accepted.
            file_path: Location of the file, passed straight to the pandas
                reader.

        Returns:
            The DataFrame produced by the matching pandas reader.

        Raises:
            ValueError: If ``file_format`` is not one of the supported names.
        """
        normalized_format = str(file_format).strip().lower()

        if normalized_format == 'csv':
            return pd.read_csv(file_path)
        if normalized_format == 'excel':
            return pd.read_excel(file_path, engine='openpyxl')
        if normalized_format == 'json':
            return pd.read_json(file_path)

        # Fail loudly instead of printing and then crashing on an unbound
        # local variable, which hid the real cause from the caller.
        raise ValueError(
            "Invalid file format: {!r} (expected 'csv', 'excel' or 'json')".format(file_format)
        )

    # Data Summary
    def data_summary(self, df):
        """Build a dictionary of summary statistics for ``df``.

        Args:
            df: DataFrame to summarise.

        Returns:
            A dict with three entries:
            'Basic Summary' (``df.describe()``),
            'Most Frequent Values' (first row of ``df.mode()``; all-NaN when
            the frame has no rows and therefore no mode), and
            'Average Values' (mean of the numeric columns).
        """
        summary = {}
        summary['Basic Summary'] = df.describe()

        modes = df.mode()
        if len(modes) > 0:
            summary['Most Frequent Values'] = modes.iloc[0]
        else:
            # A frame without rows has no mode; .iloc[0] would raise IndexError.
            summary['Most Frequent Values'] = pd.Series(np.nan, index=df.columns, dtype=object)

        summary['Average Values'] = df.mean(numeric_only=True)

        return summary

    # Handling missing values
    def handle_missing_values(self, data, strategy='drop'):
        """Return a copy of ``data`` with missing values handled.

        Also prints the number of missing values remaining in each column.

        Args:
            data: pandas object to clean.
            strategy: How to treat missing values:
                'drop'        - remove rows containing any missing value;
                'impute'      - fill numeric columns with their mean;
                'fill_zero'   - replace every missing value with 0;
                'fill_median' - fill numeric columns with their median.

        Returns:
            The cleaned data; the input is not modified.

        Raises:
            ValueError: If ``strategy`` is not one of the names above.
        """
        if strategy == 'drop':
            cleaned_data = data.dropna()
        elif strategy == 'impute':
            if isinstance(data, pd.DataFrame):
                # numeric_only=True keeps text columns from raising TypeError
                # (pandas >= 2.0) and mirrors the 'fill_median' branch.
                fill_values = data.mean(numeric_only=True)
            else:
                # Non-DataFrame input keeps the original plain mean() call.
                fill_values = data.mean()
            cleaned_data = data.fillna(fill_values)
        elif strategy == 'fill_zero':
            cleaned_data = data.fillna(0)
        elif strategy == 'fill_median':
            cleaned_data = data.fillna(data.median(numeric_only=True))
        else:
            # Previously this printed a message and then failed with
            # UnboundLocalError a few lines later.
            raise ValueError(
                "Invalid strategy: {!r} (expected 'drop', 'impute', "
                "'fill_zero' or 'fill_median')".format(strategy)
            )

        # Check missing values
        missing_values = cleaned_data.isnull().sum()
        print("Number of missing values for each column:\n", missing_values)
        return cleaned_data

    # Categorical Data Encoding
    def encode_categorical_data(self, data):
        """One-hot encode ``data`` with ``pd.get_dummies`` and return the result."""
        encoded_data = pd.get_dummies(data)
        return encoded_data

def main():
    """Interactive demo: load a file, then print its summary, cleaned and encoded views.

    Prompts for a file path and a format, reads the file with DataPrepKit,
    and prints the head of the data, each summary section, the result of the
    'fill_zero' and 'fill_median' missing-value strategies, and the one-hot
    encoded frame.
    """
    separator = '_____________________________________________________________________'
    toolkit = DataPrepKit()

    # Gather inputs (path first, then format) and load the data.
    path_text = input("Enter your file path: ")
    format_text = input("Enter file format (csv, excel, json): ")
    data = toolkit.read_file(format_text, path_text)
    print("Data:\n", data.head())
    print(separator)

    # Summary: one titled section per entry, each followed by a blank line.
    sections = toolkit.data_summary(data)
    print("Data Summary:\n")
    for title in sections:
        print(title + ':')
        print(sections[title])
        print()
    print(separator)

    # Missing-value handling, demonstrated with two fill strategies in turn.
    for strategy_name in ('fill_zero', 'fill_median'):
        cleaned = toolkit.handle_missing_values(data, strategy=strategy_name)
        print("Cleaned Data (" + strategy_name + " strategy):\n", cleaned.head())

    print(separator)

    # One-hot encoding of the originally loaded (not cleaned) data.
    encoded = toolkit.encode_categorical_data(data)
    print("Encoded Data:\n", encoded.head())

# Launch the interactive demo only when this code runs as the main module,
# so importing the definitions elsewhere does not trigger the input prompts.
if __name__ == "__main__":
    main()




Enter your file path: /content/sample_data/anscombe.json
Enter file format (csv, excel, json): json
Data:
   Series   X     Y
0      I  10  8.04
1      I   8  6.95
2      I  13  7.58
3      I   9  8.81
4      I  11  8.33
_____________________________________________________________________
Data Summary:

Basic Summary:
               X          Y
count  44.000000  44.000000
mean    9.000000   7.500455
std     3.198837   1.959244
min     4.000000   3.100000
25%     7.000000   6.117500
50%     8.000000   7.520000
75%    11.000000   8.747500
max    19.000000  12.740000

Most Frequent Values:
Series       I
X          8.0
Y         8.84
Name: 0, dtype: object

Average Values:
X    9.000000
Y    7.500455
dtype: float64

_____________________________________________________________________
Number of missing values for each column:
 Series    0
X         0
Y         0
dtype: int64
Cleaned Data (fill_zero strategy):
   Series   X     Y
0      I  10  8.04
1      I   8  6.95
2      I  13  7.58
3