In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def load_data(data_path):
    try:
        df = pd.read_csv(data_path)
        if df.empty:
            print("Warning: The dataset is empty.")
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

def eda(df):
    try:
        # Display basic statistics
        print("Basic Statistics:\n", df.describe())

        # Check for missing values
        missing_values = df.isnull().sum()
        print("\nMissing Values:\n", missing_values)

        # Correlation heatmap (only works with numerical data)
        plt.figure(figsize=(12, 8))
        sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
        plt.title("Correlation Heatmap")
        plt.show()

        # Distribution of numerical columns
        num_cols = df.select_dtypes(include=['float64', 'int64']).columns
        for col in num_cols:
            plt.figure(figsize=(8, 5))
            sns.histplot(df[col], kde=True)
            plt.title(f"Distribution of {col}")
            plt.show()

    except Exception as e:
        print(f"Error in EDA: {e}")
        
def preprocess_data(df):
    try:
        if df is None or df.empty:
            print("Error: The DataFrame is empty or None.")
            return None
        
        # Separate columns into numerical and categorical
        numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
        categorical_cols = df.select_dtypes(include=['object']).columns
        
        # Handling missing values for numerical columns
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

        # Handling missing values for categorical columns (if any)
        df[categorical_cols] = df[categorical_cols].fillna('Unknown')

        # Apply one-hot encoding to categorical columns
        if len(categorical_cols) > 0:
            print(f"Encoding categorical columns: {categorical_cols}")
            df_encoded = pd.get_dummies(df, drop_first=True)  # Avoiding dummy variable trap
        else:
            df_encoded = df
        
        # Ensure that there is data after preprocessing
        if df_encoded.empty:
            print("Warning: After preprocessing, the dataset is empty.")
        
        return df_encoded
    except Exception as e:
        print(f"Error in preprocessing: {e}")
        return None

def save_preprocessed_data(df, output_path):
    try:
        if df is not None and not df.empty:
            df.to_csv(output_path, index=False)
            print(f"Preprocessed data saved to {output_path}")
        else:
            print("Error: Cannot save an empty or None DataFrame.")
    except Exception as e:
        print(f"Error saving preprocessed data: {e}")

if __name__ == "__main__":
    data_path = '../data/journal_entries.csv'
    output_path = '../data/journal_entries_preprocessed_eda.csv'
    df = load_data(data_path)
    
    if df is not None and not df.empty:
        print("Data Loaded Successfully")
        eda(df)
        df_encoded = preprocess_data(df)
        
        if df_encoded is not None:
            print(f"Preprocessed Data:\n{df_encoded.head()}")
            save_preprocessed_data(df_encoded, output_path)  # Save the preprocessed data
        else:
            print("Preprocessing failed. Please check the error messages above.")
    else:
        print("Data loading failed or the dataset is empty.")


Data Loaded Successfully
Basic Statistics:
              amount      user_id
count   1000.000000  1000.000000
mean    1196.307090  1049.452000
std     1642.072102    29.394825
min      -41.430000  1000.000000
25%      802.030000  1024.000000
50%     1012.930000  1049.000000
75%     1216.140000  1074.000000
max    20000.000000  1099.000000

Missing Values:
 amount          0
vendor          0
account         0
posting_date    0
user_id         0
description     0
dtype: int64
Error in EDA: could not convert string to float: 'Vendor_11'
Encoding categorical columns: Index(['vendor', 'account', 'posting_date', 'description'], dtype='object')
Preprocessed Data:
    amount  user_id  vendor_Vendor_10  vendor_Vendor_11  vendor_Vendor_12  \
0  1149.01     1082             False              True             False   
1   958.52     1099             False              True             False   
2  1117.77     1001             False             False             False   
3   721.24     1021       

<Figure size 1200x800 with 0 Axes>