In [1]:
pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting numpy>=1.22.4 (from pandas)
  Downloading numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl (5.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Installing collected packages: tzdata, numpy, pandas
Successfully installed numpy-2.0.2 pandas-2.2.3 tzdata-2024.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import os


BASE_PATH = '../../data/'
activity_log_path = os.path.join(BASE_PATH, 'ACTIVITY_LOG.csv')
user_log_path = os.path.join(BASE_PATH, 'USER_LOG.csv')


def load_data(file_path, encoding='utf-8'):
    try:
        data = pd.read_csv(file_path, encoding=encoding)
        print(f"Loaded data from {file_path}")
        return data
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
    except pd.errors.EmptyDataError:
        print(f"Error: The file {file_path} is empty.")
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
    return None

def merge_data(activity_log, user_log):
    if activity_log is not None and user_log is not None:
        merged = pd.merge(activity_log, user_log, on='User Full Name *Anonymized', how='inner')
        print("Data merged successfully.")
        return merged
    else:
        print("Error: One or more data frames are None, cannot proceed with merge.")
        return None

def clean_data(merged_data):
    if merged_data is not None:
        # Rename
        merged_data.rename(columns={'User Full Name *Anonymized': 'User_ID'}, inplace=True)
        # Remove
        merged_data = merged_data[~merged_data['Component'].isin(['System', 'Folder'])]
        # Drop rows with any missing values
        merged_data.dropna(inplace=True)
        print("Data cleaned successfully.")
        
        return merged_data
    else:
        print("Error: Merged data is None, cannot proceed with cleaning.")
        return None

def clean():
    activity_log = load_data(activity_log_path)
    user_log = load_data(user_log_path)
    merged_data = merge_data(activity_log, user_log)
    cleaned_data = clean_data(merged_data)
    
    # Display the row count instead of saving the data
    if cleaned_data is not None:
        row_count = len(cleaned_data)
        print(f"The cleaned dataset contains {row_count} rows.")
    else:
        print("Could not clean the data due to errors in processing.")


clean()


Loaded data from ../../data/ACTIVITY_LOG.csv
Loaded data from ../../data/USER_LOG.csv
Data merged successfully.
Data cleaned successfully.
The cleaned dataset contains 172322948 rows.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data.dropna(inplace=True)
