In [None]:
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import sys
import pandas as pd

# Trace hook: logs every call whose defining file path contains "pandas"
def trace_calls(frame, event, arg):
    """Print a line for each function call that originates in a pandas file.

    Meant to be installed with ``sys.settrace``.

    Args:
        frame: Frame being entered; only ``frame.f_code`` is inspected.
        event: Trace event name. Anything other than ``'call'`` is ignored.
        arg: Event argument supplied by the interpreter (unused).

    Returns:
        This function itself for ``'call'`` events, ``None`` for every
        other event.
    """
    if event == 'call':
        code_obj = frame.f_code
        source_path = code_obj.co_filename
        # Plain substring match: any path containing 'pandas' qualifies.
        if 'pandas' in source_path:
            print(f'Call: {code_obj.co_name} in {source_path}:{code_obj.co_firstlineno}')
        return trace_calls
    return None

# Sample data: a mix of timezone-aware and naive datetime strings
data = [
    '2021-01-01 00:00:00+00:00',  # UTC
    '2021-01-01 01:00:00-05:00',  # Eastern Standard Time (UTC-5)
    '2021-01-02 00:00:00+02:00',  # UTC+2
    '2021-01-03 12:00:00'         # Naive
]

s = pd.Series(data)

# Remember whatever trace hook is currently installed (e.g. a debugger's)
# so it can be put back afterwards instead of being discarded.
previous_trace = sys.gettrace()

# Set the trace function to capture pandas calls
sys.settrace(trace_calls)
try:
    # Call to_datetime, which will trigger many internal pandas calls
    result = pd.to_datetime(s, format='mixed')
finally:
    # Always undo the hook, even if parsing raises; otherwise every later
    # cell in this kernel would keep being traced and printing call logs.
    sys.settrace(previous_trace)

print("\nResult:")
print(result)

In [None]:
import pyspark as psk
from pyspark.sql import SparkSession
import pandas as pd

In [None]:
from nomad.daphmeIO import _naive_to_localized_str, _unix_to_localized_str

In [None]:
# Build (or reuse) the Spark session used by the cells below: local master
# "local[4]", 3g driver/executor memory, S3A credentials taken from the AWS
# default provider chain, and the hadoop-aws / AWS SDK jars pulled in as packages.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Tests")
    .config("spark.driver.memory", "3g")
    .config("spark.executor.memory", "3g")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    )
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262",
    )
    .getOrCreate()
)

In [None]:
# Column-name mapping handed to the loader as `traj_cols`
# (presumably canonical field name -> column name in the dataset; confirm).
traj_cols = dict(
    user_id='uid',
    latitude='latitude',
    longitude='longitude',
    timestamp='timestamp',
)

In [None]:
# S3 location of the parquet partition read by the cells below.
dataset_path = (
    "s3://catalog-csslab/garden-city-45k/device-level/"
    "admin_1_id=GC/admin_2_id=GC.CD/"
    "event_start_date_utc=2024-01-13/"
)

# Read in pandas, create zoned datetime string

In [None]:
# NOTE(review): `from_file` is not imported anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel. Import it in the imports
# cell from its defining module (presumably nomad.daphmeIO; confirm).
df = from_file(
    dataset_path,
    format="parquet",
    traj_cols=traj_cols,
)

In [None]:
# Artificially vary the offsets so the data contains more than one value:
# the first 50,000 rows get 0 and the last 50,000 rows get 3600
# (presumably seconds, i.e. +01:00; confirm). This mutates `df` in place.
df.loc[df.index[:50_000], 'timezone_offset'] = 0
df.loc[df.index[-50_000:], 'timezone_offset'] = 3600

In [None]:
%%time
# Store the helper's output in a new `local_datetime` column (judging by the
# helper's name, an offset-aware datetime string per row; confirm).
df['local_datetime'] = _unix_to_localized_str(
    df['timestamp'],
    df['timezone_offset'],
)

## Parse with naive + offset option

In [None]:
%%time
# Timing run: parse the localized strings with utc=True.
pd.to_datetime(df['local_datetime'], utc=True)

In [None]:
%%time
# Timing run: parse with utc=False, then pass that result through
# to_datetime a second time.
pd.to_datetime(
    pd.to_datetime(df['local_datetime'], utc=False)
)

In [None]:
%%time 
# Timing run: the project helper applied to the datetime strings and offsets.
_naive_to_localized_str(
    df['local_datetime'],
    df['timezone_offset'],
)

In [None]:
%%time
# NOTE(review): `localize_from_offset` is not imported anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel. Import it from
# its defining module (location not visible from here; confirm).
localize_from_offset(
    df['local_datetime'],
    df['timezone_offset'],
)

## Convert to Spark

Open questions: What happens with datetime format vs. string format? What happens to the schema in general? **TODO:** develop the Spark version of `from_object`.

In [None]:
# Read the same partition with Spark. Note that this rebinds `df`, which held
# the pandas frame in the cells above, to a Spark DataFrame.
#
# The session configures only the S3A connector (spark.hadoop.fs.s3a.* plus
# the hadoop-aws package), which serves `s3a://` URLs. Stock Hadoop registers
# no filesystem for the bare `s3://` scheme, so translate the scheme here and
# leave `dataset_path` itself untouched for the pandas reader.
df = spark.read.parquet(dataset_path.replace("s3://", "s3a://", 1))

In [None]:
# Pull the first row of the `timestamp` column to the driver.
first_timestamp = df.select('timestamp').take(1)
# Length of that value's string form: presumably a quick check of the
# timestamp's precision (e.g. seconds vs. milliseconds); confirm.
len(str(first_timestamp[0]['timestamp']))

In [None]:
# NOTE(review): `_is_traj_df_spark` is not imported anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel. Import it from
# its defining module (presumably nomad.daphmeIO; confirm).
_is_traj_df_spark(df)

In [None]:
# Load the sample CSV into pandas. The path is relative, so it resolves
# against the kernel's current working directory.
pd_df = pd.read_csv('./daphme/nomad/data/gc_sample.csv')

In [None]:
# Check whether the pandas frame is an instance of the Spark DataFrame class.
isinstance(pd_df, psk.sql.dataframe.DataFrame)

In [None]:
# Create a Spark DataFrame from the pandas frame and print its first rows
# (`.show()` prints to stdout; the converted frame is not kept in a variable).
spark.createDataFrame(pd_df).show()

In [None]:
# Shut down the Spark session; cells that use `spark` cannot be re-run after
# this without re-creating the session.
spark.stop()