In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [2]:
import sys
import pandas as pd

# Define a trace function that logs calls in any file path containing "pandas"
def trace_calls(frame, event, arg):
    """Trace hook for ``sys.settrace`` that logs calls made inside pandas.

    Only ``'call'`` events are handled. For those, the called function's
    name, file and first line number are printed when the file path
    contains ``'pandas'``, and the hook returns itself so it stays
    installed for the new frame. Any other event returns ``None``.

    Parameters
    ----------
    frame : frame
        The frame being entered; only ``frame.f_code`` is read.
    event : str
        Event name supplied by the interpreter.
    arg : object
        Unused; present because the trace protocol passes it.
    """
    if event == 'call':
        code_obj = frame.f_code
        callee, source_path = code_obj.co_name, code_obj.co_filename
        # Keep the log readable: ignore everything outside pandas' own files.
        is_pandas_file = 'pandas' in source_path
        if is_pandas_file:
            print(f'Call: {callee} in {source_path}:{code_obj.co_firstlineno}')
        return trace_calls
    return None

# Sample data: a mix of timezone-aware and naive datetime strings
data = [
    '2021-01-01 00:00:00+00:00',  # UTC
    '2021-01-01 01:00:00-05:00',  # Eastern Standard Time (UTC-5)
    '2021-01-02 00:00:00+02:00',  # UTC+2
    '2021-01-03 12:00:00'         # Naive
]

s = pd.Series(data)

# Install the trace hook only around the call under study.
# try/finally guarantees the hook is removed even if to_datetime raises;
# otherwise every later cell in this kernel would keep printing trace lines.
# The previously installed hook (e.g. a debugger's) is restored rather than
# unconditionally cleared.
previous_trace = sys.gettrace()
sys.settrace(trace_calls)
try:
    # Call to_datetime, which will trigger many internal pandas calls
    result = pd.to_datetime(s, format='mixed')
finally:
    sys.settrace(previous_trace)

print("\nResult:")
print(result)

Call: __init__ in C:\Users\pacob\Documents\nomad-pyspark\Lib\site-packages\pandas\core\series.py:389
Call: maybe_extract_name in C:\Users\pacob\Documents\nomad-pyspark\Lib\site-packages\pandas\core\indexes\base.py:7688
Call: _instancecheck in C:\Users\pacob\Documents\nomad-pyspark\Lib\site-packages\pandas\core\dtypes\generic.py:42
Call: _check in C:\Users\pacob\Documents\nomad-pyspark\Lib\site-packages\pandas\core\dtypes\generic.py:37
Call: is_hashable in C:\Users\pacob\Documents\nomad-pyspark\Lib\site-packages\pandas\core\dtypes\inference.py:334
Call: maybe_iterable_to_list in C:\Users\pacob\Documents\nomad-pyspark\Lib\site-packages\pandas\core\common.py:301
Call: default_index in C:\Users\pacob\Documents\nomad-pyspark\Lib\site-packages\pandas\core\indexes\api.py:386
Call: _simple_new in C:\Users\pacob\Documents\nomad-pyspark\Lib\site-packages\pandas\core\indexes\range.py:201
Call: _reset_identity in C:\Users\pacob\Documents\nomad-pyspark\Lib\site-packages\pandas\core\indexes\base.py:

  result = pd.to_datetime(s, format='mixed')


In [3]:
import pyspark as psk
from pyspark.sql import SparkSession
import pandas as pd

In [14]:
from nomad.daphmeIO import _naive_to_localized_str, _unix_to_localized_str

In [5]:
# Local Spark session on 4 cores with 3g for the driver and the executor.
# The hadoop-aws / AWS SDK packages and the credentials-provider setting are
# what the S3A connector needs in order to read from S3.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Tests")
    .config("spark.driver.memory", "3g")
    .config("spark.executor.memory", "3g")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    )
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262",
    )
    .getOrCreate()
)

In [6]:
# Column-name mapping handed to from_file below: key -> column name in the dataset.
traj_cols = dict(
    user_id='uid',
    latitude='latitude',
    longitude='longitude',
    timestamp='timestamp',
)

In [7]:
# One partition directory of the garden-city-45k device-level dataset on S3
# (the path segments are key=value partition folders).
dataset_path = (
    "s3://catalog-csslab/garden-city-45k/device-level/"
    "admin_1_id=GC/admin_2_id=GC.CD/"
    "event_start_date_utc=2024-01-13/"
)

# Read in pandas, create zoned datetime string

In [8]:
# NOTE(review): `from_file` is not imported in any visible cell — presumably it
# lives in nomad.daphmeIO next to the helpers imported earlier; confirm and add
# the import so the notebook survives Restart & Run All.
# Loads the parquet data at dataset_path, passing traj_cols as the column mapping.
df = from_file(dataset_path, format="parquet", traj_cols=traj_cols)

In [9]:
# Artificially override 'timezone_offset' so the column holds more than one
# offset: the first 50000 index labels get 0 and the last 50000 get 3600
# (presumably seconds, i.e. +01:00 — confirm the unit).
# NOTE: this mutates df in place; re-running the cell after later edits to df
# will overwrite the same rows again.
df.loc[df.index[:50000],'timezone_offset'] = 0
df.loc[df.index[-50000:], 'timezone_offset'] = 3600

In [16]:
%%time
# Adds a 'local_datetime' column computed from the timestamp and per-row offset columns.
# Judging by the parsing results further down, the values are strings that carry
# their own UTC offset — confirm against nomad.daphmeIO.
df['local_datetime'] = _unix_to_localized_str(df.timestamp, df.timezone_offset)

CPU times: total: 1.3 s
Wall time: 1.31 s


## Parse with naive + offset option

In [20]:
%%time
# utc=True converts every parsed value to UTC, so rows with different offsets
# end up together in a single datetime64[ns, UTC] column.
pd.to_datetime(df.local_datetime, utc=True)

CPU times: total: 4.38 s
Wall time: 4.39 s


0        2024-01-13 01:41:00+00:00
1        2024-01-13 01:43:00+00:00
2        2024-01-13 01:44:00+00:00
3        2024-01-13 13:24:00+00:00
4        2024-01-13 13:25:00+00:00
                    ...           
818396   2024-01-13 03:54:00+00:00
818397   2024-01-13 04:10:00+00:00
818398   2024-01-13 04:22:00+00:00
818399   2024-01-13 04:26:00+00:00
818400   2024-01-13 04:30:00+00:00
Name: local_datetime, Length: 818401, dtype: datetime64[ns, UTC]

In [22]:
%%time
# Demonstration: without utc=True the mixed-offset column cannot be turned into
# a single datetime64 column and pandas raises ValueError. The error is caught
# and printed so the notebook still runs top to bottom; the nested to_datetime
# call is kept exactly as in the original experiment.
try:
    reparsed = pd.to_datetime(pd.to_datetime(df.local_datetime, utc=False))
except ValueError as err:
    print(f"ValueError: {err}")
else:
    # Only reached if the installed pandas version accepts the input.
    display(reparsed)



ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 unless utc=True, at position 50000

In [None]:
%%time 
# Times _naive_to_localized_str on the same column and offsets.
# NOTE(review): df.local_datetime was produced by _unix_to_localized_str in an
# earlier cell, so its strings appear to carry an offset already — confirm that
# this helper is meant to receive such input rather than naive datetimes.
_naive_to_localized_str(df.local_datetime, df.timezone_offset)

In [None]:
%%time
# NOTE(review): `localize_from_offset` is not imported in any visible cell, so
# this raises NameError on a fresh kernel — confirm where it is defined and
# import it before use.
localize_from_offset(df.local_datetime, df.timezone_offset)

## Convert to Spark

Open questions: What happens with datetime format vs. string format? What happens to the schema in general? **TODO:** develop the Spark version of `from_object`.

In [None]:
# The Spark session is configured for Hadoop's S3A connector (fs.s3a.* settings,
# hadoop-aws package), which is registered under the "s3a" scheme. dataset_path
# uses the "s3://" scheme, so rewrite the scheme before handing the path to Spark.
# NOTE: this rebinds `df` from the frame loaded by from_file to a Spark DataFrame;
# the cells below rely on that.
df = spark.read.parquet(dataset_path.replace("s3://", "s3a://", 1))

In [None]:
# Fetch one row of the 'timestamp' column and measure how many characters its
# value has when rendered as text (presumably a quick check of the timestamp's
# unit or format — confirm intent).
first_timestamp = df.select('timestamp').take(1)
len(str(first_timestamp[0]['timestamp']))

In [None]:
# NOTE(review): `_is_traj_df_spark` is not imported in any visible cell —
# presumably a private helper from the nomad package; confirm and import it.
_is_traj_df_spark(df)

In [None]:
# Load the sample CSV into pandas. The path is relative, so it resolves against
# the kernel's current working directory.
pd_df = pd.read_csv('./daphme/nomad/data/gc_sample.csv')

In [None]:
# Is pd_df a Spark DataFrame? (psk.sql.DataFrame is the same class that
# psk.sql.dataframe.DataFrame names.)
isinstance(pd_df, psk.sql.DataFrame)

In [None]:
# Create a Spark DataFrame from the pandas frame and print its leading rows.
# The Spark DataFrame is not kept in a variable.
spark.createDataFrame(pd_df).show()

In [None]:
# Shut down the Spark session created earlier in this notebook.
spark.stop()