In [1]:
import pandas as pd

# Directory that holds the challenge CSV files, relative to this notebook.
DATA_DIR = './../Data Given for Challenge/data'

# Load the two submission templates.
try:
    df_24h = pd.read_csv(f'{DATA_DIR}/submission_template_24h.csv')
    df_48h = pd.read_csv(f'{DATA_DIR}/submission_template_48h.csv')
except FileNotFoundError as e:
    # Report where the files are expected, then re-raise so the cell stops
    # with a normal traceback. Calling exit() here would terminate the
    # notebook kernel session instead of just failing this cell.
    print(f"Error: {e}. Please make sure the CSV files are in '{DATA_DIR}'.")
    raise

# Stack both templates into one frame with a fresh 0..n-1 index.
combined_df = pd.concat([df_24h, df_48h], ignore_index=True)

# Parse the timestamp strings using the explicit '%m/%d/%y %H:%M' format.
combined_df['timestamp'] = pd.to_datetime(combined_df['timestamp'], format='%m/%d/%y %H:%M')

# Order rows by timestamp, then location. The result is reassigned rather
# than sorted in place; the pre-sort index labels are kept on purpose so each
# row can still be traced back to its position in the concatenated frame.
combined_df = combined_df.sort_values(by=['timestamp', 'location'])

# --- Analysis ---

# 1. Row count of the combined frame. The printed label calls this
#    "timestamps", but it counts rows, not distinct timestamp values.
total_timestamps = len(combined_df)

# 2. Earliest and latest timestamp present.
min_timestamp = combined_df['timestamp'].min()
max_timestamp = combined_df['timestamp'].max()

# 3. Rows whose (timestamp, location) pair occurs more than once.
#    keep=False flags every member of a duplicated group, so the count
#    includes all copies rather than only the "extra" ones.
# NOTE(review): the two templates appear to cover overlapping periods, so
# duplicates after concatenation are presumably expected - confirm.
duplicate_mask = combined_df.duplicated(subset=['timestamp', 'location'], keep=False)
duplicates = combined_df[duplicate_mask]
num_duplicates = len(duplicates)

# --- Output ---

print("--- Timestamp Analysis ---")
print(f"\nTotal number of timestamps: {total_timestamps}")
print("\nTimestamp range:")
print(f"  Start: {min_timestamp}")
print(f"  End:   {max_timestamp}")

print(f"\nNumber of duplicated entries (timestamp and location): {num_duplicates}")

if num_duplicates > 0:
    print("\nDuplicated rows:")
    # to_string() bypasses pandas' display truncation so every duplicate row is printed.
    print(duplicates.to_string())
else:
    print("\nNo duplicated rows found.")


print("\n--- Sorted Timestamps (first 20 rows) ---")
# to_string() renders the first 20 rows as plain, untruncated text.
print(combined_df.head(20).to_string())

--- Timestamp Analysis ---

Total number of timestamps: 5976

Timestamp range:
  Start: 2023-06-30 01:00:00
  End:   2023-07-02 00:00:00

Number of duplicated entries (timestamp and location): 3984

Duplicated rows:
               timestamp  location  pred
0    2023-06-30 01:00:00     26001   NaN
1992 2023-06-30 01:00:00     26001   NaN
24   2023-06-30 01:00:00     26003   NaN
2040 2023-06-30 01:00:00     26003   NaN
48   2023-06-30 01:00:00     26005   NaN
2088 2023-06-30 01:00:00     26005   NaN
72   2023-06-30 01:00:00     26007   NaN
2136 2023-06-30 01:00:00     26007   NaN
96   2023-06-30 01:00:00     26009   NaN
2184 2023-06-30 01:00:00     26009   NaN
120  2023-06-30 01:00:00     26011   NaN
2232 2023-06-30 01:00:00     26011   NaN
144  2023-06-30 01:00:00     26013   NaN
2280 2023-06-30 01:00:00     26013   NaN
168  2023-06-30 01:00:00     26015   NaN
2328 2023-06-30 01:00:00     26015   NaN
192  2023-06-30 01:00:00     26017   NaN
2376 2023-06-30 01:00:00     26017   NaN
216  

In [10]:
# --- Analysis for each file separately ---
# Relies on df_24h and df_48h having been loaded by an earlier cell.


def _summarize_timestamps(df, label, timestamp_col='timestamp', location_col='location',
                          leading_blank_line=True):
    """Print and return a timestamp summary for one dataset.

    Args:
        df: DataFrame whose `timestamp_col` has already been converted to datetime.
        label: Name shown in the section header (e.g. the source file name).
        timestamp_col: Name of the timestamp column in `df`.
        location_col: Name of the location column in `df`.
        leading_blank_line: Whether to print an empty line before the header.

    Returns:
        Tuple `(start, end, row_count, unique_pairs)`: the minimum and maximum
        timestamp, the number of rows, and the number of groups formed by
        grouping on (timestamp, location).
    """
    header = f"--- Analysis for {label} ---"
    print(f"\n{header}" if leading_blank_line else header)

    start = df[timestamp_col].min()
    end = df[timestamp_col].max()
    row_count = len(df)
    # ngroups counts distinct (timestamp, location) combinations; when it
    # equals row_count, no pair is repeated within the file.
    unique_pairs = df.groupby([timestamp_col, location_col]).ngroups

    print(f"Starting Timestamp: {start}")
    print(f"Ending Timestamp: {end}")
    print(f"Total Timestamps (rows): {row_count}")
    print(f"Unique Timestamp-Location Pairs: {unique_pairs}")
    print("-" * 50)
    return start, end, row_count, unique_pairs


# Timestamp layout used by both submission templates.
TEMPLATE_TIMESTAMP_FORMAT = '%m/%d/%y %H:%M'

# Convert timestamp columns to datetime objects first for individual analysis.
# This overwrites the column on the shared frames.
df_24h['timestamp'] = pd.to_datetime(df_24h['timestamp'], format=TEMPLATE_TIMESTAMP_FORMAT)
df_48h['timestamp'] = pd.to_datetime(df_48h['timestamp'], format=TEMPLATE_TIMESTAMP_FORMAT)

# --- Analysis for submission_template_24h.csv ---
start_24h, end_24h, count_24h, unique_pairs_24h = _summarize_timestamps(
    df_24h, 'submission_template_24h.csv', leading_blank_line=False
)

# --- Analysis for submission_template_48h.csv ---
start_48h, end_48h, count_48h, unique_pairs_48h = _summarize_timestamps(
    df_48h, 'submission_template_48h.csv'
)


# --- Analysis for train.csv ---
try:
    df_train = pd.read_csv('./../Data Given for Challenge/data/train.csv')

    # train.csv uses capitalised column names; no explicit format is given,
    # so pandas infers the timestamp layout.
    df_train['Timestamp'] = pd.to_datetime(df_train['Timestamp'])

    start_train, end_train, count_train, unique_pairs_train = _summarize_timestamps(
        df_train, 'train.csv', timestamp_col='Timestamp', location_col='Location'
    )

except FileNotFoundError as e:
    print(f"Error: {e}. Please make sure train.csv is in the correct directory.")
except Exception as e:
    # Best-effort: any other failure is reported as a one-line message
    # rather than a traceback.
    # NOTE(review): consider letting unexpected errors propagate so they are
    # easier to diagnose.
    print(f"An error occurred: {e}")

--- Analysis for submission_template_24h.csv ---
Starting Timestamp: 2023-06-30 01:00:00
Ending Timestamp: 2023-07-01 00:00:00
Total Timestamps (rows): 1992
Unique Timestamp-Location Pairs: 1992
--------------------------------------------------

--- Analysis for submission_template_48h.csv ---
Starting Timestamp: 2023-06-30 01:00:00
Ending Timestamp: 2023-07-02 00:00:00
Total Timestamps (rows): 3984
Unique Timestamp-Location Pairs: 3984
--------------------------------------------------

--- Analysis for train.csv ---
Starting Timestamp: 2023-04-01 00:00:00
Ending Timestamp: 2023-06-30 00:00:00
Total Timestamps (rows): 179363
Unique Timestamp-Location Pairs: 179363
--------------------------------------------------
