# Data cleaning and preparation for merging

### Helper functions

In [1]:
import pandas as pd

# Function to load a CSV file
def load_csv_file(file_path):
    """Read the CSV at ``file_path`` into a pandas DataFrame.

    Returns the DataFrame on success. If reading fails, a message is printed
    and ``None`` is returned instead of raising, so callers must be prepared
    for a ``None`` result.
    """
    try:
        return pd.read_csv(file_path)
    except FileNotFoundError:
        # A missing file gets its own, more actionable message.
        failure_message = f"Error: The file '{file_path}' was not found. Please check the path."
    except Exception as load_error:
        failure_message = f"An error occurred while loading the CSV: {load_error}"
    print(failure_message)
    return None

# Function to check null values in a DataFrame
def check_null_values(df, df_name):
    """Print a per-column count of null values in ``df``.

    A header containing ``df_name`` is printed first. Only columns that have
    at least one null are listed; if there are none, a single confirmation
    line is printed instead. Nothing is returned.
    """
    print(f"\n--- Null values in {df_name} ---")
    nulls_per_column = df.isna().sum()
    if not nulls_per_column.any():
        print(f"No null values found in {df_name}.")
        return
    print(nulls_per_column[nulls_per_column > 0])

### Load CSV-files

In [2]:
import pandas as pd
# Read the four input tables, in this order: locations, WLAN clients,
# manual counts, seat estimates.
# locations.csv already has the current opening hours added manually (from Jessy).
location_df, client_df, manual_df, estimate_df = [
    load_csv_file(csv_path)
    for csv_path in (
        'data/locations.csv',
        'exports/wlan_clients.csv',
        'exports/manual_counts.csv',
        'exports/seat_estimates.csv',
    )
]

In [3]:
# Expose the location key under the name 'location_id', which is the column
# the later merges join on.
location_df = location_df.rename(columns=dict(id="location_id"))

location_df.head(5)

Unnamed: 0,location_id,timestamp,name,longname,url,building,level,room,geocoordinates,availableseats,weeklyopeninghours,wlanaccesspoints,ethernetswitches,superlocation_id,Unnamed: 14,openingWo,closingWo,openingWe,closingWe
0,16,2016-01-01,00:00:00,UBWZB,"Alte Waschhalle, hinten",http://tuebingen.artec-berlin.de/?lang=de_DE&m...,,,,48.52423,9.06268,31.0,"C:23:""Leitsystem\\OpeningHours"":1753:{{""base_t...",/^br21-EG-Hintereingang-[0-9]+$/,,08:00:00,00:00:00,10:00:00,22:00:00
1,15,2016-01-01,00:00:00,UBWZA,"Alte Waschhalle, vorne",http://tuebingen.artec-berlin.de/?lang=de_DE&m...,,,,48.5244,9.06227,101.0,"C:23:""Leitsystem\\OpeningHours"":1753:{{""base_t...",/^br21-EG-Lesesaal3-[0-9]+|br21-OG-Lesesaal1-[...,,08:00:00,00:00:00,10:00:00,22:00:00
2,4,2016-01-01,00:00:00,UBA3A,"Ammerbau, Ebene 3, Links",http://tuebingen.artec-berlin.de/?lang=de_DE&m...,,3.0,,48.52492,9.0631,29.0,"C:23:""Leitsystem\\OpeningHours"":1753:{{""base_t...",/^br25-3E-links-[0-9]+$/,,08:00:00,00:00:00,10:00:00,22:00:00
3,5,2016-01-01,00:00:00,UBA3C,"Ammerbau, Ebene 3, Rechts",http://tuebingen.artec-berlin.de/?lang=de_DE&m...,,3.0,,48.52455,9.06278,158.0,"C:23:""Leitsystem\\OpeningHours"":1753:{{""base_t...",/^br25-3E-rechts-[0-9]+|br25-3E-hintenrechts-[...,,08:00:00,00:00:00,10:00:00,22:00:00
4,6,2016-01-01,00:00:00,UBA4A,"Ammerbau, Ebene 4, Links",http://tuebingen.artec-berlin.de/?lang=de_DE&m...,,4.0,,48.52492,9.0631,42.0,"C:23:""Leitsystem\\OpeningHours"":1753:{{""base_t...",/^br25-4E-links-[0-9]+|br25-4E-hintenlinks-[0-...,,08:00:00,00:00:00,10:00:00,22:00:00


## Unified Opening and Closing Time

The weekday and weekend opening and closing times are stored in `locations.csv`; for each record, the pair that matches its day of the week is selected.


In [4]:
import numpy as np

# Parse the timestamps so that the .dt accessor can be used on them.
client_df['timestamp'] = pd.to_datetime(client_df['timestamp'])

# Day of the week as an integer: Monday=0 ... Sunday=6.
client_df['weekday'] = client_df['timestamp'].dt.dayofweek

# Only the key and the four opening-hour columns are needed from location_df.
location_hours_df = location_df.loc[
    :, ['location_id', 'openingWo', 'closingWo', 'openingWe', 'closingWe']
]

# Left join keeps every client record; then add empty placeholder columns
# that are filled per record in a later cell.
client_df = (
    client_df
    .merge(location_hours_df, how='left', on='location_id')
    .assign(Opening=pd.NaT, Closing=pd.NaT)
)

client_df.columns


Index(['id', 'location_id', 'numberofclients', 'timestamp', 'weekday',
       'openingWo', 'closingWo', 'openingWe', 'closingWe', 'Opening',
       'Closing'],
      dtype='object')

In [5]:
# Preview the first rows after the merge with the opening hours.
client_df.head(n=5)

Unnamed: 0,id,location_id,numberofclients,timestamp,weekday,openingWo,closingWo,openingWe,closingWe,Opening,Closing
0,5,18,32,2016-04-12 13:01:09,1,08:00:00,19:45:00,00:00:00,00:00:00,NaT,NaT
1,6,17,37,2016-04-12 13:01:09,1,08:00:00,19:45:00,00:00:00,00:00:00,NaT,NaT
2,7,19,14,2016-04-12 13:01:09,1,08:00:00,19:45:00,00:00:00,00:00:00,NaT,NaT
3,8,20,42,2016-04-12 13:01:09,1,00:00:00,00:00:00,00:00:00,00:00:00,NaT,NaT
4,9,15,15,2016-04-12 13:01:09,1,08:00:00,00:00:00,10:00:00,22:00:00,NaT,NaT


In [6]:
import datetime as dt

# Pick the opening/closing strings that apply to each record's day of the week:
# Monday-Friday (dayofweek 0-4) use openingWo/closingWo, Saturday and Sunday
# (dayofweek 5-6) use openingWe/closingWe.
# The strings are parsed with an explicit '%H:%M:%S' format so pandas does not
# have to infer a format for time-only values, and the result is reduced to
# datetime.time objects.
client_df['Opening'] = pd.to_datetime(
    client_df['openingWo'].where(client_df['weekday'] < 5, client_df['openingWe']),
    format='%H:%M:%S',
).dt.time
client_df['Closing'] = pd.to_datetime(
    client_df['closingWo'].where(client_df['weekday'] < 5, client_df['closingWe']),
    format='%H:%M:%S',
).dt.time

# A closing time of 00:00 means "open until midnight". As a time of day it would
# compare as the start of the day, so it is replaced by the last representable
# moment of the day.
# NOTE(review): a record whose Opening and Closing are both 00:00 therefore gets
# the window 00:00-23:59:59.999999, i.e. it is treated as open all day - confirm
# that this is intended rather than "closed".
midnight = dt.time(0, 0)
end_of_day = dt.time(23, 59, 59, 999999)
client_df['Closing_adj'] = client_df['Closing'].where(client_df['Closing'] != midnight, end_of_day)

# Drop the temporary 'weekday' column and the merged hour columns if not needed for future analysis
client_df = client_df.drop(columns=['weekday', 'openingWo', 'closingWo', 'openingWe', 'closingWe'])

print("New 'Opening' and 'Closing' columns added to client_df based on day of the week.")
print(client_df[['timestamp', 'Opening', 'Closing', 'Closing_adj', 'location_id']].head())

New 'Opening' and 'Closing' columns added to client_df based on day of the week.
            timestamp   Opening   Closing      Closing_adj  location_id
0 2016-04-12 13:01:09  08:00:00  19:45:00         19:45:00           18
1 2016-04-12 13:01:09  08:00:00  19:45:00         19:45:00           17
2 2016-04-12 13:01:09  08:00:00  19:45:00         19:45:00           19
3 2016-04-12 13:01:09  00:00:00  00:00:00  23:59:59.999999           20
4 2016-04-12 13:01:09  08:00:00  00:00:00  23:59:59.999999           15


In [7]:
# Save client_df, now carrying the per-record opening and closing times, as CSV.

try:
    client_df.to_csv("data/wlan_clients_with_opening_hours.csv", index=False)
    # Name the file that was actually written.
    print("Saved data/wlan_clients_with_opening_hours.csv")
except Exception as e:
    # Report the cause instead of discarding it, as load_csv_file does.
    print(f"Save failed for data/wlan_clients_with_opening_hours.csv: {e}")

Saved data.csv


# Task
Remove rows from `client_df` where the time of day of `timestamp` falls outside the `Opening` and `Closing` hours, and save the removed rows to a new CSV file named `data/wlan_clients_deleted_data.csv`.

## Prepare client_df for time comparison

### Subtask:
Convert 'Opening' and 'Closing' columns to datetime.time objects and extract the time component from the 'timestamp' column.


**Reasoning**:
To prepare the `client_df` for time comparison, I will convert the 'Opening' and 'Closing' columns to `datetime.time` objects and extract the time component from the 'timestamp' column into a new 'event_time' column.



**Reasoning**:
The previous code produced a UserWarning about `pd.to_datetime` not being able to infer the format for the 'Closing' column. To prevent this warning and ensure consistent parsing, I will explicitly provide the format string `%H:%M:%S` to `pd.to_datetime` when converting 'Opening' and 'Closing' columns to time objects.



In [8]:
# Keep only the time-of-day part of each timestamp; this is the value that is
# compared against the opening and closing times.
client_df['event_time'] = client_df['timestamp'].dt.time

print("Converted 'Opening' and 'Closing' to time objects and extracted 'event_time' from 'timestamp'.")
print(client_df.loc[:, ['Opening', 'Closing_adj', 'event_time']].head())

Converted 'Opening' and 'Closing' to time objects and extracted 'event_time' from 'timestamp'.
    Opening      Closing_adj event_time
0  08:00:00         19:45:00   13:01:09
1  08:00:00         19:45:00   13:01:09
2  08:00:00         19:45:00   13:01:09
3  00:00:00  23:59:59.999999   13:01:09
4  08:00:00  23:59:59.999999   13:01:09


**Reasoning**:
Now that the 'Opening', 'Closing', and 'event_time' columns are correctly formatted as time objects, I will identify the rows where 'event_time' falls outside the 'Opening' and 'Closing' hours. These identified rows will be stored in a new DataFrame called `deleted_client_df`.



In [9]:
# Records taken before opening or after the adjusted closing time.
deleted_client_df = client_df.loc[
    client_df['event_time'].lt(client_df['Opening'])
    | client_df['event_time'].gt(client_df['Closing_adj'])
]

print(f"Number of rows to be deleted: {len(deleted_client_df)}")
print("Head of deleted_client_df:")
print(deleted_client_df.head())

Number of rows to be deleted: 6249534
Head of deleted_client_df:
        id  location_id  numberofclients           timestamp   Opening  \
1615  1620           18                0 2016-04-12 19:47:18  08:00:00   
1616  1621           17                1 2016-04-12 19:47:18  08:00:00   
1617  1622           19                0 2016-04-12 19:47:18  08:00:00   
1634  1639           18                0 2016-04-12 19:53:16  08:00:00   
1635  1640           17                1 2016-04-12 19:53:16  08:00:00   

       Closing Closing_adj event_time  
1615  19:45:00    19:45:00   19:47:18  
1616  19:45:00    19:45:00   19:47:18  
1617  19:45:00    19:45:00   19:47:18  
1634  19:45:00    19:45:00   19:53:16  
1635  19:45:00    19:45:00   19:53:16  


**Reasoning**:
Now that the rows to be deleted are identified and stored in `deleted_client_df`, I will remove these rows from the original `client_df` to complete the first part of the task. This will create a `client_df` that only contains entries within the operating hours.



In [10]:
# Keep the records whose time of day lies within [Opening, Closing_adj],
# both ends included.
client_df = client_df.loc[
    client_df['event_time'].between(client_df['Opening'], client_df['Closing_adj'])
]

print(f"Number of rows remaining in client_df: {len(client_df)}")
print("Head of updated client_df:")
print(client_df.head())

Number of rows remaining in client_df: 11239561
Head of updated client_df:
   id  location_id  numberofclients           timestamp   Opening   Closing  \
0   5           18               32 2016-04-12 13:01:09  08:00:00  19:45:00   
1   6           17               37 2016-04-12 13:01:09  08:00:00  19:45:00   
2   7           19               14 2016-04-12 13:01:09  08:00:00  19:45:00   
3   8           20               42 2016-04-12 13:01:09  00:00:00  00:00:00   
4   9           15               15 2016-04-12 13:01:09  08:00:00  00:00:00   

       Closing_adj event_time  
0         19:45:00   13:01:09  
1         19:45:00   13:01:09  
2         19:45:00   13:01:09  
3  23:59:59.999999   13:01:09  
4  23:59:59.999999   13:01:09  


**Reasoning**:
The subtask requires saving the identified `deleted_client_df` to a new CSV file named `wlan_clients_deleted_data.csv`. This action will complete the task of removing rows and saving them separately.



In [11]:
# Save client_df, which now holds only the records taken during opening hours.
try:
    client_df.to_csv("data/wlan_clients_during_opening_hours.csv", index=False)
    print("client data saved ")
except Exception as e:
    # Say which file failed and why; the two saves are otherwise indistinguishable.
    print(f"Save failed for data/wlan_clients_during_opening_hours.csv: {e}")

# Save deleted_client_df, the records taken outside of opening hours.
try:
    deleted_client_df.to_csv("data/wlan_clients_deleted_data.csv", index=False)
    print("deleted data saved ")
except Exception as e:
    print(f"Save failed for data/wlan_clients_deleted_data.csv: {e}")

client data saved 
deleted data saved 


### Note: general opening and closing times are used for now; please update the CSV file with the actual opening and closing times.

## Summary:

### Data Analysis Key Findings
*   A total of 6249534 rows were identified where the `timestamp` fell outside the 'Opening' and 'Closing' hours.
*   After removing these out-of-hours entries, the `client_df` was updated to contain 11239561 rows.
*   Converting 'Opening' and 'Closing' time strings to `datetime.time` objects required explicitly specifying the format as `'%H:%M:%S'` to avoid warnings and ensure correct parsing.




In [12]:
# Report the columns that contain nulls, one table at a time.

check_null_values(df=location_df, df_name='location_df')
check_null_values(df=client_df, df_name='client_df')
check_null_values(df=manual_df, df_name='manual_df')
check_null_values(df=estimate_df, df_name='estimate_df')


--- Null values in location_df ---
level             16
room               2
geocoordinates    19
availableseats     1
Unnamed: 14       19
dtype: int64

--- Null values in client_df ---
No null values found in client_df.

--- Null values in manual_df ---
No null values found in manual_df.

--- Null values in estimate_df ---
No null values found in estimate_df.


In [13]:
def _report_duplicates(duplicate_rows, frame_name):
    """Print a sample and the count of the duplicated rows, or a notice if there are none."""
    if duplicate_rows.empty:
        print(f"No duplicate entries found in {frame_name} based on 'location_id' and 'timestamp'.")
        return
    print(f"Duplicate entries found in {frame_name}:")
    print(duplicate_rows.sort_values(by=['location_id', 'timestamp']).head())
    print(f"Total duplicate rows in {frame_name}: {len(duplicate_rows)}")


print("Checking for duplicate entries based on 'location_id' and 'timestamp':\n")

# keep=False marks every member of a duplicated (location_id, timestamp) group,
# not just the repeats.

# WLAN client records
duplicate_clients = client_df[client_df.duplicated(subset=['location_id', 'timestamp'], keep=False)]
_report_duplicates(duplicate_clients, 'client_df')
print("\n" + "-" * 50 + "\n")

# Manual counts
duplicate_manuals = manual_df[manual_df.duplicated(subset=['location_id', 'timestamp'], keep=False)]
_report_duplicates(duplicate_manuals, 'manual_df')
print("\n" + "-" * 50 + "\n")

# Seat estimates
duplicate_estimates = estimate_df[estimate_df.duplicated(subset=['location_id', 'timestamp'], keep=False)]
_report_duplicates(duplicate_estimates, 'estimate_df')

Checking for duplicate entries based on 'location_id' and 'timestamp':

No duplicate entries found in client_df based on 'location_id' and 'timestamp'.

--------------------------------------------------

No duplicate entries found in manual_df based on 'location_id' and 'timestamp'.

--------------------------------------------------

No duplicate entries found in estimate_df based on 'location_id' and 'timestamp'.


----

# Cleaning Seat Estimates Data Frame 
- Remove all data outside of opening hours and save it to a separate CSV file, if there is any


In [14]:
# Parse the estimate_df timestamps so that the .dt accessor can be used on them.
estimate_df['timestamp'] = pd.to_datetime(estimate_df['timestamp'])

# Day of the week as an integer: Monday=0 ... Sunday=6.
estimate_df['weekday'] = estimate_df['timestamp'].dt.dayofweek

# Only the key and the four opening-hour columns are needed from location_df.
location_hours_df = location_df.loc[
    :, ['location_id', 'openingWo', 'closingWo', 'openingWe', 'closingWe']
]

# Left join keeps every estimate record; then add empty placeholder columns
# that are filled per record in a later cell.
estimate_df = (
    estimate_df
    .merge(location_hours_df, how='left', on='location_id')
    .assign(Opening=pd.NaT, Closing=pd.NaT)
)

estimate_df.columns


Index(['id', 'location_id', 'occupiedseats', 'freeseats', 'timestamp',
       'weekday', 'openingWo', 'closingWo', 'openingWe', 'closingWe',
       'Opening', 'Closing'],
      dtype='object')

In [15]:
# Pick the opening/closing strings that apply to each record's day of the week:
# Monday-Friday (dayofweek 0-4) use openingWo/closingWo, Saturday and Sunday
# (dayofweek 5-6) use openingWe/closingWe.
# The strings are parsed with an explicit '%H:%M:%S' format so pandas does not
# have to infer a format for time-only values, and the result is reduced to
# datetime.time objects.
estimate_df['Opening'] = pd.to_datetime(
    estimate_df['openingWo'].where(estimate_df['weekday'] < 5, estimate_df['openingWe']),
    format='%H:%M:%S',
).dt.time
estimate_df['Closing'] = pd.to_datetime(
    estimate_df['closingWo'].where(estimate_df['weekday'] < 5, estimate_df['closingWe']),
    format='%H:%M:%S',
).dt.time


# A closing time of 00:00 means "open until midnight"; replace it by the last
# representable moment of the day so that it compares as the end of the day.
midnight = dt.time(0, 0)
end_of_day = dt.time(23, 59, 59, 999999)
estimate_df['Closing_adj'] = estimate_df['Closing'].where(estimate_df['Closing'] != midnight, end_of_day)


# Drop the temporary 'weekday' column and the merged hour columns if not needed for future analysis
estimate_df = estimate_df.drop(columns=['weekday', 'openingWo', 'closingWo', 'openingWe', 'closingWe'])

# The message names the frame that was actually modified.
print("New 'Opening' and 'Closing' columns added to estimate_df based on day of the week.")
print(estimate_df[['timestamp', 'Opening', 'Closing_adj']].head())

New 'Opening' and 'Closing' columns added to client_df based on day of the week.
            timestamp   Opening      Closing_adj
0 2016-04-25 10:07:11  08:00:00  23:59:59.999999
1 2016-04-25 10:07:11  08:00:00  23:59:59.999999
2 2016-04-25 10:07:11  08:00:00  23:59:59.999999
3 2016-04-25 10:07:11  08:00:00  23:59:59.999999
4 2016-04-25 10:07:11  08:00:00  23:59:59.999999


In [16]:
# Save estimate_df, now carrying the per-record opening and closing times, as CSV.

try:
    estimate_df.to_csv("data/estimates_with_opening_hours.csv", index=False)
    # Name the file that was actually written.
    print("Saved data/estimates_with_opening_hours.csv")
except Exception as e:
    # Report the cause instead of discarding it, as load_csv_file does.
    print(f"Save failed for data/estimates_with_opening_hours.csv: {e}")

Saved data.csv


In [17]:
# Preparation for removing the data points outside of opening hours:
# keep only the time-of-day part of each timestamp, which is the value that is
# compared against the opening and closing times.
estimate_df['event_time'] = estimate_df['timestamp'].dt.time

print("Converted 'Opening' and 'Closing' to time objects and extracted 'event_time' from 'timestamp'.")
print(estimate_df.loc[:, ['Opening', 'Closing_adj', 'event_time']].head())

Converted 'Opening' and 'Closing' to time objects and extracted 'event_time' from 'timestamp'.
    Opening      Closing_adj event_time
0  08:00:00  23:59:59.999999   10:07:11
1  08:00:00  23:59:59.999999   10:07:11
2  08:00:00  23:59:59.999999   10:07:11
3  08:00:00  23:59:59.999999   10:07:11
4  08:00:00  23:59:59.999999   10:07:11


In [18]:
# Data points outside of opening hours: taken before opening or after the
# adjusted closing time.
deleted_estimate_df = estimate_df[
    (estimate_df['event_time'] < estimate_df['Opening']) |
    (estimate_df['event_time'] > estimate_df['Closing_adj'])
]

print(f"Number of rows to be deleted: {len(deleted_estimate_df)}")
# The label names the frame that is actually printed.
print("Head of deleted_estimate_df:")
print(deleted_estimate_df.head())

Number of rows to be deleted: 110279
Head of deleted_client_df:
        id  location_id  occupiedseats  freeseats           timestamp  \
1966  1967           18              1         57 2016-04-25 19:47:24   
1967  1968           17              1         51 2016-04-25 19:47:24   
1968  1969           19              0         73 2016-04-25 19:47:24   
1985  1986           18              0         58 2016-04-25 19:53:22   
1986  1987           17              0         52 2016-04-25 19:53:22   

       Opening   Closing Closing_adj event_time  
1966  08:00:00  19:45:00    19:45:00   19:47:24  
1967  08:00:00  19:45:00    19:45:00   19:47:24  
1968  08:00:00  19:45:00    19:45:00   19:47:24  
1985  08:00:00  19:45:00    19:45:00   19:53:22  
1986  08:00:00  19:45:00    19:45:00   19:53:22  


In [19]:
# remaining data points during opening hours 
# Keep the records whose time of day lies within [Opening, Closing_adj],
# both ends included.
estimate_df = estimate_df.loc[
    estimate_df['event_time'].between(estimate_df['Opening'], estimate_df['Closing_adj'])
]

print(f"Number of rows remaining in estimate_df: {len(estimate_df)}")
print("Head of updated estimate_df:")
print(estimate_df.head())

Number of rows remaining in estimate_df: 8213343
Head of updated estimate_df:
   id  location_id  occupiedseats  freeseats           timestamp   Opening  \
0   1           16             16         15 2016-04-25 10:07:11  08:00:00   
1   2           15              5         96 2016-04-25 10:07:11  08:00:00   
2   3            6             12         30 2016-04-25 10:07:11  08:00:00   
3   4            7             13         14 2016-04-25 10:07:11  08:00:00   
4   5            8             12         74 2016-04-25 10:07:11  08:00:00   

    Closing      Closing_adj event_time  
0  00:00:00  23:59:59.999999   10:07:11  
1  00:00:00  23:59:59.999999   10:07:11  
2  00:00:00  23:59:59.999999   10:07:11  
3  00:00:00  23:59:59.999999   10:07:11  
4  00:00:00  23:59:59.999999   10:07:11  


In [20]:
# Save estimate_df, which now holds only the records taken during opening hours.
try:
    estimate_df.to_csv("data/estimate_during_opening_hours.csv", index=False)
    print("estimate data saved ")
except Exception as e:
    # Say which file failed and why; the two saves are otherwise indistinguishable.
    print(f"Save failed for data/estimate_during_opening_hours.csv: {e}")

# Save deleted_estimate_df, the records taken outside of opening hours.
try:
    deleted_estimate_df.to_csv("data/estimate_deleted_data.csv", index=False)
    print("deleted data saved ")
except Exception as e:
    print(f"Save failed for data/estimate_deleted_data.csv: {e}")

estimate data saved 
deleted data saved 


----

# Cleaning Manual Count Data Set 
- Save all data points outside of the opening hours to `data/manual_counts_deleted_data.csv`


In [21]:
# Parse the manual_df timestamps so that the .dt accessor can be used on them.
manual_df['timestamp'] = pd.to_datetime(manual_df['timestamp'])

# Day of the week as an integer: Monday=0 ... Sunday=6.
manual_df['weekday'] = manual_df['timestamp'].dt.dayofweek

# Only the key and the four opening-hour columns are needed from location_df.
location_hours_df = location_df.loc[
    :, ['location_id', 'openingWo', 'closingWo', 'openingWe', 'closingWe']
]

# Left join keeps every manual count; then add empty placeholder columns
# that are filled per record in a later cell.
manual_df = (
    manual_df
    .merge(location_hours_df, how='left', on='location_id')
    .assign(Opening=pd.NaT, Closing=pd.NaT)
)

manual_df.columns


Index(['id', 'location_id', 'occupiedseats', 'freeseats', 'timestamp',
       'weekday', 'openingWo', 'closingWo', 'openingWe', 'closingWe',
       'Opening', 'Closing'],
      dtype='object')

In [22]:
# Pick the opening/closing strings that apply to each record's day of the week:
# Monday-Friday (dayofweek 0-4) use openingWo/closingWo, Saturday and Sunday
# (dayofweek 5-6) use openingWe/closingWe.
# The strings are parsed with an explicit '%H:%M:%S' format so pandas does not
# have to infer a format for time-only values, and the result is reduced to
# datetime.time objects.
manual_df['Opening'] = pd.to_datetime(
    manual_df['openingWo'].where(manual_df['weekday'] < 5, manual_df['openingWe']),
    format='%H:%M:%S',
).dt.time
manual_df['Closing'] = pd.to_datetime(
    manual_df['closingWo'].where(manual_df['weekday'] < 5, manual_df['closingWe']),
    format='%H:%M:%S',
).dt.time

# A closing time of 00:00 means "open until midnight"; replace it by the last
# representable moment of the day so that it compares as the end of the day.
# Both constants are set here so this cell does not rely on an earlier cell
# having defined them.
midnight = dt.time(0, 0)
end_of_day = dt.time(23, 59, 59, 999999)
manual_df['Closing_adj'] = manual_df['Closing'].where(manual_df['Closing'] != midnight, end_of_day)



# Drop the temporary 'weekday' column and the merged hour columns if not needed for future analysis
manual_df = manual_df.drop(columns=['weekday', 'openingWo', 'closingWo', 'openingWe', 'closingWe'])

print("New 'Opening' and 'Closing' columns added to manual_df based on day of the week.")
print(manual_df[['timestamp', 'Opening', 'Closing_adj']].head())

New 'Opening' and 'Closing' columns added to manual_df based on day of the week.
            timestamp   Opening      Closing_adj
0 2016-04-22 13:34:23  08:00:00         20:00:00
1 2016-04-25 10:03:55  08:00:00  23:59:59.999999
2 2016-04-25 10:04:11  08:00:00  23:59:59.999999
3 2016-04-25 10:04:28  08:00:00  23:59:59.999999
4 2016-04-25 10:04:37  08:00:00  23:59:59.999999


In [23]:
# Save manual_df, now carrying the per-record opening and closing times, as CSV.

try:
    manual_df.to_csv("data/manual_counts_with_opening_hours.csv", index=False)
    # Name the file that was actually written.
    print("Saved data/manual_counts_with_opening_hours.csv")
except Exception as e:
    # Report the cause instead of discarding it, as load_csv_file does.
    print(f"Save failed for data/manual_counts_with_opening_hours.csv: {e}")

Saved data.csv


In [24]:
# Preparation for removing the data points outside of opening hours:
# keep only the time-of-day part of each timestamp, which is the value that is
# compared against the opening and closing times.
manual_df['event_time'] = manual_df['timestamp'].dt.time

print("Converted 'Opening' and 'Closing' to time objects and extracted 'event_time' from 'timestamp'.")
print(manual_df.loc[:, ['Opening', 'Closing_adj', 'event_time']].head())

Converted 'Opening' and 'Closing' to time objects and extracted 'event_time' from 'timestamp'.
    Opening      Closing_adj event_time
0  08:00:00         20:00:00   13:34:23
1  08:00:00  23:59:59.999999   10:03:55
2  08:00:00  23:59:59.999999   10:04:11
3  08:00:00  23:59:59.999999   10:04:28
4  08:00:00  23:59:59.999999   10:04:37


In [25]:
# Data points outside of opening hours: taken before opening or after the
# adjusted closing time.
deleted_manual_df = manual_df[
    (manual_df['event_time'] < manual_df['Opening']) |
    (manual_df['event_time'] > manual_df['Closing_adj'])
]

print(f"Number of rows to be deleted: {len(deleted_manual_df)}")
# The label names the frame that is actually printed.
print("Head of deleted_manual_df:")
print(deleted_manual_df.head())

Number of rows to be deleted: 5
Head of deleted_client_df:
      id  location_id  occupiedseats  freeseats           timestamp   Opening  \
66    67            2             22         72 2016-04-25 20:00:15  08:00:00   
93    94            2              7         87 2016-04-25 23:12:54  08:00:00   
177  178            2             20         74 2016-04-26 22:42:38  08:00:00   
253  254            2              7         87 2016-04-27 20:06:28  08:00:00   
397  398            2             11         83 2016-04-29 20:21:30  08:00:00   

      Closing Closing_adj event_time  
66   20:00:00    20:00:00   20:00:15  
93   20:00:00    20:00:00   23:12:54  
177  20:00:00    20:00:00   22:42:38  
253  20:00:00    20:00:00   20:06:28  
397  20:00:00    20:00:00   20:21:30  


In [26]:
# remaining data points during opening hours 
# Keep the records whose time of day lies within [Opening, Closing_adj],
# both ends included.
manual_df = manual_df.loc[
    manual_df['event_time'].between(manual_df['Opening'], manual_df['Closing_adj'])
]

print(f"Number of rows remaining in manual_df: {len(manual_df)}")
print("Head of updated manual_df:")
print(manual_df.head())

Number of rows remaining in manual_df: 1172
Head of updated manual_df:
   id  location_id  occupiedseats  freeseats           timestamp   Opening  \
0   1            3             20         48 2016-04-22 13:34:23  08:00:00   
1   2            4              6         23 2016-04-25 10:03:55  08:00:00   
2   3            5             26        132 2016-04-25 10:04:11  08:00:00   
3   4           12              2         31 2016-04-25 10:04:28  08:00:00   
4   5           13             12         23 2016-04-25 10:04:37  08:00:00   

    Closing      Closing_adj event_time  
0  20:00:00         20:00:00   13:34:23  
1  00:00:00  23:59:59.999999   10:03:55  
2  00:00:00  23:59:59.999999   10:04:11  
3  00:00:00  23:59:59.999999   10:04:28  
4  00:00:00  23:59:59.999999   10:04:37  


In [27]:
# Save manual_df, which now holds only the records taken during opening hours.
try:
    manual_df.to_csv("data/manual_counts_during_opening_hours.csv", index=False)
    print("manual data saved ")
except Exception as e:
    # Say which file failed and why; the two saves are otherwise indistinguishable.
    print(f"Save failed for data/manual_counts_during_opening_hours.csv: {e}")

# Save deleted_manual_df, the records taken outside of opening hours.
try:
    deleted_manual_df.to_csv("data/manual_counts_deleted_data.csv", index=False)
    print("deleted data saved ")
except Exception as e:
    print(f"Save failed for data/manual_counts_deleted_data.csv: {e}")

manual data saved 
deleted data saved 
