In [1]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from sklearn.cluster import KMeans

### Loading and Cleaning GPS Data Set 3

In [2]:
import pandas as pd

# Load the GPS data from the .log file, excluding the line (containing '}')
data_gps3 = pd.read_csv('./Data/3/GPS/gps_2023-05-04.log', sep=';', names=['Date', 'Time', 'Latitude', 'Longitude', 'Speed', 'Track Angle', 'Magnetic Variation'], skipfooter=1, engine='python')

# Data cleaning and formatting
data_gps3['Latitude'] = data_gps3['Latitude'].str.rstrip('N').astype(float)
data_gps3['Longitude'] = data_gps3['Longitude'].str.rstrip('E').astype(float)

# Use try-except block to handle errors while parsing date and time
try:
    data_gps3['Date'] = pd.to_datetime(data_gps3['Date'], format='%d/%m/%Y')
    data_gps3['Time'] = pd.to_datetime(data_gps3['Time'], format='%H:%M:%S').dt.time
except ValueError:
    # If an error occurs while parsing date and time, drop the row
    data_gps3.dropna(inplace=True)

# Drop Magnetic Variation column as it is not relevant to the problem
data_gps3.drop(columns=['Magnetic Variation'], inplace=True)

print(data_gps3)


             Date      Time  Latitude  Longitude  Speed  Track Angle
1      03/05/2023  21:59:59   41.3647    2.17157   1.13       171.97
2      03/05/2023  22:01:01   41.3649    2.17184   0.05       100.92
3      03/05/2023  22:01:02   41.3649    2.17184   0.05       100.92
4      03/05/2023  22:01:03   41.3649    2.17184   1.03       179.14
5      03/05/2023  22:01:04   41.3649    2.17184   0.49       179.14
...           ...       ...       ...        ...    ...          ...
72403  04/05/2023  21:59:42   48.7848    6.13068  27.89       320.88
72404  04/05/2023  21:59:43   48.7849    6.13056  26.61       319.06
72405  04/05/2023  21:59:44   48.7850    6.13042  26.65       316.63
72406  04/05/2023  21:59:45   48.7851    6.13030  26.26       316.96
72407  04/05/2023  21:59:46   48.7852    6.13016  27.73       316.21

[72407 rows x 6 columns]


In [3]:
# Merge 'Date' and 'Time' columns into a single datetime column
data_gps3['Datetime'] = pd.to_datetime(data_gps3['Date'] + ' ' + data_gps3['Time'], format='%d/%m/%Y %H:%M:%S')

# Convert the datetime column to Unix format in milliseconds
data_gps3['UnixTimeStampInMsec'] = (data_gps3['Datetime'] - pd.Timestamp("1970-01-01")) // pd.Timedelta(milliseconds=1)

# Reorder the columns, moving 'UnixTimeInMSec' to the first position
data_gps3 = data_gps3[['UnixTimeStampInMsec', 'Datetime', 'Latitude', 'Longitude', 'Speed', 'Track Angle']]

# Rename column so that column names are in sync in all the datasets
data_gps3 = data_gps3.rename(columns={'Track Angle': 'TrackAngle'})

# Print the DataFrame with Unix timestamps in the first column
print(data_gps3)


       UnixTimeStampInMsec            Datetime  Latitude  Longitude  Speed  \
1            1683151199000 2023-05-03 21:59:59   41.3647    2.17157   1.13   
2            1683151261000 2023-05-03 22:01:01   41.3649    2.17184   0.05   
3            1683151262000 2023-05-03 22:01:02   41.3649    2.17184   0.05   
4            1683151263000 2023-05-03 22:01:03   41.3649    2.17184   1.03   
5            1683151264000 2023-05-03 22:01:04   41.3649    2.17184   0.49   
...                    ...                 ...       ...        ...    ...   
72403        1683237582000 2023-05-04 21:59:42   48.7848    6.13068  27.89   
72404        1683237583000 2023-05-04 21:59:43   48.7849    6.13056  26.61   
72405        1683237584000 2023-05-04 21:59:44   48.7850    6.13042  26.65   
72406        1683237585000 2023-05-04 21:59:45   48.7851    6.13030  26.26   
72407        1683237586000 2023-05-04 21:59:46   48.7852    6.13016  27.73   

       TrackAngle  
1          171.97  
2          100.92  
3  

In [4]:
# Save GPS Data Set 3
# Save the DataFrame to a CSV file
data_gps3.to_csv('./CleanData/Set3/GPSSet3Data.csv', index=False)

print("DataFrame saved as GPSSet3Data.csv in the CleanData/Set3/ Folder.")

DataFrame saved as GPSSet3Data.csv in the CleanData/Set3/ Folder.


### Loading and Cleaning Shock Data Set 3

In [5]:
# Load the Shock data
data_shock3 = pd.read_csv('./Data/3/shock/shock_2023-05-04.log', sep=';', header=None)

# Add column headers
data_shock3.columns = ['UnixTimeStampInMsec', 'AccelX', 'AccelY', 'AccelZ', 'GyroX', 'GyroY', 'GyroZ', 'Temperature']

# Display the cleaned data
print(data_shock3)

        UnixTimeStampInMsec    AccelX    AccelY     AccelZ     GyroX  \
0             1683151200009 -0.004225 -0.008822   9.963903 -0.005192   
1             1683151200093  0.009082  0.001522   9.954070 -0.004887   
2             1683151200176 -0.007552  0.017937   9.945915 -0.004581   
3             1683151200260  0.007993 -0.002378   9.947039 -0.004887   
4             1683151200344  0.000886 -0.006175   9.942305 -0.004887   
...                     ...       ...       ...        ...       ...   
923367        1683237599606 -0.263614 -0.684888  10.483509 -0.011301   
923368        1683237599690  0.292846 -0.648380  10.558532  0.005803   
923369        1683237599774 -0.310100 -0.803945  10.072253 -0.024435   
923370        1683237599858  0.314995 -0.718664   9.313916  0.003054   
923371        1683237599942  0.243990 -0.744337   9.984593  0.009468   

           GyroY     GyroZ  Temperature  
0      -0.003665 -0.006414         41.0  
1      -0.003971 -0.006414         41.0  
2      -0

In [6]:
# Save Shock Data for Set 3
# Save the DataFrame to a CSV file
data_shock3.to_csv('./CleanData/Set3/ShockSet3Data.csv', index=False)

print("DataFrame saved as ShockSet3Data.csv in the CleanData/Set3/ Folder.")

DataFrame saved as ShockSet3Data.csv in the CleanData/Set3/ Folder.


### Merging GPS and Shock Data Set 3 for further analysis

In [7]:
# GPS data set 1 merged with Shock Data set 1 on basis of 'Date' and 'Time'
combined_data_set3 = pd.merge(data_gps3, data_shock3, on=['UnixTimeStampInMsec'], how='inner')
print(combined_data_set3)

     UnixTimeStampInMsec            Datetime  Latitude  Longitude  Speed  \
0          1683151309000 2023-05-03 22:01:49   41.3650    2.17171   0.85   
1          1683151374000 2023-05-03 22:02:54   41.3650    2.17180   0.07   
2          1683151506000 2023-05-03 22:05:06   41.3649    2.17176   0.09   
3          1683151562000 2023-05-03 22:06:02   41.3649    2.17176   0.41   
4          1683151610000 2023-05-03 22:06:50   41.3650    2.17176   0.03   
..                   ...                 ...       ...        ...    ...   
882        1683237213000 2023-05-04 21:53:33   48.7559    6.09115  43.97   
883        1683237223000 2023-05-04 21:53:43   48.7569    6.09374  41.43   
884        1683237311000 2023-05-04 21:55:11   48.7604    6.11211  29.53   
885        1683237365000 2023-05-04 21:56:05   48.7618    6.12292  27.78   
886        1683237410000 2023-05-04 21:56:50   48.7632    6.13128  25.94   

     TrackAngle    AccelX    AccelY     AccelZ     GyroX     GyroY     GyroZ  \
0      

In [8]:
print(combined_data_set3.columns)

Index(['UnixTimeStampInMsec', 'Datetime', 'Latitude', 'Longitude', 'Speed',
       'TrackAngle', 'AccelX', 'AccelY', 'AccelZ', 'GyroX', 'GyroY', 'GyroZ',
       'Temperature'],
      dtype='object')


In [9]:
# Save the DataFrame to a CSV file
combined_data_set3.to_csv('./CleanData/Set3/MergedDataSet3.csv', index=False)

print("DataFrame saved as MergedDataSet3.csv in the CleanData/Set3 Folder.")

DataFrame saved as MergedDataSet3.csv in the CleanData/Set3 Folder.
