In [2]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from sklearn.cluster import KMeans
import pykalman as pk

#### Load and clean GPS Data

In [3]:
# Load the GPS data from the .log file
data_gps1 = pd.read_csv('./Data/1/gps/gps_2023-04-20.log', sep=';')

# Data cleaning and formatting
data_gps1['Latitude'] = data_gps1['Latitude'].str.rstrip('N').astype(float)
data_gps1['Longitude'] = data_gps1['Longitude'].str.rstrip('E').astype(float)
data_gps1['Date'] = pd.to_datetime(data_gps1['Date'], format='%d/%m/%Y')
data_gps1['Time'] = pd.to_datetime(data_gps1['Time'], format='%H:%M:%S').dt.time

# Drop MagenticVariation column as it is not relevant to the problem
data_gps1.drop(columns=['MagneticVariation'], inplace=True)

# Filter rows where the date is '2023-04-20'
data_gps1 = data_gps1[data_gps1['Date'] == '2023-04-20']

print(data_gps1)


            Date      Time  Latitude  Longitude  Speed  TrackAngle
6938  2023-04-20  00:00:00   58.3098    15.1327   2.42      340.11
6939  2023-04-20  00:00:01   58.3098    15.1327   2.90      336.39
6940  2023-04-20  00:00:02   58.3098    15.1326   3.53      348.29
6941  2023-04-20  00:00:03   58.3099    15.1326   4.02      345.83
6942  2023-04-20  00:00:04   58.3099    15.1326   4.54      343.28
...          ...       ...       ...        ...    ...         ...
83740 2023-04-20  21:59:08   59.5635    17.8822   0.66      286.10
83741 2023-04-20  21:59:09   59.5635    17.8822   0.24      286.10
83742 2023-04-20  21:59:10   59.5635    17.8822   0.21      286.10
83743 2023-04-20  21:59:11   59.5635    17.8822   1.39      286.10
83744 2023-04-20  21:59:12   59.5635    17.8822   0.05      286.10

[76807 rows x 6 columns]


In [4]:
# Summary statistics
print("Summary Statistics:")
print(data_gps1.describe())

Summary Statistics:
                      Date      Latitude     Longitude         Speed  \
count                76807  76807.000000  76807.000000  76807.000000   
mean   2023-04-20 00:00:00     59.456417     17.416823      8.643521   
min    2023-04-20 00:00:00     56.903300     14.561400      0.000000   
25%    2023-04-20 00:00:00     59.402400     16.505200      0.260000   
50%    2023-04-20 00:00:00     59.563400     17.882000      0.470000   
75%    2023-04-20 00:00:00     59.563500     17.882100      1.000000   
max    2023-04-20 00:00:00     59.655000     17.927000     55.920000   
std                    NaN      0.261189      0.725303     17.414235   

         TrackAngle  
count  76807.000000  
mean     170.024356  
min        0.000000  
25%       72.460000  
50%      142.770000  
75%      287.450000  
max      359.990000  
std      113.454766  


In [5]:
# Save the DataFrame to a CSV file
data_gps1.to_csv('./CleanData/GPSSet1Data.csv', index=False)

print("DataFrame saved as GPSSet1Data.csv in the CleanDataFolder.")


DataFrame saved as GPSSet1Data.csv in the CleanDataFolder.


#### Load and clean Shock Data

In [6]:
import pandas as pd

# Load the Shock data
data_shock1 = pd.read_csv('./Data/1/shock/shock_2023-04-20.log', sep=';', header=None)

# Add column headers
data_shock1.columns = ['Timestamp', 'AccelX', 'AccelY', 'AccelZ', 'GyroX', 'GyroY', 'GyroZ', 'Temperature']

# Convert timestamp to datetime format
data_shock1['Timestamp'] = pd.to_datetime(data_shock1['Timestamp'], unit='ns')

# Extract Date and Time from Timestamp
data_shock1['Date'] = data_shock1['Timestamp'].dt.date
data_shock1['Time'] = data_shock1['Timestamp'].dt.strftime('%H:%M:%S')  # Format time as hh:mm:ss

# Drop the original Timestamp column
data_shock1.drop(columns=['Timestamp'], inplace=True)

# Reorder the columns (optional)
data_shock1 = data_shock1[['Date', 'Time', 'AccelX', 'AccelY', 'AccelZ', 'GyroX', 'GyroY', 'GyroZ', 'Temperature']]

# Display the cleaned data
print(data_shock1)


               Date      Time    AccelX    AccelY     AccelZ     GyroX  \
0        1970-01-01  00:28:01 -0.599752  0.065531  10.419545 -0.013439   
1        1970-01-01  00:28:01 -0.069945  0.000499  10.019079 -0.023518   
2        1970-01-01  00:28:01 -0.445631 -0.054700   9.785857 -0.022907   
3        1970-01-01  00:28:01 -0.442073 -0.130693   9.712997  0.000916   
4        1970-01-01  00:28:01 -0.094992  0.030634   9.786232  0.016799   
...             ...       ...       ...       ...        ...       ...   
1898344  1970-01-01  00:28:02  0.069265 -0.064725   9.843576 -0.000305   
1898345  1970-01-01  00:28:02  0.067974 -0.063557   9.842454 -0.000305   
1898346  1970-01-01  00:28:02  0.070460 -0.067118   9.843538  0.000000   
1898347  1970-01-01  00:28:02  0.065394 -0.063613   9.840264 -0.000305   
1898348  1970-01-01  00:28:02  0.065498 -0.069564   9.841603  0.000000   

            GyroY     GyroZ  Temperature  
0       -0.003665 -0.005803         29.0  
1        0.004581 -0.0076

In [7]:
print(data_shock1.describe())

             AccelX        AccelY        AccelZ         GyroX         GyroY  \
count  1.898349e+06  1.898349e+06  1.898349e+06  1.898349e+06  1.898349e+06   
mean   3.285397e-02 -5.813003e-02  9.845792e+00 -4.584672e-05 -2.074417e-03   
std    5.863487e-01  1.581588e-01  2.549591e-01  3.810172e-02  6.528746e-03   
min   -8.219378e+00 -3.059698e+00 -1.007930e-01 -5.778790e-01 -1.505780e-01   
25%   -1.202300e-02 -6.370400e-02  9.837987e+00 -3.050000e-04 -2.443000e-03   
50%    6.349500e-02 -5.995300e-02  9.841563e+00 -3.050000e-04 -2.138000e-03   
75%    7.016000e-02 -5.045300e-02  9.846917e+00  0.000000e+00 -1.833000e-03   
max    8.624488e+00  9.355698e+00  1.522790e+01  5.729920e-01  1.624900e-01   

              GyroZ   Temperature  
count  1.898349e+06  1.898346e+06  
mean  -6.253358e-03  2.893846e+01  
std    3.031473e-02  4.929150e+00  
min   -3.512470e-01  1.900000e+01  
25%   -6.414000e-03  2.400000e+01  
50%   -6.109000e-03  3.100000e+01  
75%   -6.109000e-03  3.300000e+01  


In [8]:
# Note: From this data we can observe that shock data record year is 1970. And GPS Data record date is 2023. It doesnot make sense.
# Hence converting the date to 2023-04-20

# Convert '1970-01-01' to '2023-04-20'
data_shock1['Date'] = data_shock1['Date'].apply(lambda x: x + pd.DateOffset(days=19467))

# Display the cleaned data
print(data_shock1)


              Date      Time    AccelX    AccelY     AccelZ     GyroX  \
0       2023-04-20  00:28:01 -0.599752  0.065531  10.419545 -0.013439   
1       2023-04-20  00:28:01 -0.069945  0.000499  10.019079 -0.023518   
2       2023-04-20  00:28:01 -0.445631 -0.054700   9.785857 -0.022907   
3       2023-04-20  00:28:01 -0.442073 -0.130693   9.712997  0.000916   
4       2023-04-20  00:28:01 -0.094992  0.030634   9.786232  0.016799   
...            ...       ...       ...       ...        ...       ...   
1898344 2023-04-20  00:28:02  0.069265 -0.064725   9.843576 -0.000305   
1898345 2023-04-20  00:28:02  0.067974 -0.063557   9.842454 -0.000305   
1898346 2023-04-20  00:28:02  0.070460 -0.067118   9.843538  0.000000   
1898347 2023-04-20  00:28:02  0.065394 -0.063613   9.840264 -0.000305   
1898348 2023-04-20  00:28:02  0.065498 -0.069564   9.841603  0.000000   

            GyroY     GyroZ  Temperature  
0       -0.003665 -0.005803         29.0  
1        0.004581 -0.007636         2

In [None]:
# Save the DataFrame to a CSV file
data_shock1.to_csv('./CleanData/ShockSet1Data.csv', index=False)

print("DataFrame saved as ShockSet1Data.csv in the CleanDataFolder.")


DataFrame saved as ShockSet1Data.csv in the CleanDataFolder.


In [8]:
# GPS data set 1 merged with shock data set 1 on basis of 'Date' and 'Time'
data_combined = pd.merge(data_gps1, data_shock1, on=['Date', 'Time'], how='inner')
print(data_combined)

MemoryError: Unable to allocate 1.06 TiB for an array with shape (145806491643,) and data type int64