In [1]:

# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from sklearn.cluster import KMeans
from pykalman import KalmanFilter



## Performing data cleaning steps on the second set of GPS and Shock Data

#### Load and Analyze second set GPS Data

In [2]:
import pandas as pd

# Load GPS data with path
with open('./Data/2/GPS/gps_2023-05-10.log', 'r') as file:
    lines = file.readlines()

# Filtering out clean data 
cleaned_data = []
for line in lines:
    line = line.strip()
    if line:
        # Split the line by semicolon ';'
        data_elements = line.split(';')
        # Check if the line has exactly 7 elements and ends with a '/'
        if len(data_elements) == 7 and data_elements[-1].strip() == '/':
            # Append to the cleaned_data
            cleaned_data.append(data_elements[:-1])  # Discard the last element (slash)

# Create a DataFrame from the cleaned data
column_names = ['Date', 'Time', 'Latitude', 'Longitude', 'Speed', 'TrackAngle']
data_gps2 = pd.DataFrame(cleaned_data, columns=column_names)

# Data cleaning and formatting
data_gps2['Latitude'] = data_gps2['Latitude'].str.rstrip('N').astype(float)
data_gps2['Longitude'] = data_gps2['Longitude'].str.rstrip('E').astype(float)
data_gps2['Date'] = pd.to_datetime(data_gps2['Date'], format='%d/%m/%Y')
data_gps2['Time'] = pd.to_datetime(data_gps2['Time'], format='%H:%M:%S').dt.time

# Filter rows where the date is '2023-05-10'
data_gps2 = data_gps2[data_gps2['Date'] == '2023-05-10']

print(data_gps2)




            Date      Time  Latitude  Longitude Speed TrackAngle
3311  2023-05-10  00:00:00   53.9432    10.8564   0.1      75.36
3312  2023-05-10  00:00:01   53.9432    10.8564  0.03      75.36
3313  2023-05-10  00:00:02   53.9432    10.8564  0.02      75.36
3314  2023-05-10  00:00:03   53.9432    10.8564  0.09      75.36
3315  2023-05-10  00:00:04   53.9432    10.8564  0.05      75.36
...          ...       ...       ...        ...   ...        ...
68912 2023-05-10  21:59:54   53.8353    10.5099   0.6     203.21
68913 2023-05-10  21:59:55   53.8353    10.5099  0.22     203.21
68914 2023-05-10  21:59:56   53.8353    10.5099  0.31     203.21
68915 2023-05-10  21:59:57   53.8353    10.5099  0.38     203.21
68916 2023-05-10  21:59:58   53.8353    10.5099  0.45     203.21

[65606 rows x 6 columns]


In [3]:
# Summary statistics
print("Summary Statistics:")
print(data_gps2.describe())

Summary Statistics:
                      Date      Latitude     Longitude
count                65606  65606.000000  65606.000000
mean   2023-05-10 00:00:00     53.872813     10.631215
min    2023-05-10 00:00:00     53.820400     10.494500
25%    2023-05-10 00:00:00     53.835100     10.509800
50%    2023-05-10 00:00:00     53.835200     10.509900
75%    2023-05-10 00:00:00     53.943200     10.856400
max    2023-05-10 00:00:00     53.943300     10.856500
std                    NaN      0.051281      0.164097


#### Load and Analyze second set Shock Data

In [4]:
import pandas as pd

# Replace 'your_data.txt' with the path to your data file
with open('./Data/2/shock/shock_2023-05-10.log', 'r') as file:
    lines = file.readlines()

# Clean and process the lines
cleaned_lines = []
for line_idx, line in enumerate(lines):
    line = line.strip()
    if line:
        # Split the line by semicolon ';'
        data_elements = line.split(';')
        # Check if the line has 8 elements (7 data columns + 1 timestamp)
        if len(data_elements) == 8:
            # Check if all elements in the row are numeric
            if all(pd.to_numeric(pd.Series(data_elements), errors='coerce').notnull()):
                # If all elements are numeric, keep the row
                cleaned_lines.append(data_elements)
            else:
                print(f"Row {line_idx+1} has non-numeric data. Discarding the row.")
        else:
            print(f"Row {line_idx+1} does not have the correct number of columns. Discarding the row.")

# Check if the first column name is 'Timestamp'
if cleaned_lines[0][0] != 'Timestamp':
    # Change the column names accordingly
    column_names = ['Timestamp', 'AccelX', 'AccelY', 'AccelZ', 'GyroX', 'GyroY', 'GyroZ', 'Temperature']
else:
    column_names = cleaned_lines[0]

# Create a DataFrame from the cleaned data
data_shock2 = pd.DataFrame(cleaned_lines[1:], columns=column_names)

# Convert the 'Timestamp' column to pandas Timestamp type (float64)
data_shock2['Timestamp'] = pd.to_numeric(data_shock2['Timestamp'], errors='coerce')


Row 1 has non-numeric data. Discarding the row.
Row 1994 does not have the correct number of columns. Discarding the row.
Row 8850 does not have the correct number of columns. Discarding the row.
Row 9936 does not have the correct number of columns. Discarding the row.
Row 24329 does not have the correct number of columns. Discarding the row.
Row 346150 does not have the correct number of columns. Discarding the row.


In [5]:
print(data_shock2)

           Timestamp     AccelX     AccelY    AccelZ     GyroX      GyroY  \
0       1.683670e+12   0.002776   0.000516  9.923157  0.009163  -0.003360   
1       1.683670e+12   0.000230   0.000377  9.919678  0.008858  -0.003971   
2       1.683670e+12  -0.000812   0.002910  9.923219  0.009163  -0.003665   
3       1.683670e+12   0.002775   0.001711  9.923110  0.009163  -0.003665   
4       1.683670e+12   0.003819  -0.003213  9.919663  0.009163  -0.003665   
...              ...        ...        ...       ...       ...        ...   
853042  1.683756e+12   0.005431  -0.008674  9.928580  0.009163  -0.003971   
853043  1.683756e+12   0.002988  -0.004009  9.926007  0.009468  -0.003971   
853044  1.683756e+12   0.002942  -0.008904  9.923849  0.009163  -0.004276   
853045  1.683756e+12   0.000597  -0.006398  9.926171  0.008858  -0.003971   
853046  1.683756e+12   0.001769  -0.007651  9.925010  0.009163  -0.003971   

            GyroZ Temperature  
0       -0.009468   36.000000  
1       -0.