In [1]:

# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from sklearn.cluster import KMeans
from pykalman import KalmanFilter



# Performing data cleaning steps on the first set of GPS and Shock Data

### Load and Analyze first set of GPS data

In [2]:
# Load the GPS data
data_gps1 = pd.read_csv('./Data/1/gps/gps_2023-04-20.log', sep=';')

# Data unavailable for Magnetic Variation; so dropping the column
data_gps1.drop(columns='MagneticVariation', inplace=True)

# To check how the data looks after being loaded in pandas dataframe
data_gps1.head()



Unnamed: 0,Date,Time,Latitude,Longitude,Speed,TrackAngle
0,19/04/2023,22:02:10,56.9022N,14.5603E,40.51,27.42
1,19/04/2023,22:02:11,56.9023N,14.5604E,40.44,26.85
2,19/04/2023,22:02:12,56.9025N,14.5606E,40.49,26.84
3,19/04/2023,22:02:13,56.9027N,14.5607E,40.21,27.04
4,19/04/2023,22:02:14,56.9028N,14.5609E,40.45,26.91


In [3]:
# To check the number of data rows finally loaded
data_gps1.shape[0]

83745

### Load and Analyze first set of Shock data

In [4]:
# Load the Shock data
data_shock1 = pd.read_csv('./Data/1/shock/shock_2023-04-20.log', sep=';', header=None)

# Add missing column headers
data_shock1.columns = ['Timestamp', 'AccelX', 'AccelY', 'AccelZ', 'GyroX', 'GyroY', 'GyroZ', 'Temperature']

# Check data after column headers were added
data_shock1.head()

Unnamed: 0,Timestamp,AccelX,AccelY,AccelZ,GyroX,GyroY,GyroZ,Temperature
0,1681941595585,-0.599752,0.065531,10.419545,-0.013439,-0.003665,-0.005803,29.0
1,1681941740571,-0.069945,0.000499,10.019079,-0.023518,0.004581,-0.007636,29.0
2,1681941740617,-0.445631,-0.0547,9.785857,-0.022907,-0.002138,-0.00733,29.0
3,1681941740662,-0.442073,-0.130693,9.712997,0.000916,-0.001833,-0.007025,29.0
4,1681941740708,-0.094992,0.030634,9.786232,0.016799,0.000916,-0.00672,29.0


# Performing data cleaning steps on the second set of GPS and Shock Data

### Load and Analyze second set GPS Data

In [5]:
import pandas as pd

# Load GPS data with path
with open('./Data/2/GPS/gps_2023-05-10.log', 'r') as file:
    lines = file.readlines()

# Filtering out clean data 
cleaned_data = []
for line in lines:
    line = line.strip()
    if line:
        # Split the line by semicolon ';'
        data_elements = line.split(';')
        # Check if the line has exactly 7 elements and ends with a '/'
        if len(data_elements) == 7 and data_elements[-1].strip() == '/':
            # Append to the cleaned_data
            cleaned_data.append(data_elements[:-1])  # Discard the last element (slash)

# Create a DataFrame from the cleaned data
column_names = ['Date', 'Time', 'Latitude', 'Longitude', 'Speed', 'TrackAngle']
data_gps2 = pd.DataFrame(cleaned_data, columns=column_names)

# Check final dataframe structure
data_gps2.head()



Unnamed: 0,Date,Time,Latitude,Longitude,Speed,TrackAngle
0,09/05/2023,22:01:00,53.9432N,10.8564E,0.16,0
1,09/05/2023,22:01:01,53.9432N,10.8564E,0.16,0
2,09/05/2023,22:01:03,53.9432N,10.8564E,0.1,0
3,09/05/2023,22:01:04,53.9432N,10.8564E,0.14,0
4,09/05/2023,22:01:05,53.9432N,10.8564E,0.14,0


In [6]:
# To check the number of records finally filtered out
data_gps2.shape[0]


68917

### Load and Analyze second set Shock Data

In [7]:
# Replace 'your_data.txt' with the path to your data file
with open('./Data/2/shock/shock_2023-05-10.log', 'r') as file:
    lines = file.readlines()

# Clean and process the lines
cleaned_lines = []
for line_idx, line in enumerate(lines):
    line = line.strip()
    if line:
        # Split the line by semicolon ';'
        data_elements = line.split(';')
        # Check if the line has 8 elements (7 data columns + 1 timestamp)
        if len(data_elements) == 8:
            # Check if all elements in the row are numeric
            if all(pd.to_numeric(data_elements, errors='coerce').notnull()):
                # If all elements are numeric, keep the row
                cleaned_lines.append(data_elements)
            else:
                print(f"Row {line_idx+1} has non-numeric data. Discarding the row.")
        else:
            print(f"Row {line_idx+1} does not have the correct number of columns. Discarding the row.")

# Check if the first column name is 'Timestamp'
if cleaned_lines[0][0] != 'Timestamp':
    # Change the column names accordingly
    column_names = ['Timestamp', 'AccelX', 'AccelY', 'AccelZ', 'GyroX', 'GyroY', 'GyroZ', 'Temperature']
else:
    column_names = cleaned_lines[0]

# Create a DataFrame from the cleaned data
data_shock2 = pd.DataFrame(cleaned_lines[1:], columns=column_names)

# Convert the 'Timestamp' column to numeric (integer)
data_shock2['Timestamp'] = pd.to_numeric(data_shock2['Timestamp'])



AttributeError: 'numpy.ndarray' object has no attribute 'notnull'

In [9]:
import pandas as pd

# Replace 'your_data.txt' with the path to your data file
with open('./Data/2/shock/shock_2023-05-10.log', 'r') as file:
    lines = file.readlines()

# Clean and process the lines
cleaned_lines = []
for line_idx, line in enumerate(lines):
    line = line.strip()
    if line:
        # Split the line by semicolon ';'
        data_elements = line.split(';')
        # Check if the line has 8 elements (7 data columns + 1 timestamp)
        if len(data_elements) == 8:
            # Convert the row to a pandas Series and check if all elements are numeric
            if all(pd.to_numeric(data_elements, errors='coerce').notnull()):
                # If all elements are numeric, keep the row
                cleaned_lines.append(data_elements)
            else:
                print(f"Row {line_idx+1} has non-numeric data. Discarding the row.")
        else:
            print(f"Row {line_idx+1} does not have the correct number of columns. Discarding the row.")

# Check if the first column name is 'Timestamp'
if cleaned_lines[0][0] != 'Timestamp':
    # Change the column names accordingly
    column_names = ['Timestamp', 'AccelX', 'AccelY', 'AccelZ', 'GyroX', 'GyroY', 'GyroZ', 'Temperature']
else:
    column_names = cleaned_lines[0]

# Create a DataFrame from the cleaned data
data_df = pd.DataFrame(cleaned_lines[1:], columns=column_names)

# Convert the 'Timestamp' column to numeric (integer)
data_df['Timestamp'] = pd.to_numeric(data_df['Timestamp'])

# Now you can use the 'data_df' DataFrame for further analysis and processing


AttributeError: 'numpy.ndarray' object has no attribute 'notnull'