In [1]:
import pandas as pd
import numpy as np
import boto3
import os
import datetime


In [2]:
# accessing keys from .env file
AWS_S3_BUCKET = os.getenv("AWS_S3_BUCKET")
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")


In [3]:
region_name = 'eu-west-1' 
bucket_name = 'csv-files-parquet'
file_key = 'dummy_boat_data.csv'

# Create an S3 client
s3 = boto3.client('s3', 
                  aws_access_key_id=AWS_ACCESS_KEY_ID, 
                  aws_secret_access_key=AWS_SECRET_ACCESS_KEY, 
                  region_name=region_name)

# Read CSV file from S3 into a Pandas DataFrame
try:
    # Use 's3.get_object' to get the object and 'pd.read_csv' to read it into a DataFrame
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    df = pd.read_csv(obj['Body'])
    
    # Now 'df' contains your CSV data
    print(df.head())  # Display the first few rows of the DataFrame
except Exception as e:
    print(f"Error: {e}")

    Timestamp speed_over_ground    Longitude   Latitude engine_fuel_rate
0  1675119600              3.08  -119.605263  35.162237            14.92
1  1675119601              4.11  -118.857776  35.228734            21.02
2  1675119602              4.04  -118.110278  35.295227            20.53
3  1675119603               3.5  -117.362786  35.361725            17.21
4  1675119604             ERROR  -116.615295  35.428215            17.38


In [None]:
def impor_csv(bucket, key):
s3 = boto3.client('s3', 
                  aws_access_key_id=AWS_ACCESS_KEY_ID, 
                  aws_secret_access_key=AWS_SECRET_ACCESS_KEY, 
                  region_name=region_name)

# Read CSV file from S3 into a Pandas DataFrame
try:
    # Use 's3.get_object' to get the object and 'pd.read_csv' to read it into a DataFrame
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    df = pd.read_csv(obj['Body'])
    
    # Now 'df' contains your CSV data
    print(df.head())  # Display the first few rows of the DataFrame
except Exception as e:
    print(f"Error: {e}")

In [4]:
# Testing to write to s3 bucket

# create a sample data frame

books_df = pd.DataFrame(
    data={"Title": ["Book I", "Book II", "Book III"], "Price": [56.6, 59.87,74.54]},
    columns=["Title", "Price"]
)

books_df.head()

Unnamed: 0,Title,Price
0,Book I,56.6
1,Book II,59.87
2,Book III,74.54


In [5]:
# entering the dataframe into another folder on AWS s3

books_df.to_parquet(
    "s3://new-parquet-files/books.parquet",
    index=False,
    storage_options={
        "key": AWS_ACCESS_KEY_ID,
        "secret":AWS_SECRET_ACCESS_KEY
    }
)

In [6]:
df.head()

Unnamed: 0,Timestamp,speed_over_ground,Longitude,Latitude,engine_fuel_rate
0,1675119600,3.08,-119.605263,35.162237,14.92
1,1675119601,4.11,-118.857776,35.228734,21.02
2,1675119602,4.04,-118.110278,35.295227,20.53
3,1675119603,3.5,-117.362786,35.361725,17.21
4,1675119604,ERROR,-116.615295,35.428215,17.38


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Timestamp          990 non-null    object
 1   speed_over_ground  990 non-null    object
 2   Longitude          990 non-null    object
 3   Latitude           990 non-null    object
 4   engine_fuel_rate   990 non-null    object
dtypes: object(5)
memory usage: 39.2+ KB


In [8]:
#class CsvCleaner:
    #@staticmethod
def timestamp_clean(df, col_name):
    # convert the column to numeric with any errors(for example strings or letter) to NaN
    df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
    
    # Interpolate NaN values in the timestamp column
    df[col_name] = df[col_name].interpolate()

    # Calculate the initial number of rows
    initial_rows = df.shape[0]
    
    #Filter out rows with "Timestamp" values not containing 10 digits
    df = df[df[col_name].apply(lambda x: len(str(int(x))) == 10)]

    #calculate how many rows removed
    rows_removed = initial_rows - df.shape[0]
    
    # Print the number of rows removed
    print(f"{rows_removed} rows were removed.")

    # Convert the Unix timestamp to datetime with seconds
    df[col_name] = pd.to_datetime(df[col_name], unit="s")

    # Sort the DataFrame by the timestamp column
    df = df.sort_values(by=col_name)

    return df

In [9]:
cleaned_df = timestamp_clean(df, "Timestamp")

5 rows were removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = pd.to_datetime(df[col_name], unit="s")


In [10]:
cleaned_df

Unnamed: 0,Timestamp,speed_over_ground,Longitude,Latitude,engine_fuel_rate
0,2023-01-30 23:00:00,3.08,-119.605263,35.162237,14.92
1,2023-01-30 23:00:01,4.11,-118.857776,35.228734,21.02
2,2023-01-30 23:00:02,4.04,-118.110278,35.295227,20.53
3,2023-01-30 23:00:03,3.5,-117.362786,35.361725,17.21
4,2023-01-30 23:00:04,ERROR,-116.615295,35.428215,17.38
...,...,...,...,...,...
995,2023-01-30 23:16:35,16.18,89.499749,51.884541,81.06
996,2023-01-30 23:16:36,16.26,89.629043,ERROR,81.49
997,2023-01-30 23:16:37,,89.758342,53.029681,82.7
998,2023-01-30 23:16:38,15.93,89.887628,53.602249,79.4


In [11]:
print(cleaned_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 995 entries, 0 to 999
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Timestamp          995 non-null    datetime64[ns]
 1   speed_over_ground  985 non-null    object        
 2   Longitude          985 non-null    object        
 3   Latitude           985 non-null    object        
 4   engine_fuel_rate   985 non-null    object        
dtypes: datetime64[ns](1), object(4)
memory usage: 46.6+ KB
None


In [12]:
def clean_columns(df, col_name, low, high):
    
    # Convert column to numeric, making errors to Nan instead
    df[col_name] = pd.to_numeric(df[col_name], errors="coerce")

    # Calculate the initial number of rows
    initial_rows = df.shape[0]

    df = df[(df[col_name] >= low) & (df[col_name] <= high)]

    # Calculate the number of rows removed
    rows_removed = initial_rows - df.shape[0]

    # Print the number of rows removed
    print(f"{rows_removed} rows were removed.")

    df[col_name] = df[col_name].interpolate()

    return df


In [13]:
column = clean_columns(cleaned_df, "Longitude", -180, 180)

25 rows were removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = df[col_name].interpolate()


In [14]:
column.info()

<class 'pandas.core.frame.DataFrame'>
Index: 970 entries, 0 to 999
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Timestamp          970 non-null    datetime64[ns]
 1   speed_over_ground  961 non-null    object        
 2   Longitude          970 non-null    float64       
 3   Latitude           960 non-null    object        
 4   engine_fuel_rate   960 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 45.5+ KB


In [15]:
column.loc[(column["Longitude"] < -180) | (column["Longitude"] > 180)]

Unnamed: 0,Timestamp,speed_over_ground,Longitude,Latitude,engine_fuel_rate


In [16]:
print(f"Number of Cells with 'ERROR' in Longitude: {(column['Longitude'] == 'ERROR').sum()}")


Number of Cells with 'ERROR' in Longitude: 0


In [17]:
if column["Longitude"].isnull().any():
    print("There are NaN values in the column.")
else:
    print("There are no NaN values in the column.")

There are no NaN values in the column.


In [18]:
speed_over_ground = clean_columns(column, "speed_over_ground", 0,20)

23 rows were removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = df[col_name].interpolate()


In [19]:
speed_over_ground

Unnamed: 0,Timestamp,speed_over_ground,Longitude,Latitude,engine_fuel_rate
0,2023-01-30 23:00:00,3.08,-119.605263,35.162237,14.92
1,2023-01-30 23:00:01,4.11,-118.857776,35.228734,21.02
2,2023-01-30 23:00:02,4.04,-118.110278,35.295227,20.53
3,2023-01-30 23:00:03,3.50,-117.362786,35.361725,17.21
5,2023-01-30 23:00:05,3.62,-115.867809,35.494706,17.78
...,...,...,...,...,...
994,2023-01-30 23:16:34,16.27,89.370456,51.31197,81.66
995,2023-01-30 23:16:35,16.18,89.499749,51.884541,81.06
996,2023-01-30 23:16:36,16.26,89.629043,ERROR,81.49
998,2023-01-30 23:16:38,15.93,89.887628,53.602249,79.4


In [20]:
speed_over_ground.loc[(speed_over_ground["speed_over_ground"] < 0) | (column["speed_over_ground"] > 20)]

Unnamed: 0,Timestamp,speed_over_ground,Longitude,Latitude,engine_fuel_rate


In [21]:
print(f"Number of Cells with 'ERROR' in speed over ground: {(speed_over_ground['speed_over_ground'] == 'ERROR').sum()}")

if speed_over_ground["speed_over_ground"].isnull().any():
    print("There are NaN values in the column.")
else:
    print("There are no NaN values in the column.")

Number of Cells with 'ERROR' in speed over ground: 0
There are no NaN values in the column.


In [39]:
class CsvCleaner:
    @staticmethod
    def timestamp_clean(df, col_name):
        # convert the column to numeric with any errors(for example strings or letter) to NaN
        df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
        
        # Interpolate NaN values in the timestamp column
        df[col_name] = df[col_name].interpolate()

        # Calculate the initial number of rows
        initial_rows = df.shape[0]
        
        #Filter out rows with "Timestamp" values not containing 10 digits
        df = df[df[col_name].apply(lambda x: len(str(int(x))) == 10)]

        #calculate how many rows removed
        rows_removed = initial_rows - df.shape[0]

        # Convert the Unix timestamp to datetime with seconds
        df[col_name] = pd.to_datetime(df[col_name], unit="s")

        # Sort the DataFrame by the timestamp column
        df = df.sort_values(by=col_name)

        return df, rows_removed
    
    @staticmethod
    def clean_columns(df, col_name, low, high):
    
        # Convert column to numeric, making errors to Nan instead
        df[col_name] = pd.to_numeric(df[col_name], errors="coerce")

        # Calculate the initial number of rows
        initial_rows = df.shape[0]

        df = df[(df[col_name] >= low) & (df[col_name] <= high)]

        # Calculate the number of rows removed
        rows_removed = initial_rows - df.shape[0]

        df[col_name] = df[col_name].interpolate()

        return df, rows_removed
    
    @staticmethod
    def clean_file(csv_file):
        # Read the CSV file into a pandas DataFrame
        df = pd.read_csv(csv_file)
        total_rows_removed = 0

        # Clean the DataFrame
        for col in df.columns:
            if "Timestamp" in col:
                df, rows_removed = CsvCleaner.timestamp_clean(df, col)
                total_rows_removed += rows_removed
            
            if "speed_over_ground" in col:
                low = 0
                high = 100
                df, rows_removed = CsvCleaner.clean_columns(df, col, low, high)
                total_rows_removed += rows_removed
            
            if "Longitude" in col:
                low = -180
                high = 180
                df, rows_removed = CsvCleaner.clean_columns(df, col, low, high)
                total_rows_removed += rows_removed

            if "Latitude" in col:
                low = -90
                high = 90
                df, rows_removed = CsvCleaner.clean_columns(df, col, low, high)
                total_rows_removed += rows_removed

            if "engine_fuel_rate" in col:
                low = 0
                high = 100
                df,rows_removed = CsvCleaner.clean_columns(df, col, low, high)
                total_rows_removed += rows_removed

        # Save the cleaned DataFrame as a Parquet file
        cleaned_parquet_file = csv_file.replace(".csv", ".parquet")
        df.to_parquet(cleaned_parquet_file, index=False)

        print(f"Total rows removed: {total_rows_removed}")

        return cleaned_parquet_file

In [40]:
cleaned_file = CsvCleaner.clean_file("../src/Data/dummy_boat_data.csv")

Total rows removed: 97


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = pd.to_datetime(df[col_name], unit="s")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = df[col_name].interpolate()


In [41]:
pd.read_parquet(cleaned_file, engine="pyarrow")

Unnamed: 0,Timestamp,speed_over_ground,Longitude,Latitude,engine_fuel_rate
0,2023-01-30 23:00:00,3.08,-119.605263,35.162237,14.92
1,2023-01-30 23:00:01,4.11,-118.857776,35.228734,21.02
2,2023-01-30 23:00:02,4.04,-118.110278,35.295227,20.53
3,2023-01-30 23:00:03,3.50,-117.362786,35.361725,17.21
4,2023-01-30 23:00:05,3.62,-115.867809,35.494706,17.78
...,...,...,...,...,...
898,2023-01-30 23:16:33,16.15,89.241158,50.739405,80.98
899,2023-01-30 23:16:34,16.27,89.370456,51.311970,81.66
900,2023-01-30 23:16:35,16.18,89.499749,51.884541,81.06
901,2023-01-30 23:16:38,15.93,89.887628,53.602249,79.40
