In [12]:
import pandas as pd
import locale
from datetime import datetime

# Function definitions to do various tasks
def translate_activity(activity_type):
    """Return the English label for a German activity type.

    Only "Laufen" is translated (to "Running"); every other value is
    returned unchanged.
    """
    return "Running" if activity_type == "Laufen" else activity_type


def create_date_obj(date_str):
    """Parse a German-locale timestamp string into a ``datetime``.

    The string must match ``"%a, %b %d, %Y %H:%M"`` with the weekday and
    month abbreviations of the German LC_TIME locale.

    Side effect: LC_TIME is switched to a German locale and is left that
    way after the call.

    Raises:
        locale.Error: if no German locale is available on this system.
        ValueError: if ``date_str`` does not match the expected format.
    """
    # 'deu_deu' is a Windows-only locale name, so it stays first to keep the
    # original behaviour there; the remaining names cover Linux and macOS.
    german_locale_names = ("deu_deu", "German_Germany", "de_DE.UTF-8", "de_DE.utf8", "de_DE")
    for locale_name in german_locale_names:
        try:
            locale.setlocale(locale.LC_TIME, locale_name)
        except locale.Error:
            continue
        break
    else:
        raise locale.Error(
            "no German locale available; tried: " + ", ".join(german_locale_names)
        )

    # Converting the string to a datetime object
    return datetime.strptime(date_str, "%a, %b %d, %Y %H:%M")


def convert_device_name(device_str):
    """Build the English device name from a raw device string.

    The last whitespace-separated token of ``device_str`` is appended to
    the fixed prefix "Vivoactive HR ".

    Missing values (NaN/None, i.e. anything that is not a ``str``) and
    blank strings are returned unchanged instead of raising, so that a
    later imputation step can still handle them.
    """
    # Nothing to split: pass missing or blank entries through untouched.
    if not isinstance(device_str, str) or not device_str.strip():
        return device_str
    return "Vivoactive HR " + device_str.split()[-1]


def remove_text_from_speed(speed_str):
    """Return the first whitespace-separated token of a speed string.

    The first token is kept and everything after the first run of
    whitespace is dropped.

    Missing values (NaN/None, i.e. anything that is not a ``str``) and
    blank strings are returned unchanged instead of raising, so that a
    later imputation step can still handle them.
    """
    # Nothing to split: pass missing or blank entries through untouched.
    if not isinstance(speed_str, str) or not speed_str.strip():
        return speed_str
    return speed_str.split()[0]

# Read the raw activity export into a DataFrame
df = pd.read_csv('activities_raw_kaggle.csv')

# Report the column count of the raw file, then list each column name
print(f"Columns in the initial csv:{df.shape[1]}")
for column in df:
    print(column)

# Clean each affected column in place with its helper function.
# The dict preserves insertion order, so columns are processed top to bottom.
column_transforms = {
    "Activity Type": translate_activity,
    "Begin Timestamp": create_date_obj,
    "End Timestamp": create_date_obj,
    "Average Speed": remove_text_from_speed,
    "Device": convert_device_name,
}
for target_column, transform in column_transforms.items():
    df[target_column] = df[target_column].apply(transform)

# Keep only the columns that are needed, skipping the columns that are repeated.

# TODO (WIP): column_of_interest_user = ["user_id", "name", "age", "gender", "height", "weight"]

column_of_interest_activity = [
    "Activity ID",
    "Activity Type",
    "Begin Timestamp",
    "End Timestamp",
    "Max. Elevation (Raw)",
    "Min. Elevation (Raw)",
    "Elevation Gain (Raw)",
    "Elevation Loss (Raw)",
    "Average Heart Rate (bpm)",
    "Max. Heart Rate (bpm).1",
    "Average Moving Speed",
    "Average Speed",
    "Max. Speed",
    "Distance (Raw)",
    "Duration (h:m:s)",
    "Moving Duration (h:m:s)",
    "Calories",
]

column_of_interest_geo = [
    "Device",
    "Begin Latitude (Decimal Degrees Raw)",
    "Begin Longitude (Decimal Degrees Raw)",
    "End Latitude (Decimal Degrees Raw)",
    "End Longitude (Decimal Degrees Raw)",
    "Temperature (Raw)",
    "Wind Speed (Raw)",
    "Wind Direction",
    "Humidity (Raw)",
    "Condition",
    "Rainfall",
]

# One DataFrame per column group
df_activity = df.loc[:, column_of_interest_activity]
df_geo = df.loc[:, column_of_interest_geo]

# Join the two groups side by side (column-wise), aligned on the shared row index
df_combined = pd.concat((df_activity, df_geo), axis="columns")

# Split the columns by dtype: numeric columns are imputed with the mean,
# object columns with their most frequent value (mode).
numeric_cols = df_combined.select_dtypes(include='number').columns
object_cols = df_combined.select_dtypes(include='object').columns


# Filling numeric columns with the mean.
# Coordinate columns get the exact mean; every other numeric column gets the
# mean passed through round(), i.e. an integer fill value.
lat_lon = ["Begin Latitude (Decimal Degrees Raw)", "Begin Longitude (Decimal Degrees Raw)",
           "End Latitude (Decimal Degrees Raw)", "End Longitude (Decimal Degrees Raw)"]
for column_name in numeric_cols:
    mean_value = df_combined[column_name].mean()
    if pd.isna(mean_value):
        # The column has no observed values: there is nothing to impute from,
        # and round(NaN) would raise ValueError. Leave the column as it is.
        continue
    fill_value = mean_value if column_name in lat_lon else round(mean_value)
    df_combined[column_name] = df_combined[column_name].fillna(fill_value)

# Filling object columns with the mode
for column_name in object_cols:
    modes = df_combined[column_name].mode()
    if modes.empty:
        # mode() is empty when every value is missing; indexing it would raise.
        continue
    df_combined[column_name] = df_combined[column_name].fillna(modes.iloc[0])

# List the columns that made it into the cleaned data
print(f"\n \nColumns in the new csv:{df_combined.shape[1]}")
for column in df_combined:
    print(column)

# Raw-file columns that are absent from the cleaned data, in raw-file order
omitted_columns = [
    raw_column for raw_column in df.columns
    if raw_column not in df_combined.columns
]

print(f"\n \nOmitted columns:{len(omitted_columns)}")
for column in omitted_columns:
    print(column)

# Write the cleaned data to a single CSV file, without the row index
df_combined.to_csv('activities_cleaned.csv', index=False)


Columns in the initial csv:47
Activity ID
Activity Name
Description
Begin Timestamp
Begin Timestamp (Raw Milliseconds)
End Timestamp
End Timestamp (Raw Milliseconds)
Device
Activity Parent
Activity Type
Event Type
Activity Time Zone
Max. Elevation
Max. Elevation (Raw)
Begin Latitude (Decimal Degrees Raw)
Begin Longitude (Decimal Degrees Raw)
End Latitude (Decimal Degrees Raw)
End Longitude (Decimal Degrees Raw)
Average Moving Speed
Average Moving Speed (Raw)
Max. Heart Rate (bpm)
Average Heart Rate (bpm)
Max. Speed
Max. Speed (Raw)
Calories
Calories (Raw)
Duration (h:m:s)
Duration (Raw Seconds)
Moving Duration (h:m:s)
Moving Duration (Raw Seconds)
Average Speed
Average Speed (Raw)
Distance
Distance (Raw)
Max. Heart Rate (bpm).1
Min. Elevation
Min. Elevation (Raw)
Elevation Gain
Elevation Gain (Raw)
Elevation Loss
Elevation Loss (Raw)
Temperature (Raw)
Wind Speed (Raw)
Wind Direction
Humidity (Raw)
Condition
Rainfall

 
Columns in the new csv:28
Activity ID
Activity Type
Begin Timestamp