In [25]:
import pandas as pd

# Column labels applied to the loaded frame (passed to read_csv via `names`)
column_names = [
    'ep (ms)',
    'Acc_x', 'Acc_y', 'Acc_z',
    'Gyro_x', 'Gyro_y', 'Gyro_z',
    'ID', 'Label', 'Category', 'Set',
]

# Load the semicolon-delimited file. skiprows=1 drops the file's first line
# (presumably its own header row) so the labels above are used instead.
df = pd.read_csv('data.csv', sep=';', skiprows=1, names=column_names)



In [26]:
# Show the loaded frame as the cell output for a visual check of the parsed columns
df

Unnamed: 0,ep (ms),Acc_x,Acc_y,Acc_z,Gyro_x,Gyro_y,Gyro_z,ID,Label,Category,Set
0,2019-01-11 15:08:05.200,0.0135,0.977,-0.071,-2.094.366.723,257.720.316,0.9388000000000002,B,bench,heavy,30.0
1,2019-01-11 15:08:05.400,-0.0014999999999999996,0.9704999999999999,-0.07949999999999999,-16.826,-0.8904,21.708,B,bench,heavy,30.0
2,2019-01-11 15:08:05.600,0.0013333333333333333,0.9716666666666667,-0.06433333333333334,526.942.212,-0.2559999999999999,-14.146,B,bench,heavy,30.0
3,2019-01-11 15:08:05.800,-0.024,0.957,-0.0735,8.061,-45.244,-2.073,B,bench,heavy,30.0
4,2019-01-11 15:08:06.000,-0.027999999999999997,0.9576666666666666,-0.115,2.439,-15.486,-36.098,B,bench,heavy,30.0
...,...,...,...,...,...,...,...,...,...,...,...
9004,2019-01-20 17:33:27.000,-0.048,-10.415,-0.0765,14.146,-56.218,0.2926,E,row,medium,40.0
9005,2019-01-20 17:33:27.200,-0.037,1.151.994.539,-0.05333333333333334,-27.684,-0.5854,-1.530.757.122,E,row,medium,40.0
9006,2019-01-20 17:33:27.400,-0.06,274.571.263,-0.08199999999999999,-1.846.542.330,-51.342,-0.12200000000000003,E,row,medium,40.0
9007,2019-01-20 17:33:27.600,-0.03866666666666666,-1.295.977.131,-0.04466666666666667,-0.2318,0.2562,-765.378.561,E,row,medium,40.0


In [32]:
# Coerce the measurement columns to a numeric dtype. Each value is turned into a
# stripped string first; anything that still cannot be parsed becomes NaN
# (errors='coerce') instead of raising.
for column in ('Acc_x', 'Acc_y', 'Acc_z', 'Gyro_x', 'Gyro_y', 'Gyro_z', 'Set'):
    df[column] = pd.to_numeric(
        df[column].fillna('').astype(str).str.strip(),
        errors='coerce',
    )

# Show the dtypes after conversion
print(df.dtypes)



ep (ms)      object
Acc_x       float64
Acc_y       float64
Acc_z       float64
Gyro_x      float64
Gyro_y      float64
Gyro_z      float64
ID           object
Label        object
Category     object
Set         float64
dtype: object


In [38]:
def calculate_mean(data):
    """Return the arithmetic mean of the values in ``data``.

    Args:
        data: Any iterable of numbers (list, tuple, generator, ...).

    Returns:
        The sum of the values divided by their count.

    Raises:
        ValueError: If ``data`` yields no values.
    """
    # Materialise once so one-shot iterables can be both summed and counted.
    values = list(data)
    if not values:
        raise ValueError("calculate_mean() requires at least one value")
    return sum(values) / len(values)

def calculate_median(data):
    """Return the median of the values in ``data``.

    The values are sorted; an odd count returns the middle element, an even
    count returns the average of the two middle elements.

    Args:
        data: Any iterable of mutually comparable numbers.

    Returns:
        The middle value, or the mean of the two middle values.

    Raises:
        ValueError: If ``data`` yields no values.
    """
    ordered = sorted(data)
    count = len(ordered)
    if count == 0:
        # Without this guard the even branch would index an empty list.
        raise ValueError("calculate_median() requires at least one value")

    middle = count // 2
    if count % 2:
        return ordered[middle]
    return (ordered[middle - 1] + ordered[middle]) / 2

def calculate_mode(data):
    """Return the most frequent value(s) in ``data``.

    Args:
        data: Any iterable of hashable values.

    Returns:
        The single most frequent value, or, when several values tie for the
        highest frequency, a list of them in order of first appearance.

    Raises:
        ValueError: If ``data`` yields no values.
    """
    from collections import Counter

    # iter() makes Counter count elements one by one even when ``data`` is a
    # mapping (Counter would otherwise read a mapping's values as counts).
    frequency = Counter(iter(data))
    if not frequency:
        raise ValueError("calculate_mode() requires at least one value")

    max_frequency = max(frequency.values())
    modes = [value for value, count in frequency.items() if count == max_frequency]

    return modes if len(modes) > 1 else modes[0]

def calculate_central_tendencies(data):
    """Collect the mean, median and mode of ``data`` in one dictionary.

    ``data`` is handed to calculate_mean, calculate_median and calculate_mode
    in that order, so it must be iterable more than once.

    Returns:
        A dict with the keys 'Mean', 'Median' and 'Mode'.
    """
    return {
        'Mean': calculate_mean(data),
        'Median': calculate_median(data),
        'Mode': calculate_mode(data),
    }
    
    

In [40]:
# Drop missing readings and convert Acc_z to a plain list for the pure-Python helpers
column_data = df['Acc_z'].dropna().tolist() 
# Mean / median / mode of Acc_z as a dict
central_tendencies = calculate_central_tendencies(column_data)

print(central_tendencies)

{'Mean': 0.10777982144180188, 'Median': -0.031, 'Mode': [-0.095, -0.125, -0.12]}


In [41]:
# Same statistics computed with pandas' built-in methods, for the Acc_z column
mean_value = df['Acc_z'].mean()  # pandas mean of Acc_z
median_value = df['Acc_z'].median()  # pandas median of Acc_z
mode_value = df['Acc_z'].mode()  # pandas mode(s) of Acc_z, returned as a Series

print(mean_value)
print(median_value)
print(mode_value)

0.10777982144180188
-0.031
0   -0.125
1   -0.120
2   -0.095
Name: Acc_z, dtype: float64


In [42]:
def calculate_quintiles(data):
    """Return the minimum, four quintile cut points and maximum of ``data``.

    The cut points are taken directly from the sorted values at index
    ``int(n * p)`` for p = 0.2, 0.4, 0.6 and 0.8. No interpolation between
    neighbouring values is done, so results can differ from interpolating
    quantile implementations.

    Args:
        data: Any iterable of mutually comparable values.

    Returns:
        A dict with keys 'Q0' (minimum), 'Q1'..'Q4' (cut points) and
        'Q5' (maximum).

    Raises:
        ValueError: If ``data`` yields no values.
    """
    sorted_data = sorted(data)
    n = len(sorted_data)
    if n == 0:
        raise ValueError("calculate_quintiles() requires at least one value")

    quintiles = {'Q0': sorted_data[0]}
    # int(n * fraction) is at most n - 1 for these fractions, so it is always a valid index.
    for rank, fraction in enumerate((0.2, 0.4, 0.6, 0.8), start=1):
        quintiles[f'Q{rank}'] = sorted_data[int(n * fraction)]
    quintiles['Q5'] = sorted_data[-1]

    return quintiles

In [43]:
# Drop missing readings and convert Acc_z to a plain list for the pure-Python helper
column_data = df['Acc_z'].dropna().tolist() 

# Dict of cut points for Acc_z, keyed 'Q0' .. 'Q5'
quantiles = calculate_quintiles(column_data)

print(quantiles)

{'Q0': -0.6193333333333334, 'Q1': -0.149, 'Q2': -0.0706666666666666, 'Q3': 0.062, 'Q4': 0.423, 'Q5': 10.175}


In [52]:
import pandas as pd

def display_informations(df, column_name):
    """Print missing-value and unique-value statistics for one column.

    Args:
        df: DataFrame to inspect.
        column_name: Label of the column to summarise.

    Raises:
        ValueError: If ``column_name`` is not among ``df.columns``.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")

    series = df[column_name]
    n_rows = len(series)
    n_missing = series.isna().sum()
    pct_missing = (n_missing / n_rows) * 100

    # Build the whole report first, then emit it in one go
    report = [
        f"General informations  for '{column_name}':",
        f"Number of missing values: {n_missing}",
        f"Percentage of missing values: {pct_missing:.2f}%",
        f"Number of unique values: {series.nunique()}",
        f"Unique values: {series.unique()}",
    ]
    print("\n".join(report))



In [54]:
display_informations(df, 'Acc_z')  # Summary for Acc_z; pass another column name to inspect it instead


General informations  for 'Acc_z':
Number of missing values: 11
Percentage of missing values: 0.12%
Number of unique values: 3280
Unique values: [-0.071      -0.0795     -0.06433333 ...  0.2385      0.114
  0.098     ]
