In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import os


#### Import with ID

In [2]:
def preprocess_the_whole_stage(folder_path):
    # create an empty list to store the DataFrames
    dfs = []

    # loop over all files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith(".csv"):
            # extract the ID from the filename (assuming the filename is in the format "ID.csv")
            file_id = os.path.splitext(filename)[0]

            # read the CSV file into a DataFrame and add the ID as a new column
            df = pd.read_csv(os.path.join(folder_path, filename))
            df.insert(0, 'ID', file_id)

            # append the DataFrame to the list
            dfs.append(df)

    # concatenate all DataFrames into a single DataFrame
    full_dataset = pd.concat(dfs)
    
    # print the resulting DataFrame
    print(full_dataset.head())
    
    # Add all zero class
    condition = (full_dataset.StartHesitation == 0) & (full_dataset.Turn == 0) & (full_dataset.Walking == 0)
    condition_2 = (full_dataset.StartHesitation == 1) | (full_dataset.Turn == 1) | (full_dataset.Walking == 1)
    
    full_dataset.loc[condition, 'All_zero'] = 1
    full_dataset.loc[condition_2, 'All_zero'] = 0
    print(full_dataset.head())
    
    print("Cleaning the Dataset")
    if 'Valid' in full_dataset.columns:
        remove_col = ['ID','Time', 'Valid', 'Task']
        
    else:
        remove_col = ['ID','Time']
    print(f"The remove columns : {remove_col}")
    clean_dataset = full_dataset.drop(full_dataset[remove_col],axis=1)
    print(clean_dataset.head())
    
    # search duplication
    print(f"Search for Duplication : {clean_dataset.duplicated().sum()}")
    clean_dataset.drop_duplicates(inplace=True)
    print(f"Search for Duplication : {clean_dataset.duplicated().sum()}")
    print(clean_dataset.head())
    
    print("Checking conditon")
    condition = (clean_dataset.StartHesitation == 0) & (clean_dataset.Turn == 0 ) & (clean_dataset.Walking == 0)
    total_zero = clean_dataset[condition].shape[0]
    print(f"Total number where three class are zero: {total_zero}")
    All_zero = clean_dataset[clean_dataset.All_zero == 1].shape[0]
    print(f"Total number of All_zero class: {All_zero}")
    print(f"Is all zero and Total number zero are equal :{All_zero == total_zero}")
    a = clean_dataset[clean_dataset.StartHesitation == 1].shape[0]
    print(f"The number of Class Start Hesitation :{a}")
    b = clean_dataset[clean_dataset.Walking == 1].shape[0]
    print(f"The number of Class Walking :{b}")
    c = clean_dataset[clean_dataset.Turn == 1].shape[0]
    print(f"The number of Class Turn : {c}")
    print(f"Is the toatl number of sample equal to all Four class combine :"
         f"{clean_dataset.shape[0] == a + b + c + All_zero}")
    
    feature_col = ['AccV','AccML','AccAP']
    label_col = ['StartHesitation','Turn','Walking', 'All_zero']

    feature_dataset = clean_dataset[feature_col]
    label_dataset = clean_dataset[label_col]
    print(f"The Feature :{feature_dataset.shape}, \n"
          f"The label {label_dataset.shape}")
    
    print(f"Because of Four Classes are imbalanced. To get high accuracy, oversampling is used")
    from imblearn.over_sampling import SMOTE
    import numpy as np
    # Instantiate the MultiLabelUnderSampler
    over_sampler = SMOTE()

    # Undersample the dataset
    X_resampled, y_resampled = over_sampler.fit_resample(feature_dataset.to_numpy(), label_dataset.to_numpy())
    
    SMOTE_features_dataset = pd.DataFrame(X_resampled, columns=feature_dataset.columns)
    
    
    SMOTE_labels_dataset = pd.DataFrame(y_resampled, columns=label_dataset.columns)
    print(f"The over sampling label shape : {SMOTE_labels_dataset.shape}")
    
    def check_all_four_class_condition(df):
        print(f"Check all four check condiiton in {df}")
        a = df[df.StartHesitation == 1].shape[0]
        b = df[df.Turn == 1].shape[0]
        c = df[df.Walking ==1].shape[0]
        d = df[df.All_zero == 1].shape[0]
        print(f"Number of Start Hesitation : {a}, \n"
              f"Number of Turn : {b}, \n"  
              f"Number of Walking : {c}, \n"
              f"Number of All_zero : {d}")
        print("Is Number of All four class is equal to total sampling :",
             df.shape[0] == a + b + c + d)
        
    check_all_four_class_condition(SMOTE_labels_dataset)
    
    oversampling_dataset = pd.concat([SMOTE_features_dataset,SMOTE_labels_dataset], 
                                     ignore_index= False, sort=False, axis=1)
    print(f"The shape of oversampling dataset is :{oversampling_dataset.shape[0]}")
    print(f"The number of duplication in dataset{oversampling_dataset.duplicated().sum()}")
    # Drop duplication
    oversampling_dataset.drop_duplicates(inplace=True)
    print(f"The shape of oversampling after remove duplication :{oversampling_dataset.shape}")
    
    # 60% Train Data, 20% Validation Data, 20% Test Data
    # 80% Set Data(60% rain Data, 20% Validation Data) , 20% Test Data
    
    from sklearn.model_selection import train_test_split
    import random 
    random_seed = 54

    set_data, test_data = train_test_split(oversampling_dataset, test_size=0.2, random_state=True)
    print(f"The set data shape : {set_data.shape}\n"
          f"The test data shape : {test_data.shape}\n"
          f"Is the dataset still in range : "
          f"{oversampling_dataset.shape[0] == set_data.shape[0] + test_data.shape[0]}")
    
    print(f"Again Search for duplicaiton : \n "
          f"Set Data :{set_data.duplicated().sum()} \n"
          f"Test Data :{test_data.duplicated().sum()}")
    
    check_all_four_class_condition(set_data)
    check_all_four_class_condition(test_data)
    print("All task are finish")
    
    return set_data,test_data

In [3]:
def preprocessing_the_dataset(df):
#     def check_skewness(df):
#     # this can check relation between each column
#         skew_limit=0.75
#         skew_value=df[df.columns].skew()
#         #print(skew_value)
#         skew_col=skew_value[abs(skew_value)>skew_limit]
#         cols=skew_col.index
#         return cols

    import random 
    random_seed = 54
    
    feature_col = ['AccV','AccML','AccAP']
    label_col = ['StartHesitation','Turn','Walking', 'All_zero']

    # make feature and label
    feature_dataset = df[feature_col]
    label_dataset = df[label_col]
    
#     # check skewness and powertransform
#     skew_columns = check_skewness(feature_dataset)
#     print(skew_columns)
    
#     print("Power Transform start")
#     from sklearn.preprocessing import PowerTransformer
#     pt=PowerTransformer(standardize=False)  
#     feature_dataset[skew_columns] = pt.fit_transform(feature_dataset[skew_columns])
    
#     print("Standardization start")
#     # Change features data to 0 and 1
#     from sklearn.preprocessing import StandardScaler
#     sc=StandardScaler()
#     feature_dataset=sc.fit_transform(feature_dataset)
    
    print("Train test split begin")
    from sklearn.model_selection import train_test_split
    train_feature, valid_feature, train_label, valid_label = train_test_split(feature_dataset, label_dataset, test_size=0.2, random_state=True)
    
    train_feature = np.array(train_feature) 
    valid_feature = np.array(valid_feature)
    train_label  = np.array(train_label)
    valid_label = np.array(valid_label)
    print("All task are finish")
    
    return train_feature, valid_feature, train_label, valid_label
    

#### Import Defog Dataset

In [4]:
# specify the folder path
defog_path = "Data/train/defog"
set_defog_dataset, test_defog_dataset = preprocess_the_whole_stage(defog_path)
train_defog_feature, valid_defog_feature, train_defog_label, valid_defog_label = preprocessing_the_dataset(set_defog_dataset)
print(f"{train_defog_feature.shape} , {train_defog_label.shape} , {valid_defog_feature.shape} , {valid_defog_label.shape}, {test_defog_dataset.shape}")

           ID  Time      AccV     AccML     AccAP  StartHesitation  Turn  \
0  be9d33541d     0 -1.002697  0.022371  0.068304                0     0   
1  be9d33541d     1 -1.002641  0.019173  0.066162                0     0   
2  be9d33541d     2 -0.999820  0.019142  0.067536                0     0   
3  be9d33541d     3 -0.998023  0.018378  0.068409                0     0   
4  be9d33541d     4 -0.998359  0.016726  0.066448                0     0   

   Walking  Valid   Task  
0        0  False  False  
1        0  False  False  
2        0  False  False  
3        0  False  False  
4        0  False  False  
           ID  Time      AccV     AccML     AccAP  StartHesitation  Turn  \
0  be9d33541d     0 -1.002697  0.022371  0.068304                0     0   
1  be9d33541d     1 -1.002641  0.019173  0.066162                0     0   
2  be9d33541d     2 -0.999820  0.019142  0.067536                0     0   
3  be9d33541d     3 -0.998023  0.018378  0.068409                0     0   
4



The over sampling label shape : (45510112, 4)
Check all four check condiiton in           StartHesitation  Turn  Walking  All_zero
0                       0     0        0         1
1                       0     0        0         1
2                       0     0        0         1
3                       0     0        0         1
4                       0     0        0         1
...                   ...   ...      ...       ...
45510107                0     0        1         0
45510108                0     0        1         0
45510109                0     0        1         0
45510110                0     0        1         0
45510111                0     0        1         0

[45510112 rows x 4 columns]
Number of Start Hesitation : 11377528, 
Number of Turn : 11377528, 
Number of Walking : 11377528, 
Number of All_zero : 11377528
Is Number of All four class is equal to total sampling : True
The shape of oversampling dataset is :45510112
The number of duplication in dataset14
Th

#### Import Tdcsfog Dataset

In [5]:
tdcsfog_path = "Data/train/tdcsfog"
set_tdcsfog_dataset, test_tdcsfog_dataset = preprocess_the_whole_stage(tdcsfog_path)
train_tdcsfog_feature, valid_tdcsfog_feature, train_tdcsfog_label, valid_tdcsfog_label = preprocessing_the_dataset(set_tdcsfog_dataset)
print(f"{train_tdcsfog_feature.shape} , {train_tdcsfog_label.shape} , {valid_tdcsfog_feature.shape} , {valid_tdcsfog_label.shape} , {test_tdcsfog_dataset.shape}")

           ID  Time      AccV     AccML     AccAP  StartHesitation  Turn  \
0  a171e61840     0 -9.665890  0.042550  0.184744                0     0   
1  a171e61840     1 -9.672969  0.049217  0.184644                0     0   
2  a171e61840     2 -9.670260  0.033620  0.193790                0     0   
3  a171e61840     3 -9.673356  0.035159  0.184369                0     0   
4  a171e61840     4 -9.671458  0.043913  0.197814                0     0   

   Walking  
0        0  
1        0  
2        0  
3        0  
4        0  
           ID  Time      AccV     AccML     AccAP  StartHesitation  Turn  \
0  a171e61840     0 -9.665890  0.042550  0.184744                0     0   
1  a171e61840     1 -9.672969  0.049217  0.184644                0     0   
2  a171e61840     2 -9.670260  0.033620  0.193790                0     0   
3  a171e61840     3 -9.673356  0.035159  0.184369                0     0   
4  a171e61840     4 -9.671458  0.043913  0.197814                0     0   

   Walki

#### Build Model

In [6]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report

def eval_metrics(classifier, test_features, test_labels):
    
    # make prediction
    predictions   = classifier.predict(test_features)
    
    base_score   = classifier.score(test_features, test_labels)
    accuracy = accuracy_score(test_labels, predictions)
    av_precision = average_precision_score(test_labels, predictions)
    
    target_names = ['StartHesitation','Turn','Walking', 'All_zero']
    print("Classification report")
    print("---------------------","\n")
    print(classification_report(test_labels, predictions, target_names=target_names),"\n")

    print("Accuracy Measures")
    print("---------------------","\n")
    print("Base score: ", base_score)
    print("Accuracy: ", accuracy)
    print("Avarge Precision: ", av_precision)
    
    return base_score,accuracy,av_precision

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

defog_Model = { 
    #     "Decision Tree": DecisionTreeClassifier(),      
    "KNearest": KNeighborsClassifier(n_jobs=-1),           
}

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
tdcsfog_Models = {                         
#     "Decision Tree": DecisionTreeClassifier(),      
    "KNearest": KNeighborsClassifier(n_jobs=-1),           
#     "Ridge" : RidgeClassifier(),              #poor result    
#     "MLP" : MLPClassifier(),                  #poor result              
#     "R_Neighour" : RadiusNeighborsClassifier(),
#     "Extra_T" : ExtraTreesClassifier(n_estimators=40),
#     "R_forest" : RandomForestClassifier(max_depth=20),
#     "XGB" : xgb.XGBClassifier(),
#     "Catboost" : CatBoostClassifier()
    }

In [9]:
from sklearn.model_selection import cross_val_score
counter = 1
for Model_Name, classifier in defog_Model.items(): 
    # with mlflow.start_run(nested=True):
    print(f"{counter}. {Model_Name}")

    # fit the model
    from joblib import parallel_backend
    with parallel_backend('threading', n_jobs=-1):
        classifier.fit(train_defog_feature, train_defog_label)


    counter = counter + 1
    # Calculate the metrics
    base_score,accuracy,av_precision = eval_metrics(classifier,
                                                    valid_defog_feature,
                                                    valid_defog_label)  

    print("________________________________________")

1. KNearest
Classification report
--------------------- 

                 precision    recall  f1-score   support

StartHesitation       1.00      1.00      1.00   1819938
           Turn       0.85      0.94      0.89   1821984
        Walking       0.90      0.98      0.94   1819133
       All_zero       0.95      0.75      0.84   1820561

      micro avg       0.92      0.92      0.92   7281616
      macro avg       0.92      0.92      0.92   7281616
   weighted avg       0.92      0.92      0.92   7281616
    samples avg       0.92      0.92      0.92   7281616
 

Accuracy Measures
--------------------- 

Base score:  0.9175336903236864
Accuracy:  0.9175336903236864
Avarge Precision:  0.8684360009187366
________________________________________


  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
counter = 1
for Model_Name, classifier in tdcsfog_Models.items(): 
    # with mlflow.start_run(nested=True):
    print(f"{counter}. {Model_Name}")

    # fit the model
    from joblib import parallel_backend
    with parallel_backend('threading', n_jobs=-1):
        classifier.fit(train_tdcsfog_feature, train_tdcsfog_label)
    counter = counter + 1
    # Calculate the metrics
    base_score, accuracy, av_precision = eval_metrics(classifier,
                                                    valid_tdcsfog_feature,
                                                    valid_tdcsfog_label)  

    print("________________________________________")

1. KNearest
Classification report
--------------------- 

                 precision    recall  f1-score   support

StartHesitation       0.80      0.88      0.84    777443
           Turn       0.71      0.66      0.68    777904
        Walking       0.81      0.89      0.85    778717
       All_zero       0.80      0.55      0.65    779597

      micro avg       0.78      0.74      0.76   3113661
      macro avg       0.78      0.74      0.76   3113661
   weighted avg       0.78      0.74      0.76   3113661
    samples avg       0.74      0.74      0.74   3113661
 

Accuracy Measures
--------------------- 

Base score:  0.7431695357972496
Accuracy:  0.7431695357972496
Avarge Precision:  0.6486907358165517
________________________________________


  _warn_prf(average, modifier, msg_start, len(result))


#### Final test

In [11]:
import os 
import pandas as pd

def import_test_file_from_folder(file_path):
    test_file = pd.read_csv(file_path)
    name = os.path.basename(file_path)
    id_value = name.split('.')[0]
    test_file['Id_value'] = id_value
    test_file['Id'] = test_file['Id_value'].astype(str) + '_' + test_file['Time'].astype(str)
    test_file = test_file[['Id','AccV','AccML','AccAP']]
    return test_file

def preprocessing_test_dataset(df):
    import random 
    random_seed = 54
    
    feature_col = ['AccV','AccML','AccAP']
    feature_dataset = df[feature_col]

    feature_dataset=np.array(feature_dataset)
    return feature_dataset

def make_prediction(Models, test_submit):
    for Model_Name, classifier in Models.items(): 
        test_submit_pred = classifier.predict(test_submit)
    print(f"The prediciton is {test_submit_pred.shape}")
    return test_submit_pred

def test_submission_file(test_file, predicit_score_np):
    test_score_df = pd.DataFrame(predicit_score_np, columns=['StartHesitation', 'Turn', 'Walking', 'All_zero'])
    df = pd.concat([test_file, test_score_df], ignore_index= False, sort=False, axis=1)
    select_column = ['Id','StartHesitation', 'Turn', 'Walking']
    submit_dataset = df[select_column]
    return submit_dataset


In [12]:
test_file_path = ('/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/')
test_defog_path = test_file_path + 'defog/02ab235146.csv'
test_tdcsfog_path = test_file_path + 'tdcsfog/003f117e14.csv'

In [13]:
test_defog = import_test_file_from_folder(test_defog_path)
test_defog_submit = preprocessing_test_dataset(test_defog)
test_defog_submit_pred = make_prediction(defog_Model, test_defog_submit)
test_defog_submission_file = test_submission_file(test_defog, test_defog_submit_pred)
print(test_defog_submission_file.head())

The prediciton is (281688, 4)
             Id  StartHesitation  Turn  Walking
0  02ab235146_0                0     0        0
1  02ab235146_1                0     0        0
2  02ab235146_2                0     0        0
3  02ab235146_3                0     0        0
4  02ab235146_4                0     0        0


In [14]:
test_tdcsfog = import_test_file_from_folder(test_tdcsfog_path)
test_tdcsfog_submit = preprocessing_test_dataset(test_tdcsfog)
test_tdcsfog_submit_pred = make_prediction(tdcsfog_Models, test_tdcsfog_submit)
test_tdcsfog_submission_file = test_submission_file(test_tdcsfog, test_tdcsfog_submit_pred)
print(test_tdcsfog_submission_file.head())

The prediciton is (4682, 4)
             Id  StartHesitation  Turn  Walking
0  003f117e14_0                0     0        0
1  003f117e14_1                0     0        0
2  003f117e14_2                0     0        0
3  003f117e14_3                0     0        0
4  003f117e14_4                0     0        0


In [15]:
submission_df = pd.concat([test_tdcsfog_submission_file, test_defog_submission_file])
submission_df

Unnamed: 0,Id,StartHesitation,Turn,Walking
0,003f117e14_0,0,0,0
1,003f117e14_1,0,0,0
2,003f117e14_2,0,0,0
3,003f117e14_3,0,0,0
4,003f117e14_4,0,0,0
...,...,...,...,...
281683,02ab235146_281683,0,0,0
281684,02ab235146_281684,0,0,0
281685,02ab235146_281685,0,0,0
281686,02ab235146_281686,0,0,0


In [16]:
submission_df.shape, test_tdcsfog_submission_file.shape, test_defog_submission_file.shape,

((286370, 4), (4682, 4), (281688, 4))

In [17]:
submission_df.shape[0] == test_tdcsfog_submission_file.shape[0] + test_defog_submission_file.shape[0]

True

In [18]:
submission_sample = pd.read_csv('/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/sample_submission.csv')
submission_sample.shape == submission_df.shape

True

In [19]:
submission_df.to_csv('submission.csv')