In [3]:
# libraries importation


import pandas as pd
import numpy as np
import os
import sys
import csv
import joblib
#import arff

In [2]:
def remove_rare_classes(df, target_column, min_count=10):
    class_counts = df[target_column].value_counts()
    rare_classes = class_counts[class_counts < min_count].index
    print(f"[INFO] Classes supprimées (trop rares) : {list(rare_classes)}")

    return df[~df[target_column].isin(rare_classes)].reset_index(drop=True)


# Adult Dataset

In [3]:
# Specify the file path
adult_train_path = "./datasets/private/multiclass datasets/adult/adult.data"
adult_test_path = "./datasets/private/multiclass datasets/adult/adult.test"

adult_all_path_out = "./datasets/private/multiclass datasets/adult/all.csv"

In [4]:
# Read a portion of the file to analyze the separator
num_lines_to_read = 10

# Detect the delimiter
with open(adult_train_path, 'r') as file:
    sample = file.read(num_lines_to_read)
    dialect = csv.Sniffer().sniff(sample)
    delimiter = dialect.delimiter
    
# Print the detected delimiter
print(f"Detected delimiter ({delimiter})")

Detected delimiter (,)


In [5]:
# Specify the list of header names
adult_names = [
    "age", 
    "workclass", 
    "fnlwgt", 
    "education", 
    "education-num", 
    "marital-status", 
    "occupation", 
    "relationship", 
    "race", 
    "sex", 
    "capital-gain", 
    "capital-loss", 
    "hours-per-week", 
    "native-country",
    "class"
]


In [6]:
# Read the .dat file using pandas and set the header names
df_adult_train = pd.read_csv(adult_train_path, delimiter=delimiter, names=adult_names, index_col=False)
df_adult_test = pd.read_csv(adult_test_path, delimiter=delimiter, names=adult_names, index_col=False)


In [7]:
# Concatenate the two datasets
merged_df = pd.concat([df_adult_train, df_adult_test], ignore_index=True)
# Remove trailing '.' and trim whitespace
merged_df['class'] = merged_df['class'].str.strip().str.rstrip('.')

# Show unique values
print(merged_df['class'].unique())
print(df_adult_train['class'].unique())
print(df_adult_test['class'].unique())


['<=50K' '>50K' nan]
[' <=50K' ' >50K']
[nan ' <=50K.' ' >50K.']


In [8]:
merged_df.describe()

Unnamed: 0,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0
mean,189664.1,10.078089,1079.067626,87.502314,40.422382
std,105604.0,2.570973,7452.019058,403.004552,12.391444
min,12285.0,1.0,0.0,0.0,1.0
25%,117550.5,9.0,0.0,0.0,40.0
50%,178144.5,10.0,0.0,0.0,40.0
75%,237642.0,12.0,0.0,0.0,45.0
max,1490400.0,16.0,99999.0,4356.0,99.0


In [9]:
merged_df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48838,39,Private,215419.0,Bachelors,13.0,Divorced,Prof-specialty,Not-in-family,White,Female,0.0,0.0,36.0,United-States,<=50K
48839,64,?,321403.0,HS-grad,9.0,Widowed,?,Other-relative,Black,Male,0.0,0.0,40.0,United-States,<=50K
48840,38,Private,374983.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
48841,44,Private,83891.0,Bachelors,13.0,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455.0,0.0,40.0,United-States,<=50K


In [None]:
# target_column = "Class"
stat = merged_df["class"].value_counts()
print(stat)
merged_df = remove_rare_classes(merged_df, "class", min_count=5)
stat = df_nursery_train["class"].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Instantiate the encoder
le = LabelEncoder()

# Fit and transform the class column
merged_df['class'] = le.fit_transform(merged_df['class'])

In [None]:
merged_df

In [None]:
merged_df.info()

In [None]:
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_mapping)


In [None]:
merged_df.to_csv(adult_all_path_out, index=False)

In [None]:
joblib.dump({
    "class-encoding": label_mapping
}, adult_all_path_out[:-4]+".conf")

# audiology+standardized Dataset

In [None]:
# Specify the file path
audiology_train_path = "./datasets/private/multiclass datasets/audiology+standardized/audiology.standardized.data"
audiology_test_path = "./datasets/private/multiclass datasets/audiology+standardized/audiology.standardized.test"

audiology_all_path_out = "./datasets/private/multiclass datasets/audiology+standardized/all.csv"

In [None]:
# Read a portion of the file to analyze the separator
num_lines_to_read = 10

# Detect the delimiter
with open(audiology_train_path, 'r') as file:
    sample = file.read(num_lines_to_read)
    dialect = csv.Sniffer().sniff(sample)
    delimiter = dialect.delimiter
    
# Print the detected delimiter
print(f"Detected delimiter ({delimiter})")

In [None]:
# Specify the list of header names
audiology_names = [
    "age_gt_60",
    "air()",
    "airBoneGap",
    "ar_c()",
    "ar_u()",
    "bone()",
    "boneAbnormal",
    "bser()",
    "history_buzzing",
    "history_dizziness",
    "history_fluctuating",
    "history_fullness",
    "history_heredity",
    "history_nausea",
    "history_noise",
    "history_recruitment",
    "history_ringing",
    "history_roaring",
    "history_vomiting",
    "late_wave_poor",
    "m_at_2k",
    "m_cond_lt_1k",
    "m_gt_1k",
    "m_m_gt_2k",
    "m_m_sn",
    "m_m_sn_gt_1k",
    "m_m_sn_gt_2k",
    "m_m_sn_gt_500",
    "m_p_sn_gt_2k",
    "m_s_gt_500",
    "m_s_sn",
    "m_s_sn_gt_1k",
    "m_s_sn_gt_2k",
    "m_s_sn_gt_3k",
    "m_s_sn_gt_4k",
    "m_sn_2_3k",
    "m_sn_gt_1k",
    "m_sn_gt_2k",
    "m_sn_gt_3k",
    "m_sn_gt_4k",
    "m_sn_gt_500",
    "m_sn_gt_6k",
    "m_sn_lt_1k",
    "m_sn_lt_2k",
    "m_sn_lt_3k",
    "middle_wave_poor",
    "mod_gt_4k",
    "mod_mixed",
    "mod_s_mixed",
    "mod_s_sn_gt_500",
    "mod_sn",
    "mod_sn_gt_1k",
    "mod_sn_gt_2k",
    "mod_sn_gt_3k",
    "mod_sn_gt_4k",
    "mod_sn_gt_500",
    "notch_4k",
    "notch_at_4k",
    "o_ar_c()",
    "o_ar_u()",
    "s_sn_gt_1k",
    "s_sn_gt_2k",
    "s_sn_gt_4k",
    "speech()",
    "static_normal",
    "tymp()",
    "viith_nerve_signs",
    "wave_V_delayed",
    "waveform_ItoV_prolonged"
] + ["identifier", "class"]



In [None]:
# Read the .dat file using pandas and set the header names
df_audiology_train = pd.read_csv(audiology_train_path, delimiter=delimiter, names=audiology_names, index_col=False, na_values='?')
df_audiology_test = pd.read_csv(audiology_test_path, delimiter=delimiter, names=audiology_names, index_col=False, na_values='?')


In [None]:
# Concatenate the two datasets
merged_df = pd.concat([df_audiology_train, df_audiology_test], ignore_index=True)
# Remove trailing '.' and trim whitespace
merged_df['class'] = merged_df['class'].str.strip().str.rstrip('.')

# Show unique values
print(merged_df['class'].unique())
print(df_audiology_train['class'].unique())
print(df_audiology_test['class'].unique())


In [None]:
merged_df.describe()

In [None]:
merged_df.info()

In [None]:
merged_df

In [None]:
# target_column = "Class"
stat = merged_df["class"].value_counts()
print(stat)
merged_df = remove_rare_classes(merged_df, "class", min_count=5)
stat = df_nursery_train["class"].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Instantiate the encoder
le = LabelEncoder()

# Fit and transform the class column
merged_df['class'] = le.fit_transform(merged_df['class'])

In [None]:
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_mapping)


In [None]:
merged_df

In [None]:
merged_df.to_csv(audiology_all_path_out, index=False)

In [None]:
joblib.dump({
    "class-encoding": label_mapping
}, audiology_all_path_out[:-4]+".conf")

# car+evaluation Dataset

In [None]:
# Specify the file path
car_train_path = "./datasets/private/multiclass datasets/car+evaluation/car.data"

car_all_path_out = "./datasets/private/multiclass datasets/car+evaluation/all.csv"

In [None]:
# Read a portion of the file to analyze the separator
num_lines_to_read = 10

# Detect the delimiter
with open(car_train_path, 'r') as file:
    sample = file.read(num_lines_to_read)
    dialect = csv.Sniffer().sniff(sample)
    delimiter = dialect.delimiter
    
# Print the detected delimiter
print(f"Detected delimiter ({delimiter})")

In [None]:
car_names = [
    "buying",
    "maint",
    "doors",
    "persons",
    "lug_boot",
    "safety"
] + [ "class"]

In [None]:
# Read the .dat file using pandas and set the header names
df_car_train = pd.read_csv(car_train_path, delimiter=delimiter, names=car_names, index_col=False)


In [None]:
df_car_train.describe()

In [None]:
df_car_train.info()

In [None]:
df_car_train

In [None]:
# target_column = "Class"
stat = df_car_train["class"].value_counts()
print(stat)
df_car_train = remove_rare_classes(df_car_train, "class", min_count=5)
print(df_car_train["class"].value_counts())

In [None]:
from sklearn.preprocessing import LabelEncoder

# Instantiate the encoder
le = LabelEncoder()

# Fit and transform the class column
df_car_train['class'] = le.fit_transform(df_car_train['class'])

In [None]:
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_mapping)


In [None]:
df_car_train

In [None]:
df_car_train.to_csv(car_all_path_out, index=False)

In [None]:
joblib.dump({
    "class-encoding": label_mapping
}, car_all_path_out[:-4]+".conf")

# diabetes+130-us+hospitals+for+years+1999-2008

In [5]:
# Specify the file path
diab130_train_path = "./datasets/private/multiclass datasets/diabetes+130-us+hospitals+for+years+1999-2008/diabetic_data.csv"
diab130IDS_train_path = "./datasets/private/multiclass datasets/diabetes+130-us+hospitals+for+years+1999-2008/diabetic_data.csv"

In [6]:
# Read the .dat file using pandas and set the header names
df_diab130_train = pd.read_csv(diab130_train_path, delimiter=',', index_col=False, na_values='?')


  df_diab130_train = pd.read_csv(diab130_train_path, delimiter=',', index_col=False, na_values='?')


In [7]:
df_diab130_train

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),,1,3,7,3,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),,1,4,5,5,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),,1,1,7,1,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),,2,3,7,10,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [8]:
df_diab130_train.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

# mushroom

In [None]:
# Specify the file path
mushroom_train_path = "./datasets/private/multiclass datasets/mushroom/agaricus-lepiota.data"


mushroom_train_path_out = "./datasets/private/multiclass datasets/mushroom/all.csv"

In [None]:
# Read a portion of the file to analyze the separator
num_lines_to_read = 10

# Detect the delimiter
with open(mushroom_train_path, 'r') as file:
    sample = file.read(num_lines_to_read)
    dialect = csv.Sniffer().sniff(sample)
    delimiter = dialect.delimiter
    
# Print the detected delimiter
print(f"Detected delimiter ({delimiter})")

In [None]:
mushroom_names = ["class"] + [
    "cap-shape",
    "cap-surface",
    "cap-color",
    "bruises",
    "odor",
    "gill-attachment",
    "gill-spacing",
    "gill-size",
    "gill-color",
    "stalk-shape",
    "stalk-root",
    "stalk-surface-above-ring",
    "stalk-surface-below-ring",
    "stalk-color-above-ring",
    "stalk-color-below-ring",
    "veil-type",
    "veil-color",
    "ring-number",
    "ring-type",
    "spore-print-color",
    "population",
    "habitat"
] 

In [None]:
# Read the .data file using pandas and set the header names
df_mushroom_train = pd.read_csv(mushroom_train_path, delimiter=delimiter, names=mushroom_names, index_col=False)


In [None]:
df_mushroom_train.describe()

In [None]:
df_mushroom_train.info()

In [None]:
df_mushroom_train

In [None]:
# target_column = "Class"
stat = df_mushroom_train["class"].value_counts()
print(stat)
df_mushroom_train = remove_rare_classes(df_mushroom_train, "class", min_count=5)
print(df_mushroom_train["class"].value_counts())

In [None]:
from sklearn.preprocessing import LabelEncoder

# Instantiate the encoder
le = LabelEncoder()

# Fit and transform the class column
df_mushroom_train['class'] = le.fit_transform(df_mushroom_train['class'])

In [None]:
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_mapping)


In [None]:
df_mushroom_train.to_csv(mushroom_train_path_out, index=False)

In [None]:
joblib.dump({
    "class-encoding": label_mapping
}, mushroom_train_path_out[:-4]+".conf")

# nursery

In [None]:
# Specify the file path
nursery_train_path = "./datasets/private/multiclass datasets/nursery/nursery.data"


nursery_train_path_out = "./datasets/private/multiclass datasets/nursery/all.csv"

In [None]:
# Read a portion of the file to analyze the separator
num_lines_to_read = 10

# Detect the delimiter
with open(nursery_train_path, 'r') as file:
    sample = file.read(num_lines_to_read)
    dialect = csv.Sniffer().sniff(sample)
    delimiter = dialect.delimiter
    
# Print the detected delimiter
print(f"Detected delimiter ({delimiter})")

In [None]:
nursery_names = [
    "parents",
    "has_nurs",
    "form",
    "children",
    "housing",
    "finance",
    "social",
    "health"
] +  ["class"]

In [None]:
# Read the .data file using pandas and set the header names
df_nursery_train = pd.read_csv(nursery_train_path, delimiter=delimiter, names=nursery_names, index_col=False)


In [None]:
df_nursery_train.describe()

In [None]:
df_nursery_train.info()

In [None]:
df_nursery_train

In [None]:
# target_column = "Class"
stat = df_nursery_train["class"].value_counts()
print(stat)
df_nursery_train = remove_rare_classes(df_nursery_train, "class", min_count=5)
print(df_nursery_train["class"].value_counts())

In [None]:
from sklearn.preprocessing import LabelEncoder

# Instantiate the encoder
le = LabelEncoder()

# Fit and transform the class column
df_nursery_train['class'] = le.fit_transform(df_nursery_train['class'])

In [None]:
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_mapping)


In [None]:
df_nursery_train.to_csv(nursery_train_path_out, index=False)

In [None]:
joblib.dump({
    "class-encoding": label_mapping,
    "rare-class": stat
}, nursery_train_path_out[:-4]+".conf")

# student+performance Dataset

In [None]:
# Specify the file path
student_train_path = "./datasets/private/multiclass datasets/student+performance/student/student-por.csv"

student_train_path_out = "./datasets/private/multiclass datasets/student+performance/student/all.csv"

In [None]:
# Read the .dat file using pandas and set the header names
df_student_train = pd.read_csv(student_train_path, delimiter=';', index_col=False)

In [None]:
df_student_train.describe()

In [None]:
df_student_train.info()

In [None]:
df_student_train

In [None]:
df_student_train['G3_discrete'] = pd.cut(
    df_student_train['G3'],
    bins=[-1, 9, 14, 20],  # -1 au lieu de 0 pour inclure 0 dans le premier intervalle
    labels=[0, 1, 2],
    right=True
)

# Vérification de la distribution
print(df_student_train['G3_discrete'].value_counts())

In [None]:
df_student_train

In [None]:
df_student_train.drop(columns=['G3'], inplace=True)


In [None]:
# target_column = "Class"
stat = df_student_train["G3_discrete"].value_counts()
print(stat)
df_student_train = remove_rare_classes(df_student_train, "G3_discrete", min_count=5)
print(df_student_train["G3_discrete"].value_counts())

In [None]:
df_student_train.to_csv(student_train_path_out, index=False)

In [None]:
joblib.dump({
    "class-encoding": {
        "discretize": {
            "bins":[-1, 9, 14, 20],  # -1 au lieu de 0 pour inclure 0 dans le premier intervalle
            "labels":[0, 1, 2],
            "right":True
        }
    }
}, student_train_path_out[:-4]+".conf")