In [1]:
# import packages

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

In [2]:
# Read the audiometric test database from disk into a DataFrame.
# Update this with your actual CSV file path.
csv_file_path = './database/audiometric_test_database.csv'
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [3]:
# Ensure that the necessary columns exist.
# required_columns lists every column this notebook reads from the CSV: the two
# demographic columns, 17 columns for each of the 045dB / 040dB / 035dB
# prefixes, and the regression target 'hearing_age'.
required_columns = [
    'Gender_(M=1/F=0)', 'Age_In_Years',
    '045dB_20', '045dB_30', '045dB_40', '045dB_50', '045dB_12k', '045dB_12.5K',
    '045dB_13K', '045dB_13.5K', '045dB_14K', '045dB_14.5K', '045dB_15K', '045dB_15.5K',
    '045dB_16K', '045dB_16.5K', '045dB_17K', '045dB_17.5K', '045dB_18K',
    '040dB_20', '040dB_30', '040dB_40', '040dB_50', '040dB_12k', '040dB_12.5K',
    '040dB_13K', '040dB_13.5K', '040dB_14K', '040dB_14.5K', '040dB_15K', '040dB_15.5K',
    '040dB_16K', '040dB_16.5K', '040dB_17K', '040dB_17.5K', '040dB_18K',
    '035dB_20', '035dB_30', '035dB_40', '035dB_50', '035dB_12k', '035dB_12.5K',
    '035dB_13K', '035dB_13.5K', '035dB_14K', '035dB_14.5K', '035dB_15K', '035dB_15.5K',
    '035dB_16K', '035dB_16.5K', '035dB_17K', '035dB_17.5K', '035dB_18K',
    'hearing_age',
]

# Collect the absent columns so the error names exactly what is wrong with the
# file instead of echoing the full list of 54 required names.
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"CSV file is missing required columns: {missing_columns}")

In [4]:
# Split the frame into the model inputs (X) and the regression target (y).
feature_columns = [
    'Gender_(M=1/F=0)', 'Age_In_Years',
    '045dB_20', '045dB_30', '045dB_40', '045dB_50', '045dB_12k', '045dB_12.5K',
    '045dB_13K', '045dB_13.5K', '045dB_14K', '045dB_14.5K', '045dB_15K', '045dB_15.5K',
    '045dB_16K', '045dB_16.5K', '045dB_17K', '045dB_17.5K', '045dB_18K',
    '040dB_20', '040dB_30', '040dB_40', '040dB_50', '040dB_12k', '040dB_12.5K',
    '040dB_13K', '040dB_13.5K', '040dB_14K', '040dB_14.5K', '040dB_15K', '040dB_15.5K',
    '040dB_16K', '040dB_16.5K', '040dB_17K', '040dB_17.5K', '040dB_18K',
    '035dB_20', '035dB_30', '035dB_40', '035dB_50', '035dB_12k', '035dB_12.5K',
    '035dB_13K', '035dB_13.5K', '035dB_14K', '035dB_14.5K', '035dB_15K', '035dB_15.5K',
    '035dB_16K', '035dB_16.5K', '035dB_17K', '035dB_17.5K', '035dB_18K',
]
target_column = 'hearing_age'

X = df[feature_columns]
y = df[target_column]

In [5]:
# Preprocessing: decide which columns are one-hot encoded and which are scaled.
# The categorical group is the gender flag plus every '<prefix>_<suffix>' column,
# generated prefix by prefix so the order matches the column order used for X.
_LEVEL_PREFIXES = ('045dB', '040dB', '035dB')
_COLUMN_SUFFIXES = (
    '20', '30', '40', '50',
    '12k',  # lower-case 'k' here is how the column is spelled in the data
    '12.5K', '13K', '13.5K', '14K', '14.5K', '15K', '15.5K',
    '16K', '16.5K', '17K', '17.5K', '18K',
)

categorical_features = ['Gender_(M=1/F=0)'] + [
    f'{prefix}_{suffix}'
    for prefix in _LEVEL_PREFIXES
    for suffix in _COLUMN_SUFFIXES
]
numerical_features = ['Age_In_Years']

In [6]:
# Create transformers
# Column-wise preprocessing: standardize the numerical column(s) and one-hot
# encode the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': a category value that never appeared in the
        # training split is encoded as all zeros at predict time instead of
        # raising a ValueError (the encoder's default behaviour).
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

In [7]:
# Chain the preprocessing and the regressor into one estimator so that both are
# fitted together and applied together at prediction time.
regressor = XGBRegressor(objective='reg:squarederror')
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', regressor)])

In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [9]:
# Fit the preprocessing steps and the regressor on the training split.
pipeline.fit(X_train, y=y_train)

In [10]:
# Predict the target for the held-out test rows.
y_pred = pipeline.predict(X=X_test)

In [11]:
# Score the test-set predictions against the true targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [12]:
# Report both metrics, rounded to two decimals, one per line.
report_lines = (
    f'Mean Squared Error: {mse:.2f}',
    f'R² Score: {r2:.2f}',
)
print('\n'.join(report_lines))

Mean Squared Error: 22.29
R² Score: 0.88


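Mean Squared Error (MSE): 22.29: This value is the average squared difference between the predicted and actual hearing ages on the test set. Because it is expressed in squared units of the target, its square root (RMSE ≈ 4.72) is easier to interpret: it is the typical size of a prediction error, in the same units as hearing_age. Whether that is "good" depends on the specific context of your dataset and the range of the target variable.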

R² Score: 0.88: An R² score of 0.88 means that the model explains about 88% of the variance in hearing age on the held-out test set. This is quite good, as it indicates that the model has captured a substantial amount of the underlying patterns in the data.

In [14]:
# Persist the trained artifacts. Both files below are written every time this
# cell runs. (joblib is imported in the notebook's import cell.)

# 1) Only the fitted XGBoost regressor, in XGBoost's own JSON format. This file
#    does not contain the 'preprocessor' step, so a model loaded from it expects
#    inputs that have already been transformed the same way.
model = pipeline.named_steps['model']
model.save_model('trained_model.json')

# 2) The entire fitted pipeline (preprocessing + model), serialized with joblib.
#    Loading this file gives an object that accepts the raw feature columns.
joblib.dump(pipeline, 'trained_pipeline.pkl')

['trained_pipeline.pkl']

In [38]:
# Predict hearing_age for one hand-entered row.
# The row needs one value per model feature, in the column order of X:
# 53 values in total (gender, age, then 17 values for each of the
# 045dB / 040dB / 035dB column groups).
single_entry_values = [
    1, 16,                                               # Gender_(M=1/F=0), Age_In_Years
    0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,   # 045dB_* columns
    0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,   # 040dB_* columns
    0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,   # 035dB_* columns
]

# Take the column names from the training frame rather than retyping them, so
# the entry cannot drift out of sync with what the pipeline was fitted on.
feature_names = list(X.columns)
if len(single_entry_values) != len(feature_names):
    raise ValueError(
        f"single entry has {len(single_entry_values)} values, "
        f"but the model expects {len(feature_names)} features"
    )
single_entry = pd.DataFrame([single_entry_values], columns=feature_names)

# Make a prediction
prediction = pipeline.predict(single_entry)

# Output the result (the pipeline is a regressor, so this is a continuous
# hearing-age estimate, not a class label).
print(f'Predicted hearing age: {prediction[0]}')

Predicted class: 16.000558853149414
