In [63]:
import itertools
import warnings
from math import sqrt

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVR

In [64]:
def parse_tempo(val):
    """
    Convert a tempo value (e.g. the string '[129.19921875]') to a float.

    The value is converted to text, surrounding whitespace and square
    brackets are removed, and the remainder is parsed with ``float``.

    Parameters
    ----------
    val : object
        Raw tempo value; anything accepted by ``str``.

    Returns
    -------
    float
        The parsed tempo, or ``np.nan`` when the text is not a single
        number (e.g. 'abc', 'None', '[1.0, 2.0]').
    """
    # Strip whitespace on both sides of the brackets so ' [120.5] ' parses too.
    val_str = str(val).strip().strip("[]").strip()
    try:
        return float(val_str)
    except ValueError:
        # Only parsing failures map to NaN; a bare `except` would also
        # swallow KeyboardInterrupt / SystemExit.
        return np.nan

In [65]:
# 1. Read the development set
data_path = "/content/drive/MyDrive/DSL_Winter_Project_2025/development_with_core_features.csv"
data = pd.read_csv(data_path)
print("Initial shape of data:", data.shape)

# 2. Run every 'tempo' entry through parse_tempo so the column holds floats
data["tempo"] = data["tempo"].map(parse_tempo)

# 3. Collect the names of all numeric columns (this includes 'Id' and 'age')
numeric_cols = list(data.select_dtypes(include="number").columns)
print("Numeric columns:", numeric_cols)

Initial shape of data: (2933, 38)
Numeric columns: ['Id', 'sampling_rate', 'age', 'mean_pitch', 'max_pitch', 'min_pitch', 'jitter', 'shimmer', 'energy', 'zcr_mean', 'spectral_centroid_mean', 'tempo', 'hnr', 'num_words', 'num_characters', 'num_pauses', 'silence_duration', 'pitch_mean', 'pitch_std', 'pitch_iqr', 'f1_mean', 'f2_mean', 'f3_mean', 'f2_f1_ratio', 'hnr_mean', 'jitter.1', 'shimmer.1', 'speech_rate', 'pause_duration_std', 'spectral_flux', 'rolloff_25', 'rolloff_75', 'mfcc1_mean', 'mfcc1_std', 'mfcc2_mean']


In [66]:
# Independent copy restricted to the numeric columns
df_numeric = data.loc[:, numeric_cols].copy()

In [67]:
# Correlation of every numeric feature with the target 'age',
# once with Spearman and once with Pearson
corr_spearman, corr_pearson = (
    df_numeric.corr(method=corr_method)["age"].drop("age")
    for corr_method in ("spearman", "pearson")
)

# One table holding the signed and absolute values of both measures,
# ordered from strongest to weakest absolute Spearman correlation
df_corr = pd.DataFrame({"spearman": corr_spearman, "pearson": corr_pearson})
df_corr = df_corr.assign(
    abs_spearman=df_corr["spearman"].abs(),
    abs_pearson=df_corr["pearson"].abs(),
)
df_corr = df_corr.sort_values("abs_spearman", ascending=False)

print("Combined correlation (Spearman & Pearson) to 'age':\n", df_corr)

Combined correlation (Spearman & Pearson) to 'age':
                         spearman   pearson  abs_spearman  abs_pearson
silence_duration        0.587651  0.514127      0.587651     0.514127
num_words               0.555482  0.473499      0.555482     0.473499
num_characters          0.555298  0.473403      0.555298     0.473403
hnr                    -0.532629 -0.446880      0.532629     0.446880
num_pauses              0.532293  0.437670      0.532293     0.437670
max_pitch               0.469464  0.226643      0.469464     0.226643
min_pitch              -0.464523 -0.224201      0.464523     0.224201
jitter.1               -0.459493 -0.369446      0.459493     0.369446
zcr_mean                0.367529  0.278430      0.367529     0.278430
mean_pitch              0.366192  0.315645      0.366192     0.315645
pitch_mean              0.362789  0.316161      0.362789     0.316161
jitter                  0.349202  0.238498      0.349202     0.238498
pitch_iqr               0.331967  0.3

In [68]:
threshold = 0.1  # minimum |correlation| with 'age' required to keep a feature

# A feature is kept when at least one of the two measures reaches the
# threshold; it is dropped only when BOTH absolute correlations are below it.
keep_condition = (
    df_corr["abs_spearman"].ge(threshold)
    | df_corr["abs_pearson"].ge(threshold)
)

features_to_keep = df_corr.loc[keep_condition].index.tolist()
print("Features kept after correlation filtering:\n", features_to_keep)

# Feature matrix without the target, then restricted to the kept features
# (columns follow the order of df_corr)
X = df_numeric.drop(columns=["age"])
X_filtered = X.loc[:, features_to_keep]
print("Before filtering:", X.shape)
print("After filtering:", X_filtered.shape)

Features kept after correlation filtering:
 ['silence_duration', 'num_words', 'num_characters', 'hnr', 'num_pauses', 'max_pitch', 'min_pitch', 'jitter.1', 'zcr_mean', 'mean_pitch', 'pitch_mean', 'jitter', 'pitch_iqr', 'pitch_std', 'pause_duration_std', 'shimmer', 'mfcc2_mean', 'rolloff_25', 'f3_mean', 'spectral_flux', 'energy', 'rolloff_75', 'f2_mean', 'shimmer.1', 'hnr_mean', 'speech_rate']
Before filtering: (2933, 34)
After filtering: (2933, 26)


In [69]:
def _target_scores(target_corr_df, columns):
    """Return {column: |correlation to target|} for `columns`, or None if no table was given."""
    if target_corr_df is None:
        return None

    if isinstance(target_corr_df, pd.DataFrame):
        # Several measures per feature: rank by the strongest absolute one.
        strength = target_corr_df.select_dtypes(include="number").abs().max(axis=1)
    else:
        strength = pd.Series(target_corr_df).abs()

    matched = [col for col in columns if col in strength.index]
    if columns and not matched:
        warnings.warn(
            "target_corr_df has no entry for any column of X; "
            "falling back to removing the second column of each pair.",
            stacklevel=3,
        )
    # Columns without an entry get 0, i.e. they lose against any known column.
    return {
        col: (float(strength.loc[col]) if col in strength.index else 0.0)
        for col in columns
    }


def remove_high_corr_columns_across_methods(X, threshold=0.8, target_corr_df=None):
    """
    Remove columns from DataFrame X that are highly correlated
    (above 'threshold') in ANY of the two correlation measures:
    Pearson, Spearman.

    Columns are handled pairwise, in column order. If the larger of the
    two absolute correlations of a pair is > threshold, one column of the
    pair is dropped (the one with lower correlation to the target). A pair
    is skipped when one of its columns is already marked for removal.

    Parameters
    ----------
    X : pd.DataFrame
        The DataFrame with only the (numeric) columns to be processed.
    threshold : float, default=0.8
        Correlation threshold above which columns are considered
        highly correlated.
    target_corr_df : pd.Series, pd.DataFrame or None, default=None
        Correlation of each column to the target, indexed by column name.
        A Series is used directly (absolute value); for a DataFrame
        (e.g. with 'abs_spearman', 'abs_pearson' columns) the largest
        absolute value per row is used. Columns without an entry count
        as 0. If None, or on a tie, the second column of the pair is
        removed.

    Returns
    -------
    X_filtered : pd.DataFrame
        The DataFrame with columns removed.
    features_to_remove : set
        The set of columns that were removed.

    Warns
    -----
    UserWarning
        If target_corr_df is given but contains none of X's columns.
    """

    # 1. Absolute correlation matrices for the two methods
    corr_pearson = X.corr(method='pearson').abs()
    corr_spearman = X.corr(method='spearman').abs()
    columns = list(corr_pearson.columns)

    # 2. Element-wise maximum, so a pair is flagged when EITHER measure is
    #    above the threshold. fmax ignores a NaN on one side; a NaN on both
    #    sides stays NaN and never compares as > threshold.
    max_corr = np.fmax(corr_pearson.to_numpy(), corr_spearman.to_numpy())

    scores = _target_scores(target_corr_df, columns)
    features_to_remove = set()

    # 3. Walk the upper triangle and mark one column of every flagged pair
    for i, colname1 in enumerate(columns):
        for j in range(i + 1, len(columns)):
            if not max_corr[i, j] > threshold:
                continue
            colname2 = columns[j]

            # Skip if either column is already removed
            if colname1 in features_to_remove or colname2 in features_to_remove:
                continue

            # Drop the column that tells us less about the target;
            # without target info (or on a tie) drop the second one.
            if scores is not None and scores[colname1] < scores[colname2]:
                features_to_remove.add(colname1)
            else:
                features_to_remove.add(colname2)

    # 4. Drop the identified columns
    X_filtered = X.drop(columns=list(features_to_remove), errors='ignore')

    return X_filtered, features_to_remove

In [70]:
# Drop one column of every highly inter-correlated pair.
# target_corr_df must be indexed by feature name: the function looks up each
# column name in its index and treats a missing name as 0. Passing the raw
# 'age' column (indexed by row) would match no feature, so the
# "keep the column more correlated with the target" rule would never apply.
# We therefore pass each feature's absolute Spearman correlation with age.
X_final, features_to_remove = remove_high_corr_columns_across_methods(
    X_filtered,
    threshold=0.8,
    target_corr_df=df_corr["abs_spearman"],
)

# Print only the columns that were removed
print("Columns removed due to high correlation:")
print(features_to_remove)

Columns removed due to high correlation:
{'pitch_iqr', 'num_words', 'rolloff_25', 'hnr_mean', 'pitch_mean', 'num_characters', 'mean_pitch'}


In [72]:
# Keep an untouched copy of the original labels so this cell is safe to re-run:
# recomputing the most frequent value from already-collapsed labels could
# select "Other" and then map every row to "Other".
if "ethnicity_raw" not in data.columns:
    data["ethnicity_raw"] = data["ethnicity"]

# 1. Identify the single most frequent ethnicity (from the original labels)
top_1 = data["ethnicity_raw"].value_counts().nlargest(1).index

# 2. Replace everything not in top_1 (including missing values) with "Other"
data["ethnicity"] = data["ethnicity_raw"].where(
    data["ethnicity_raw"].isin(top_1), "Other"
)

In [73]:
# Attach the two categorical columns to the selected numeric features
for cat_name in ("gender", "ethnicity"):
    X_final[cat_name] = data[cat_name]

# Feature matrix and target vector used for modelling
X, y = X_final, data["age"]

# Sanity check of the resulting dimensions
print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (2933, 21)
Target shape: (2933,)


In [74]:
# Columns that get one-hot encoded
categorical_cols = ["gender", "ethnicity"]

# Every remaining feature column is handled by the numeric branch
all_feature_cols = list(X.columns)
numeric_cols_for_pipeline = [
    name for name in all_feature_cols if name not in categorical_cols
]

In [75]:
# Numeric branch: fill missing values with the column mean, then standardise
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),  # or 'median'
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: dense one-hot encoding; categories unseen during fit are ignored
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Route each group of columns to its branch
column_branches = [
    ("num", numeric_transformer, numeric_cols_for_pipeline),
    ("cat", categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [76]:
# Hyperparameter grid for HistGradientBoostingRegressor
max_iter_options = [100, 200, 300]
learning_rate_options = [0.01, 0.05, 0.1]
max_depth_options = [3, 5, 7, None]
min_samples_leaf_options = [10, 20]

# Trackers for the best configuration found by the search
best_rmse = float("inf")
best_params = None
best_model = None

# Hold out 10% of the rows for validation. The seed is fixed so that the split,
# and therefore every validation RMSE reported below, is reproducible across
# kernel restarts (same seed as the one used for the regressor).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Baseline pipeline: preprocessing followed by the regressor with default
# hyperparameters. The search loop builds its own pipeline per configuration.
hist_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("hist_gbr", HistGradientBoostingRegressor(random_state=42))
])

In [77]:
# Reset the trackers so re-running this cell on its own starts a fresh search
best_rmse = float("inf")
best_params = None
best_model = None

# Exhaustive search over the grid. itertools.product iterates in the same
# order as the equivalent nested loops (the last option list varies fastest),
# so the first-best-wins tie-breaking is unchanged.
for max_iter, lr, depth, leaf in itertools.product(
    max_iter_options,
    learning_rate_options,
    max_depth_options,
    min_samples_leaf_options,
):
    # Clone the shared preprocessor so every candidate pipeline owns its
    # fitted transformers. Without the clone all pipelines (including the one
    # kept as best_model) alias a single ColumnTransformer that is refit on
    # every iteration.
    hist_pipeline = Pipeline(steps=[
        ("preprocessor", clone(preprocessor)),
        ("hist_gbr", HistGradientBoostingRegressor(random_state=42)),
    ])

    params = {
        "hist_gbr__max_iter": max_iter,
        "hist_gbr__learning_rate": lr,
        "hist_gbr__max_depth": depth,
        "hist_gbr__min_samples_leaf": leaf,
    }
    hist_pipeline.set_params(**params)

    # Fit on the training split, score on the validation split
    hist_pipeline.fit(X_train, y_train)
    y_val_pred = hist_pipeline.predict(X_val)

    # Validation RMSE
    mse = mean_squared_error(y_val, y_val_pred)
    rmse = sqrt(mse)

    # Keep the fitted pipeline with the lowest validation RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_params = params
        best_model = hist_pipeline

# Final: print the best parameters and RMSE.
# NOTE: this RMSE was also used to pick the hyperparameters, so it is an
# optimistic estimate of performance on unseen data.
print("Best Params:", best_params)
print(f"Best Validation RMSE: {best_rmse:.4f}")

# Persist the best fitted pipeline (preprocessing + model)
joblib.dump(best_model, "best_hist_gbr.pkl")

Best Params: {'hist_gbr__max_iter': 300, 'hist_gbr__learning_rate': 0.01, 'hist_gbr__max_depth': 5, 'hist_gbr__min_samples_leaf': 20}
Best Validation RMSE: 10.1285


['best_hist_gbr.pkl']

In [78]:
# Hyperparameter grid for the SVR model
kernel_options = ["rbf", "poly"]
C_options = [1, 10, 100]
gamma_options = [0.1, 0.01, "auto"]

# Trackers for the best SVR configuration (these names are reused from the
# HistGradientBoosting search, whose best model has already been saved to disk)
best_rmse = float("inf")
best_params = None
best_model = None

# itertools.product iterates in the same order as the equivalent nested loops
# (gamma varies fastest), so the first-best-wins tie-breaking is unchanged.
for kernel, C_value, gamma_val in itertools.product(
    kernel_options, C_options, gamma_options
):
    # 1. Build a fresh pipeline. The preprocessor is cloned so each candidate
    #    owns its fitted transformers instead of aliasing one shared
    #    ColumnTransformer that is refit on every iteration.
    svr_pipeline = Pipeline(steps=[
        ("preprocessor", clone(preprocessor)),
        ("svr", SVR()),
    ])

    # 2. Set params for SVR within the pipeline
    params = {
        "svr__kernel": kernel,
        "svr__C": C_value,
        "svr__gamma": gamma_val
    }
    svr_pipeline.set_params(**params)

    # 3. Fit on training data
    svr_pipeline.fit(X_train, y_train)

    # 4. Predict on validation data
    y_val_pred_svr = svr_pipeline.predict(X_val)

    # 5. Compute RMSE
    mse_svr = mean_squared_error(y_val, y_val_pred_svr)
    rmse_svr = sqrt(mse_svr)

    # 6. Keep the fitted pipeline with the lowest validation RMSE so far
    if rmse_svr < best_rmse:
        best_rmse = rmse_svr
        best_params = params
        best_model = svr_pipeline

# Final: print the best parameters and RMSE.
# NOTE: this RMSE was also used to pick the hyperparameters, so it is an
# optimistic estimate of performance on unseen data.
print("Best SVR Params:", best_params)
print(f"Best SVR Validation RMSE: {best_rmse:.4f}")

# Persist the best fitted SVR pipeline (preprocessing + model)
joblib.dump(best_model, "best_svr_model.pkl")

Best SVR Params: {'svr__kernel': 'rbf', 'svr__C': 100, 'svr__gamma': 0.01}
Best SVR Validation RMSE: 10.5468


['best_svr_model.pkl']

In [89]:
eval_data_path = "/content/drive/MyDrive/DSL_Winter_Project_2025/evaluation_with_core_features.csv"
eval_data = pd.read_csv(eval_data_path)

# Clean & transform
eval_data['gender'] = eval_data['gender'].replace({'famale': 'female'})  # fix misspelled label
eval_data["tempo"] = eval_data["tempo"].apply(parse_tempo)
# Collapse ethnicities with the top category learned from the development set
eval_data["ethnicity"] = eval_data["ethnicity"].apply(
    lambda x: x if x in top_1 else "Other"
)

# Use exactly the columns (and order) of the training feature matrix instead of
# a hand-maintained copy of that list, which can silently drift from the
# features selected during training.
final_cols = X.columns.tolist()
missing_cols = [col for col in final_cols if col not in eval_data.columns]
assert not missing_cols, f"evaluation data is missing training features: {missing_cols}"
X_eval = eval_data[final_cols]

In [86]:
# Restore the saved HistGradientBoosting pipeline (a pickle written by this notebook)
best_model = joblib.load("best_hist_gbr.pkl")

# Predict on the evaluation features
preds = best_model.predict(X_eval)

# Submission table: the evaluation Ids next to the rounded predictions
submission = eval_data[["Id"]].assign(Predicted=preds.round())
submission.to_csv("submission_hist.csv", index=False)
print("Submission file saved as 'submission_hist.csv'")

Submission file saved as 'submission_hist.csv'


In [87]:
# Restore the saved SVR pipeline (a pickle written by this notebook)
best_model = joblib.load("best_svr_model.pkl")

# Predict on the evaluation features
y_eval_pred = best_model.predict(X_eval)

# Submission table: the evaluation Ids next to the predictions,
# rounded to the nearest integer
submission = eval_data[["Id"]].assign(Predicted=y_eval_pred.round())
submission.to_csv("submission_svr.csv", index=False)