In [1]:
# Colab only: mount Google Drive at /content/gdrive so the CSV paths used
# later in the notebook resolve. `force_remount=False` leaves an existing
# mount untouched when the cell is re-run.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=False)

Mounted at /content/gdrive


In [2]:
from IPython.display import clear_output

In [3]:
# !pip install torch torchvision
!pip install scikit-learn
!pip install openpyxl
!pip install lassonet
clear_output()

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

import math

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_selection import SelectFromModel

from lassonet import LassoNetClassifier



# Installing mifs (JMIM feature selection)

In [33]:
!git clone -q https://github.com/danielhomola/mifs.git
%cd /content/mifs/
!pip install -q .

/content/mifs
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for mifs (setup.py) ... [?25l[?25hdone


In [35]:
import mifs

# Load data

In [5]:
def load_data(train_path, test_path):
    """Read the train and test CSVs and neutralise missing / infinite values.

    Both files are parsed with a two-row header (``header=[0, 1]``), so the
    returned frames carry two-level columns.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.

    Returns:
        tuple: ``(train_df, test_df)`` in which NaN, ``+inf`` and ``-inf``
        have all been replaced by 0.
    """
    def _read_clean(csv_path):
        frame = pd.read_csv(csv_path, header=[0, 1])
        frame = frame.fillna(0)
        # Infinite values are not touched by fillna, so zero them separately.
        return frame.replace([np.inf, -np.inf], 0)

    return _read_clean(train_path), _read_clean(test_path)


def to_array(train_df, test_df, group=False):
    """Split train/test frames into feature matrices and label vectors.

    The input frames are never modified: the outer column level is removed on
    a derived frame, so callers do not have to pass defensive copies and the
    function can be called repeatedly on the same frames.

    Args:
        train_df: Training frame; after the optional level drop it must expose
            'label' and 'filename' columns.
        test_df: Test frame with the same layout.
        group: When False (default) the outer (level 0) column level is
            dropped before the columns are addressed. When True the frames
            are used as they are.

    Returns:
        tuple: ``(X_train, y_train, X_val, y_val)`` as NumPy arrays, where the
        X arrays hold every column except 'label' and 'filename'.
    """
    def _split(frame):
        if not group:
            # Returns a new frame; the caller's column index stays intact.
            frame = frame.droplevel(level=0, axis=1)
        features = frame.drop(['label', 'filename'], axis=1).values
        labels = frame['label'].values
        return features, labels

    X_train, y_train = _split(train_df)
    X_val, y_val = _split(test_df)

    return X_train, y_train, X_val, y_val

def to_array_2(train_df, test_df, group=False):
    """Convert label-free train/test frames to NumPy arrays.

    The input frames are never modified: when the outer column level has to
    be removed this happens on a derived frame, not on the caller's object.

    Args:
        train_df: Training frame containing only feature columns.
        test_df: Test frame containing only feature columns.
        group: When False (default) the outer (level 0) column level is
            dropped before conversion. When True the frames are converted as
            they are.

    Returns:
        tuple: ``(X_train, X_val)`` as NumPy arrays.
    """
    if not group:
        # droplevel returns new frames, leaving the callers' headers intact.
        train_df = train_df.droplevel(level=0, axis=1)
        test_df = test_df.droplevel(level=0, axis=1)

    return train_df.values, test_df.values


def stylize(df, column_name):
    def color_gradient(val, min_val, max_val):
        # Normalize value
        normalized = (val - min_val) / ((max_val - min_val)+5e-12)
        # Calculate green intensity
        green_intensity = int(255 * normalized)
        return f'background-color: rgb({255 - green_intensity}, {255}, {255 - green_intensity})'

    #
    # Apply the color mapping
    styled_df = df.style.applymap(lambda x: color_gradient(x, df[column_name].min(), df[column_name].max()))
    return styled_df


In [6]:
def svm(X_train, y_train, X_test, y_test):
    """Fit a default ``SVC`` on the training split and score the test split.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.

    Returns:
        The ``f1_score`` (default arguments) of the test predictions.
    """
    classifier = SVC().fit(X_train, y_train)
    return f1_score(y_test, classifier.predict(X_test))

# Feature Selection

**1. All features**

In [7]:
def replace_outliers_with_average(data, lower_q=0.01, upper_q=0.99):
    """Overwrite per-column extreme values with the mean of the other values.

    For every column, values strictly below the ``lower_q`` quantile or
    strictly above the ``upper_q`` quantile count as outliers and are replaced
    by the mean of that column's non-outlier values.

    Args:
        data: 2-D NumPy array. It is modified IN PLACE; pass a copy if the
            original values are still needed.
        lower_q: Lower quantile bound in [0, 1]. Defaults to 0.01.
        upper_q: Upper quantile bound in [0, 1]. Defaults to 0.99.

    Returns:
        The same array object that was passed in, after modification.

    Raises:
        ValueError: If the bounds do not satisfy 0 <= lower_q <= upper_q <= 1.
    """
    if not 0.0 <= lower_q <= upper_q <= 1.0:
        raise ValueError("quantile bounds must satisfy 0 <= lower_q <= upper_q <= 1")

    _, n_cols = data.shape
    for col in range(n_cols):
        column_data = data[:, col]
        lower_bound, upper_bound = np.quantile(column_data, [lower_q, upper_q])

        # Identify outliers (strict comparisons: the bounds themselves are kept)
        outliers_mask = (column_data < lower_bound) | (column_data > upper_bound)
        if not outliers_mask.any():
            continue

        # Replace outliers with the average of the non-outliers
        data[outliers_mask, col] = column_data[~outliers_mask].mean()

    return data

In [8]:
# Locations of the train/test CSVs on the mounted Google Drive.
train_path = '/content/gdrive/MyDrive/speech_analysis/train_with_groups.csv'
test_path = '/content/gdrive/MyDrive/speech_analysis/test_with_groups.csv'

# train_df / test_df keep the two-level column header they were read with.
train_df, test_df = load_data(train_path, test_path)
# Copies are handed to to_array() so train_df / test_df keep their two-level
# columns for the cells below.
X_train_all_features, y_train, X_test_all_features, y_test = to_array(train_df.copy(), test_df.copy(), group=False)
# Distinct level-0 column labels, minus "Info"
# (presumably the group holding 'label' and 'filename' - TODO confirm).
group_names = set(train_df.columns.droplevel(level=1))
group_names.remove("Info")
# Level-1 column names without the last two entries
# (assumed to be the label/filename columns - TODO confirm).
feature_names = train_df.columns.droplevel(level=0).tolist()[:-2]


# Optional outlier replacement, currently disabled:
# X_train_all_features = replace_outliers_with_average(X_train_all_features.copy())
# X_test_all_features = replace_outliers_with_average(X_test_all_features.copy())
# Normalize features: the scaler is fitted on the training split only and the
# same fitted scaler is then applied to the test split.
scaler = MinMaxScaler()
X_train_all_features = scaler.fit_transform(X_train_all_features)
X_test_all_features = scaler.transform(X_test_all_features)

In [9]:
# Empty table with one column per train_df column except the last two; later
# cells add one row per feature-selection method via `.loc[<method name>, ...]`.
feature_importances_df = pd.DataFrame(columns=train_df.columns[:-2])

In [10]:
X_train_all_features.shape

(166, 6859)

In [11]:
X_test_all_features.shape

(71, 6859)

**2. PCA**

In [12]:
# 129 150 166  (presumably the n_components values that were tried - confirm)
# 166 equals the number of training rows reported by X_train_all_features.shape.
# PCA is fitted on the training split only; the test split is projected with
# the same fitted components.
pca = PCA(n_components=166)
X_train_pca = pca.fit_transform(X_train_all_features)
X_test_pca = pca.transform(X_test_all_features)

**3. JMIM**

In [37]:
def jmim(df_eg, y_train):
    """Run the ``mifs`` JMIM feature selector on min-max scaled features.

    Args:
        df_eg: DataFrame of feature columns; it is converted to a NumPy array
            and min-max scaled before fitting.
        y_train: Label array; cast to ``int`` for the selector.

    Returns:
        tuple: the ``ranking_`` and ``mi_`` attributes of the fitted selector.
    """
    scaled_features = MinMaxScaler().fit_transform(df_eg.to_numpy())

    selector = mifs.MutualInformationFeatureSelector(
        method='JMIM',
        k=3,
        n_features="auto",
        verbose=0,
    )
    selector.fit(scaled_features, y_train.astype(int))

    return selector.ranking_, selector.mi_

In [39]:
# Compatibility shim: bind `np.bool` to the built-in `bool` (NumPy 1.24 removed
# the old deprecated alias). Presumably required by `mifs` - confirm.
# Must be executed before jmim() is called.
np.bool = bool

In [52]:
# Feature table for JMIM: the training frame without the "Info" group.
# `level=0` names the group level explicitly, which avoids the warning pandas
# emits for a level-less drop on these two-level columns. drop() already
# returns a new frame, so train_df itself is left untouched.
df_eg_copy = train_df.drop(columns=["Info"], level=0)
# Remember each column's group before flattening the header to feature names.
group_list = df_eg_copy.columns.droplevel(level=1)
df_eg_copy.columns = df_eg_copy.columns.droplevel(level=0)

ranking, mi = jmim(df_eg_copy, y_train)

# (group, feature_name) keys of the selected columns, in ranking order; these
# keys address columns of the original two-level frames.
selected_columns = list(zip(group_list[ranking], df_eg_copy.columns[ranking]))

  df_eg_copy = df_eg_copy.drop(["Info"], axis=1)


In [53]:
# Pick the JMIM-selected (group, feature) columns from both splits.
X_train_JMIM = train_df[selected_columns].to_numpy()
X_test_JMIM = test_df[selected_columns].to_numpy()

# Min-max scale with statistics from the training split only.
# NOTE: this rebinds the notebook-level name `scaler`.
scaler = MinMaxScaler()
X_train_JMIM = scaler.fit_transform(X_train_JMIM)
X_test_JMIM = scaler.transform(X_test_JMIM)

**4. Lasso (L1 Regularizer)**

In [13]:
def lasso(X_train, X_test, y_train, ratio=1):
    """Select features with an L1-penalised linear SVM.

    Args:
        X_train: Training feature matrix used to fit the ``LinearSVC``.
        X_test: Test feature matrix, reduced with the same selector.
        y_train: Training labels.
        ratio: Value passed as ``C`` to ``LinearSVC``. Defaults to 1.

    Returns:
        tuple: ``(X_train_lasso, X_test_lasso, importances)`` where the first
        two are the inputs transformed by ``SelectFromModel`` and
        ``importances`` is the absolute value of the fitted ``coef_``.
    """
    l1_svc = LinearSVC(C=ratio, penalty="l1", dual=False)
    l1_svc.fit(X_train, y_train)

    # The estimator is already fitted, hence prefit=True.
    selector = SelectFromModel(l1_svc, prefit=True)
    reduced_train = selector.transform(X_train)
    reduced_test = selector.transform(X_test)

    return reduced_train, reduced_test, np.abs(l1_svc.coef_)

In [14]:
# Run the L1-SVM selection separately inside every feature group and store the
# resulting importances in the "LASSO_GP_Based" row of feature_importances_df.
for group in group_names:
    # train_df[group] selects the columns of one group; group=True because
    # that selection has no outer column level left to drop.
    X_train, X_test = to_array_2(train_df[group].copy(), test_df[group].copy(), group=True)

    # Scale with statistics from the training split only.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # NOTE: rebinds the notebook-level X_train_lasso / X_test_lasso on every pass.
    X_train_lasso, X_test_lasso, feature_importances = lasso(X_train, X_test, y_train)

    # Address this group's columns by their (group, feature_name) keys.
    selected_columns = [(group, feature_name) for feature_name in train_df[group].columns]
    # reshape(-1) flattens the importances to one value per column of the group.
    feature_importances_df.loc["LASSO_GP_Based", selected_columns] = feature_importances.reshape(-1)



In [15]:
# L1-SVM selection over all features at once; the reduced matrices are what
# the Results section feeds to svm(), and the importances fill the "LASSO" row.
X_train_lasso, X_test_lasso, feature_importances = lasso(X_train_all_features, X_test_all_features, y_train)
feature_importances_df.loc["LASSO", :] = feature_importances



In [16]:
feature_importances_df

Unnamed: 0_level_0,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,...,Complexity,Complexity,Complexity,Complexity,Complexity,Complexity,Complexity,Complexity,Complexity,Complexity
Unnamed: 0_level_1,count_pause_segments,hesitation_rate,num_words_to_pauses,pasue_speech_ratio,pause_length,pause_lengths_avg,pause_speech_duration_ratio,pause_to_syllable,pause_to_tokens,pause_totallength_ratio,...,AMP_ENTROPY_sma_de_quartile2,AMP_ENTROPY_sma_de_quartile3,AMP_ENTROPY_sma_de_iqr1_2,AMP_ENTROPY_sma_de_iqr2_3,AMP_ENTROPY_sma_de_iqr1_3,AMP_ENTROPY_sma_de_percentile1,AMP_ENTROPY_sma_de_percentile99,AMP_ENTROPY_sma_de_pctlrange0_1,AMP_ENTROPY_sma_de_upleveltime75,AMP_ENTROPY_sma_de_upleveltime90
LASSO_GP_Based,0.613281,1.483391,0.472253,0.0,1.246139,0.0,0.242905,0.952145,0.0,1.607245,...,1.599886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.36727
LASSO,0.0,0.25371,0.0,0.0,0.070553,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**5. Lassonet**

In [17]:
def lasso_net(X_train, X_test, y_train):
    """Fit a LassoNet regularisation path and return its feature importances.

    Args:
        X_train: Training feature matrix.
        X_test: Unused; kept so the signature matches the other selectors.
        y_train: Training labels.

    Returns:
        ``feature_importances_`` of the fitted ``LassoNetClassifier``,
        converted with ``.numpy()``.
    """
    net = LassoNetClassifier(
        M=10,
        hidden_dims=(50,),
        path_multiplier=1.1,
        verbose=False,
    )
    # The path is computed for its effect on the model; its return value is
    # not needed here.
    net.path(X_train, y_train, return_state_dicts=True)

    return net.feature_importances_.numpy()

In [18]:
# Run LassoNet separately inside every feature group and store the resulting
# importances in the "LASSONET_GP_Based" row of feature_importances_df.
for group in group_names:
    # train_df[group] selects the columns of one group; group=True because
    # that selection has no outer column level left to drop.
    X_train, X_test = to_array_2(train_df[group].copy(), test_df[group].copy(), group=True)

    # Scale with statistics from the training split only.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    feature_importances = lasso_net(X_train, X_test, y_train)

    # Address this group's columns by their (group, feature_name) keys.
    selected_columns = [(group, feature_name) for feature_name in train_df[group].columns]
    feature_importances_df.loc["LASSONET_GP_Based", selected_columns] = feature_importances.reshape(-1)

In [19]:
# LassoNet over all features at once; the importances fill the "LASSONET" row.
feature_importances = lasso_net(X_train_all_features, X_test_all_features, y_train)
feature_importances_df.loc["LASSONET", :] = feature_importances

In [20]:
feature_importances_df

Unnamed: 0_level_0,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,...,Complexity,Complexity,Complexity,Complexity,Complexity,Complexity,Complexity,Complexity,Complexity,Complexity
Unnamed: 0_level_1,count_pause_segments,hesitation_rate,num_words_to_pauses,pasue_speech_ratio,pause_length,pause_lengths_avg,pause_speech_duration_ratio,pause_to_syllable,pause_to_tokens,pause_totallength_ratio,...,AMP_ENTROPY_sma_de_quartile2,AMP_ENTROPY_sma_de_quartile3,AMP_ENTROPY_sma_de_iqr1_2,AMP_ENTROPY_sma_de_iqr2_3,AMP_ENTROPY_sma_de_iqr1_3,AMP_ENTROPY_sma_de_percentile1,AMP_ENTROPY_sma_de_percentile99,AMP_ENTROPY_sma_de_pctlrange0_1,AMP_ENTROPY_sma_de_upleveltime75,AMP_ENTROPY_sma_de_upleveltime90
LASSO_GP_Based,0.613281,1.483391,0.472253,0.0,1.246139,0.0,0.242905,0.952145,0.0,1.607245,...,1.599886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.36727
LASSO,0.0,0.25371,0.0,0.0,0.070553,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LASSONET_GP_Based,111.424141,163.136078,111.424141,101.29467,134.823212,111.424141,122.566551,122.566551,111.424141,122.566551,...,67.411606,41.857304,46.043034,46.043034,34.592812,41.857304,34.592812,41.857304,31.448011,55.71207
LASSONET,7.660409,62.357841,5.755379,5.232163,46.850368,6.964009,4.756512,7.660409,6.330917,7.660409,...,6.964009,4.324101,5.232163,4.756512,4.324101,4.756512,5.232163,4.756512,5.232163,5.232163


# Results

**1. All features**

In [23]:
svm(X_train_all_features, y_train, X_test_all_features, y_test)

0.6944444444444444

**2. PCA**

In [25]:
svm(X_train_pca, y_train, X_test_pca, y_test)

0.7567567567567568

**3. JMIM**

In [54]:
svm(X_train_JMIM, y_train, X_test_JMIM, y_test)

0.6567164179104478

**4. Lasso (L1 Regularizer)**

In [27]:
svm(X_train_lasso, y_train, X_test_lasso, y_test)

0.6875

**5. Lassonet**

In [28]:
def lasso_net_res(X_train, X_test, y_train):
    """Fit a LassoNet regularisation path and hand back model and path.

    Args:
        X_train: Training feature matrix.
        X_test: Unused; kept for a signature parallel to ``lasso_net``.
        y_train: Training labels.

    Returns:
        tuple: ``(model, path)`` - the ``LassoNetClassifier`` and the value
        returned by its ``path`` method (state dicts requested).
    """
    net = LassoNetClassifier(
        M=10,
        hidden_dims=(50,),
        path_multiplier=1.1,
        verbose=False,
    )
    history = net.path(X_train, y_train, return_state_dicts=True)

    return net, history

In [None]:
# Fit the LassoNet path on all features, then score every entry of the path
# on the test split.
model, path = lasso_net_res(X_train_all_features, X_test_all_features, y_train)
f1s = []  # one F1 score per path entry, in path order
for save in path:
    # Restore the weights stored for this path entry before predicting.
    model.load(save.state_dict)
    y_pred = model.predict(X_test_all_features)
    f1s.append(f1_score(y_test, y_pred))

In [32]:
np.mean(f1s)

0.7388791191655525