## Emotion classifier using semi-supervised learning 

This notebook contains methods for utilizing semi supervised learning to label the GTZAN dataset with moods for each entry.

In [2]:
# import required libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

path_to_csv = 'gtzan-dataset-music-genre-classification/features_30_sec.csv'

The idea is to manually label 150 samples, obtained using stratified sampling.

In [3]:
features_df = pd.read_csv(path_to_csv)
features_df

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.wav,661794,0.350088,0.088757,0.130228,0.002827,1784.165850,129774.064525,2002.449060,85882.761315,...,52.420910,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035,blues
1,blues.00001.wav,661794,0.340914,0.094980,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,...,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.106190,0.531217,45.786282,blues
2,blues.00002.wav,661794,0.363637,0.085275,0.175570,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,...,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.439720,46.639660,-2.231258,30.573025,blues
3,blues.00003.wav,661794,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,...,44.427753,-3.319597,50.206673,0.636965,37.319130,-0.619121,37.259739,-3.407448,31.949339,blues
4,blues.00004.wav,661794,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,...,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.195160,blues
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,rock.00095.wav,661794,0.352063,0.080487,0.079486,0.000345,2008.149458,282174.689224,2106.541053,88609.749506,...,45.050526,-13.289984,41.754955,2.484145,36.778877,-6.713265,54.866825,-1.193787,49.950665,rock
996,rock.00096.wav,661794,0.398687,0.075086,0.076458,0.000588,2006.843354,182114.709510,2068.942009,82426.016726,...,33.851742,-10.848309,39.395096,1.881229,32.010040,-7.461491,39.196327,-2.795338,31.773624,rock
997,rock.00097.wav,661794,0.432142,0.075268,0.081651,0.000322,2077.526598,231657.968040,1927.293153,74717.124394,...,33.597008,-12.845291,36.367264,3.440978,36.001110,-12.588070,42.502201,-2.106337,29.865515,rock
998,rock.00098.wav,661794,0.362485,0.091506,0.083860,0.001211,1398.699344,240318.731073,1818.450280,109090.207161,...,46.324894,-4.416050,43.583942,1.556207,34.331261,-5.041897,47.227180,-3.590644,41.299088,rock


In [None]:
sampled_df = features_df[['filename', 'label']].groupby('label', group_keys=False).apply(lambda x: x.sample(15), include_groups=False).sample(frac=1)
sampled_df["mood"] = [''] * 150
sampled_df.to_csv('gtzan-dataset-music-genre-classification/sampled_features.csv')

Now lets manually label the data :)

In [4]:
path_to_labeled_csv = ''
labeled_df = pd.read_csv('gtzan-dataset-music-genre-classification/sampled_features_labeled.csv', index_col=0)
labeled_df.head()

Unnamed: 0,filename,mood
798,pop.00098.wav,happy
274,country.00074.wav,sad
503,jazz.00003.wav,relaxed
143,classical.00043.wav,happy
470,hiphop.00070.wav,happy


Before we merge our labels back to our data, lets drop irrelevant features, and normalize the remaining ones.

In [5]:
X = features_df.drop(['filename', 'length', 'label'], axis=1)
var_columns = [col for col in X.columns if col.endswith('var')]
X = X.drop(var_columns, axis=1)

# Normalize features
scaler = StandardScaler()
X_normalized = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index
)
X_normalized.head()

Unnamed: 0,chroma_stft_mean,rms_mean,spectral_centroid_mean,spectral_bandwidth_mean,rolloff_mean,zero_crossing_rate_mean,harmony_mean,perceptr_mean,tempo,mfcc1_mean,...,mfcc11_mean,mfcc12_mean,mfcc13_mean,mfcc14_mean,mfcc15_mean,mfcc16_mean,mfcc17_mean,mfcc18_mean,mfcc19_mean,mfcc20_mean
0,-0.350137,-0.01069,-0.583585,-0.456402,-0.486473,-0.492045,0.190503,0.372757,0.125363,0.30847,...,-0.338368,0.645385,0.18219,0.792287,-0.265558,-0.086396,0.500462,-0.23704,0.006723,0.604064
1,-0.462482,-0.532852,-0.938516,-0.386852,-0.648681,-1.138098,0.300346,0.201327,-1.823211,-0.629159,...,0.068309,0.13477,0.415775,0.487075,-0.439903,-0.048102,0.711371,-0.054982,0.544806,0.424127
2,-0.184225,0.679978,-0.906885,-0.940663,-0.971592,-0.653615,0.218667,0.347559,1.486499,0.536542,...,-1.043902,0.196577,-0.666723,-0.571108,-1.100347,0.284825,-0.827516,-0.600983,-0.295934,-0.296189
3,0.319639,0.15481,-1.581429,-1.228256,-1.51639,-1.681901,0.217688,0.352167,-1.999337,-0.549727,...,0.413939,0.2395,0.375494,-0.571569,0.204986,-0.080249,0.142153,0.033427,0.455409,-0.602881
4,-0.859077,-0.600165,-0.512542,-0.93977,-0.630107,-0.051477,0.206982,0.356208,0.583851,-0.158363,...,-1.037241,-1.085384,-0.346598,-1.864732,-1.087083,-1.238616,-0.32722,-0.368375,-0.553016,-2.766009


Let now add back the features to our labelled samples

In [6]:
merged_df = X_normalized.loc[labeled_df.index]
merged_df['mood'] = labeled_df['mood']
merged_df.head()

Unnamed: 0,chroma_stft_mean,rms_mean,spectral_centroid_mean,spectral_bandwidth_mean,rolloff_mean,zero_crossing_rate_mean,harmony_mean,perceptr_mean,tempo,mfcc1_mean,...,mfcc12_mean,mfcc13_mean,mfcc14_mean,mfcc15_mean,mfcc16_mean,mfcc17_mean,mfcc18_mean,mfcc19_mean,mfcc20_mean,mood
798,0.257711,1.252772,2.000091,1.694106,1.88102,1.89943,0.222611,0.528158,-0.07262,1.173858,...,-0.47757,1.399672,0.579387,1.145365,-0.464331,0.587516,-0.822962,0.282528,-0.362729,happy
274,-0.289684,-0.740177,-0.933137,-1.03691,-0.92363,-0.777132,0.216696,0.348582,0.583851,-0.626301,...,0.051006,-1.207152,-0.558201,-0.138914,-1.577215,-0.344353,-0.742973,-1.121124,-1.253208,sad
503,-0.72771,-1.088534,-1.940627,-2.250055,-2.00444,-1.410583,0.21633,0.328346,-0.963546,-1.525731,...,-0.611075,-0.027352,0.290744,-0.123965,-0.664221,0.372603,0.872785,0.711823,-0.293716,relaxed
143,-2.298005,-0.237442,-1.591255,-1.312177,-1.881072,-1.258454,0.896541,0.784356,0.343145,-1.049731,...,-1.733248,-0.499833,-0.772274,0.196879,-0.438777,-0.001913,-0.36145,-0.443598,-0.728949,happy
470,0.639423,2.254631,0.417584,1.533743,0.925177,-0.822692,0.207518,0.427738,-0.842556,0.626337,...,-0.436777,0.659508,-0.08631,0.713537,-0.934801,0.763397,-0.323293,-0.383785,-0.368897,happy


Now lets get our remaining data ready for labeling.

In [7]:
X_unlabeled = X_normalized.drop(labeled_df.index)
X_unlabeled.head()

Unnamed: 0,chroma_stft_mean,rms_mean,spectral_centroid_mean,spectral_bandwidth_mean,rolloff_mean,zero_crossing_rate_mean,harmony_mean,perceptr_mean,tempo,mfcc1_mean,...,mfcc11_mean,mfcc12_mean,mfcc13_mean,mfcc14_mean,mfcc15_mean,mfcc16_mean,mfcc17_mean,mfcc18_mean,mfcc19_mean,mfcc20_mean
0,-0.350137,-0.01069,-0.583585,-0.456402,-0.486473,-0.492045,0.190503,0.372757,0.125363,0.30847,...,-0.338368,0.645385,0.18219,0.792287,-0.265558,-0.086396,0.500462,-0.23704,0.006723,0.604064
1,-0.462482,-0.532852,-0.938516,-0.386852,-0.648681,-1.138098,0.300346,0.201327,-1.823211,-0.629159,...,0.068309,0.13477,0.415775,0.487075,-0.439903,-0.048102,0.711371,-0.054982,0.544806,0.424127
2,-0.184225,0.679978,-0.906885,-0.940663,-0.971592,-0.653615,0.218667,0.347559,1.486499,0.536542,...,-1.043902,0.196577,-0.666723,-0.571108,-1.100347,0.284825,-0.827516,-0.600983,-0.295934,-0.296189
3,0.319639,0.15481,-1.581429,-1.228256,-1.51639,-1.681901,0.217688,0.352167,-1.999337,-0.549727,...,0.413939,0.2395,0.375494,-0.571569,0.204986,-0.080249,0.142153,0.033427,0.455409,-0.602881
4,-0.859077,-0.600165,-0.512542,-0.93977,-0.630107,-0.051477,0.206982,0.356208,0.583851,-0.158363,...,-1.037241,-1.085384,-0.346598,-1.864732,-1.087083,-1.238616,-0.32722,-0.368375,-0.553016,-2.766009


In [150]:
X_labeled = merged_df.drop('mood', axis=1)
y_labeled = merged_df['mood']

In [159]:
def semi_supervised_labeling(X_labeled, y_labeled, X_unlabeled, 
                           initial_threshold=0.85, min_threshold=0.75, threshold_decay=0.02):
    
    current_threshold = initial_threshold
    results_history = []

    while current_threshold >= min_threshold:
        # Train model on current labeled data
        
        # Option 2: Logistic Regression
        clf = LogisticRegression(
            max_iter=1000,
            random_state=42,
            solver='newton-cg'
        )
        clf.fit(X_labeled, y_labeled)
        
        # Get predictions and confidence scores for unlabeled data
        predictions = clf.predict(X_unlabeled)
        confidence_scores = np.max(clf.predict_proba(X_unlabeled), axis=1)
        
        # Find high confidence predictions
        high_confidence_mask = confidence_scores >= initial_threshold

        if not any(high_confidence_mask):
            current_threshold -= threshold_decay
            print(f"No confident predictions found. Lowering threshold to {current_threshold:.2f}")
            continue
            
        # Create new labeled data with correct indices
        confident_indices = X_unlabeled[high_confidence_mask].index
        new_X_labeled = X_unlabeled.loc[confident_indices]
        new_y_labeled = pd.Series(predictions[high_confidence_mask], index=confident_indices)
        
        # Concatenate while preserving indices
        X_labeled = pd.concat([X_labeled, new_X_labeled])
        y_labeled = pd.concat([y_labeled, new_y_labeled])
        
        # Remove newly labeled samples using index-based selection
        X_unlabeled = X_unlabeled.drop(confident_indices)

        # Store results for this iteration
        results_history.append({
            'threshold': current_threshold,
            'samples_added': len(high_confidence_mask),
            'total_labeled': len(X_labeled),
            'confidence_mean': confidence_scores[high_confidence_mask].mean()
        })
        
        print(f"Added {sum(high_confidence_mask)} samples. Total labeled: {len(X_labeled)}")
        
    return X_labeled, y_labeled, X_unlabeled, results_history

# Run the iterative process
final_X_labeled, final_y_labeled, remaining_unlabeled, results = semi_supervised_labeling(
    X_labeled, y_labeled, X_unlabeled
)

Added 122 samples. Total labeled: 272
Added 98 samples. Total labeled: 370
Added 59 samples. Total labeled: 429
Added 36 samples. Total labeled: 465
Added 18 samples. Total labeled: 483
Added 12 samples. Total labeled: 495
Added 9 samples. Total labeled: 504
Added 3 samples. Total labeled: 507
Added 3 samples. Total labeled: 510
Added 2 samples. Total labeled: 512
Added 2 samples. Total labeled: 514
No confident predictions found. Lowering threshold to 0.83
No confident predictions found. Lowering threshold to 0.81
No confident predictions found. Lowering threshold to 0.79
No confident predictions found. Lowering threshold to 0.77
No confident predictions found. Lowering threshold to 0.75


Now lets train a classifier based on our labeled data, and measure its performance!

In [162]:
X_train, X_test, y_train, y_test = train_test_split(final_X_labeled, final_y_labeled, test_size=0.2, random_state=42)

clf = LogisticRegression(
            max_iter=1000,
            random_state=42,
            solver='newton-cg'
        )
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, predictions))

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, predictions))





Accuracy: 0.8349514563106796

Classification Report:
              precision    recall  f1-score   support

       angry       0.90      0.86      0.88        21
       happy       0.89      0.96      0.92        57
     relaxed       0.75      0.40      0.52        15
         sad       0.54      0.70      0.61        10

    accuracy                           0.83       103
   macro avg       0.77      0.73      0.73       103
weighted avg       0.84      0.83      0.83       103



People say numbers don't lie, I believe they sometimes might. So lets sample some of our predictions and manually check if they make sense!

In [202]:
checking_df = features_df.loc[final_y_labeled.index]['filename']
checking_df = pd.DataFrame(checking_df)
checking_df['mood'] = final_y_labeled
checking_df = checking_df.drop(labeled_df.index)
checking_df.sample()

Unnamed: 0,filename,mood
224,country.00024.wav,sad
