## Emotion classifier using semi-supervised learning 

This notebook contains methods that use semi-supervised learning to label each entry of the GTZAN dataset with a mood.

In [204]:
# import required libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import joblib


# Relative path (resolved against the notebook's working directory) of the
# feature CSV that is loaded with pd.read_csv in the next code cell.
path_to_csv = 'gtzan-dataset-music-genre-classification/features_30_sec.csv'

The idea is to manually label 150 samples (15 per genre), obtained using stratified sampling.

In [None]:
# Load the full feature table from the CSV configured in `path_to_csv`.
features_df = pd.read_csv(path_to_csv)

# Preview only the first rows (like the other cells do) instead of echoing the
# whole frame.
features_df.head()

In [None]:
# Stratified sample: draw the same number of tracks from every genre ('label')
# so that each genre is equally represented in the sheet that gets hand-labeled.
SAMPLES_PER_GENRE = 15
SAMPLING_SEED = 42  # fixed seed so re-running this cell reproduces the same sheet

sampled_df = (
    features_df[['filename', 'label']]
    .groupby('label', group_keys=False)
    .sample(n=SAMPLES_PER_GENRE, random_state=SAMPLING_SEED)
    # 'label' is only needed for stratification; the sheet holds filename + mood.
    .drop(columns='label')
    # Shuffle so the rows are not grouped by genre in the labeling sheet.
    .sample(frac=1, random_state=SAMPLING_SEED)
    # Empty column to be filled in by hand; a scalar broadcasts to every row,
    # so nothing depends on the total number of sampled rows.
    .assign(mood='')
)

# The original row index of features_df is written as the first CSV column; it
# is what later links the hand-made labels back to the feature rows.
sampled_df.to_csv('gtzan-dataset-music-genre-classification/sampled_features.csv')

Now let's manually label the data :)

In [None]:
# Location of the hand-labeled sheet. Its first column is used as the index and
# is expected to hold the original row index of features_df (as written by the
# sampling cell), so labels can be joined back onto the features by index.
path_to_labeled_csv = 'gtzan-dataset-music-genre-classification/sampled_features_labeled.csv'
labeled_df = pd.read_csv(path_to_labeled_csv, index_col=0)

# Quick sanity check of the distinct mood labels that were entered by hand.
labeled_df['mood'].unique()

Before we merge our labels back into our data, let's drop irrelevant features and normalize the remaining ones.

In [None]:
# Remove the identifier, clip length and genre columns; they are not used as
# model inputs.
X = features_df.drop(columns=['filename', 'length', 'label'])

# Additionally discard every column whose name ends in 'var'.
var_columns = [name for name in X.columns if name.endswith('var')]
X = X.drop(columns=var_columns)

# Standardize the remaining features, then wrap the resulting array back into
# a DataFrame carrying the original column names and row index.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X_normalized = pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

# Persist the fitted scaler to disk.
joblib.dump(scaler, 'scaler.pkl')

X_normalized.head()

Let's now add the features back to our labeled samples.

In [None]:
# Pick the normalized feature rows of the hand-labeled tracks (matched by row
# index) and attach their mood as an extra column.
labeled_features = X_normalized.loc[labeled_df.index]
merged_df = labeled_features.assign(mood=labeled_df['mood'])
merged_df.head()

Now let's get our remaining data ready for labeling.

In [None]:
# Everything that was not hand-labeled forms the unlabeled pool.
X_unlabeled = X_normalized.drop(index=labeled_df.index)
X_unlabeled.head()

In [None]:
# Split the hand-labeled frame into the target (mood) and the feature matrix.
y_labeled = merged_df['mood']
X_labeled = merged_df.drop(columns='mood')
X_labeled.columns

In [None]:
def semi_supervised_labeling(X_labeled, y_labeled, X_unlabeled,
                             initial_threshold=0.85, min_threshold=0.75, threshold_decay=0.02):
    """Grow a labeled set by self-training a logistic-regression classifier.

    Each round fits a classifier on the current labeled data, predicts the
    unlabeled rows, and moves every row whose top class probability reaches the
    current confidence threshold into the labeled set (using the predicted
    class as its label). When a round finds no such row, the threshold is
    lowered by ``threshold_decay``. The loop ends once the threshold falls
    below ``min_threshold`` or no unlabeled rows remain.

    The inputs are not modified in place; new objects are returned.

    Parameters
    ----------
    X_labeled : pandas.DataFrame
        Feature rows that already have a label.
    y_labeled : pandas.Series
        Labels for ``X_labeled``.
    X_unlabeled : pandas.DataFrame
        Feature rows without a label, with the same columns as ``X_labeled``.
    initial_threshold : float, default 0.85
        Confidence required in the first rounds.
    min_threshold : float, default 0.75
        Lowest confidence that is still accepted.
    threshold_decay : float, default 0.02
        Amount subtracted from the threshold after a round that adds nothing.
        Must be positive.

    Returns
    -------
    tuple
        ``(X_labeled, y_labeled, X_unlabeled, results_history)`` where the
        first three are the updated data sets and ``results_history`` is a list
        with one dict per productive round (keys ``threshold``,
        ``samples_added``, ``total_labeled``, ``confidence_mean``).

    Raises
    ------
    ValueError
        If ``threshold_decay`` is not positive (the loop could never end).
    """
    if threshold_decay <= 0:
        raise ValueError("threshold_decay must be positive")

    # Tolerance so accumulated floating-point error from repeated subtraction
    # cannot skip the final round at exactly `min_threshold`.
    tolerance = 1e-9

    current_threshold = initial_threshold
    results_history = []

    # Stop as well when the unlabeled pool is exhausted: predicting on an empty
    # frame would raise inside scikit-learn.
    while current_threshold >= min_threshold - tolerance and len(X_unlabeled) > 0:
        # Re-train on everything labeled so far (hand labels + pseudo-labels).
        clf = LogisticRegression(
            max_iter=1000,
            random_state=42,
            solver='newton-cg'
        )
        clf.fit(X_labeled, y_labeled)

        # Predicted class and the probability of the most likely class per row.
        predictions = clf.predict(X_unlabeled)
        confidence_scores = np.max(clf.predict_proba(X_unlabeled), axis=1)

        # Compare against the *current* threshold so that lowering it actually
        # admits more samples.
        high_confidence_mask = confidence_scores >= current_threshold
        samples_added = int(high_confidence_mask.sum())

        if samples_added == 0:
            current_threshold -= threshold_decay
            print(f"No confident predictions found. Lowering threshold to {current_threshold:.2f}")
            continue

        # Build the newly labeled rows, keeping their original index labels.
        confident_indices = X_unlabeled[high_confidence_mask].index
        new_X_labeled = X_unlabeled.loc[confident_indices]
        new_y_labeled = pd.Series(predictions[high_confidence_mask], index=confident_indices)

        # Move them from the unlabeled pool into the labeled set.
        X_labeled = pd.concat([X_labeled, new_X_labeled])
        y_labeled = pd.concat([y_labeled, new_y_labeled])
        X_unlabeled = X_unlabeled.drop(confident_indices)

        # Record what this round did.
        results_history.append({
            'threshold': current_threshold,
            'samples_added': samples_added,
            'total_labeled': len(X_labeled),
            'confidence_mean': confidence_scores[high_confidence_mask].mean()
        })

        print(f"Added {samples_added} samples. Total labeled: {len(X_labeled)}")

    return X_labeled, y_labeled, X_unlabeled, results_history

# Pseudo-label the unlabeled pool, using the default confidence thresholds.
final_X_labeled, final_y_labeled, remaining_unlabeled, results = semi_supervised_labeling(
    X_labeled=X_labeled,
    y_labeled=y_labeled,
    X_unlabeled=X_unlabeled,
)

Now let's train a classifier based on our labeled data, and measure its performance!

In [None]:
# Hold out 20% of the labeled rows for evaluation.
# NOTE(review): final_y_labeled also contains labels predicted by the
# self-training loop, so the held-out rows are not all hand-verified; the
# scores below partly measure agreement with those pseudo-labels.
X_train, X_test, y_train, y_test = train_test_split(
    final_X_labeled,
    final_y_labeled,
    test_size=0.2,
    random_state=42,
)

# Same model configuration as the one used inside the self-training loop.
clf = LogisticRegression(solver='newton-cg', max_iter=1000, random_state=42)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

test_accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", test_accuracy)

# Per-class breakdown of the held-out predictions.
report_text = classification_report(y_test, predictions)
print("\nClassification Report:")
print(report_text)





People say numbers don't lie; I believe they sometimes might. So let's sample some of our predictions and manually check if they make sense!

In [None]:
# Pair each labeled track's filename with its mood, then remove the rows that
# were labeled by hand so only model-assigned moods remain for spot-checking.
checking_df = (
    features_df.loc[final_y_labeled.index, ['filename']]
    .assign(mood=final_y_labeled)
    .drop(index=labeled_df.index)
)

# Show one randomly chosen pseudo-labeled track.
checking_df.sample(n=1)

Looks pretty good to me :)

In [None]:

# Save the model: write the classifier fitted on the training split to disk.
# It was trained on features standardized by the scaler stored in 'scaler.pkl',
# so new inputs need the same scaling before prediction.
joblib.dump(clf, 'emotion_classifier.joblib')