In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# from google.colab import drive
# drive.mount('/content/drive')
# import pandas as pd

In [None]:
# Update this path to match your file's location in Google Drive
# file_path = '/content/drive/MyDrive/features.csv'
# df = pd.read_csv(file_path, on_bad_lines='skip')

# Load the extracted feature table; on_bad_lines='skip' drops malformed rows
# instead of aborting the read.
df = pd.read_csv('features.csv', on_bad_lines='skip')

# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly (wrapping it in print() appends a stray "None").
df.info()
print("\nMissing values per column:")
print(df.isnull().sum())
print("\nLabel distribution:")
print(df['label'].value_counts())

# Show a small preview rather than the whole frame.
df.head()

In [None]:
# Keep only the rows that actually carry an 'energy' value; the raw frame `df`
# is left untouched so the two shapes can be compared below.
has_energy = df['energy'].notna()
df_clean = df[has_energy]

print(f"Original shape: {df.shape}, Cleaned shape: {df_clean.shape}")

In [None]:
# Add two derived columns to the cleaned frame:
#   snr         - approximate signal-to-noise ratio: mean RMS divided by its
#                 standard deviation (the 1e-10 term avoids division by zero)
#   vocal_range - spread between the highest and lowest pitch
df_clean = df_clean.assign(
    snr=df_clean['rms_mean'] / (df_clean['rms_std'] + 1e-10),
    vocal_range=df_clean['pitch_max'] - df_clean['pitch_min'],
)

In [None]:
# Bar chart of how many rows fall under each label value in the cleaned data.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df_clean, x='label', ax=ax)
ax.set_title('Distribution of Labels')
plt.show()

In [None]:
# label_0_indices = df_clean[df_clean['label'] == 0].index[:100000]
# df_clean = df_clean.drop(label_0_indices)

In [None]:
import pandas as pd
from sklearn.utils import resample


def _upsample(frame, n_samples, seed=42):
    """Draw `n_samples` rows from `frame` with replacement, using a fixed seed."""
    return resample(frame, replace=True, n_samples=n_samples, random_state=seed)


def _balance_two_groups(frame, column, first_value, second_value):
    """Equalise the sizes of two groups of `frame` defined by `column`.

    Only rows whose `column` equals `first_value` or `second_value` are kept;
    rows holding any other value are dropped. The smaller group is upsampled
    (with replacement) to the size of the larger one. When the first group is
    not smaller, the second group is the one that gets resampled.

    Returns the first group followed by the second group.
    """
    first = frame[frame[column] == first_value]
    second = frame[frame[column] == second_value]
    target_size = max(len(first), len(second))

    # Rebind the group that was resampled so the concat below never has to
    # guess which variable holds the upsampled rows. (The previous version
    # re-tested the sizes *after* upsampling and then referenced a variable
    # that had never been assigned, raising NameError whenever the first
    # group was the smaller one.)
    if len(first) < len(second):
        first = _upsample(first, target_size)
    else:
        second = _upsample(second, target_size)

    return pd.concat([first, second])


# NOTE(review): oversampling happens before the train/test split performed in
# later cells, so copies of the same row can land in both splits and inflate
# the reported scores. Consider resampling the training split only.
# NOTE(review): each stage resamples the output of the previous one, so the
# label counts equalised in step 1 are not guaranteed to stay equal after
# steps 2 and 3.

# 1. Balance 'label' categories: classes 1-3 are upsampled to the size of class 0
label_groups = {label: df_clean[df_clean.label == label] for label in (0, 1, 2, 3)}
n_reference = len(label_groups[0])
df_label_balanced = pd.concat(
    [label_groups[0]]
    + [_upsample(label_groups[label], n_reference) for label in (1, 2, 3)]
)

# 2. Balance 'gender' categories (only 'male' / 'female' rows are retained)
df_gender_balanced = _balance_two_groups(df_label_balanced, 'gender', 'male', 'female')

# 3. Balance 'age' categories (only 'twenties' / 'fifties' rows are retained)
df_age_balanced = _balance_two_groups(df_gender_balanced, 'age', 'twenties', 'fifties')

# Shuffle the dataset and publish it under the name the following cells use
df_clean = df_age_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Label counts again, this time on the resampled df_clean.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df_clean, x='label', ax=ax)
ax.set_title('Balanced Distribution of Labels (After Oversampling)')
plt.show()


In [None]:
# Row counts per gender value. The title now names the column being plotted
# (it previously read "Distribution of Labels").
plt.figure(figsize=(8,5))
sns.countplot(x='gender', data=df_clean)
plt.title('Distribution of Gender')
plt.show()

In [None]:
# Row counts per age group. The title now names the column being plotted
# (it previously read "Distribution of Labels").
plt.figure(figsize=(8,5))
sns.countplot(x='age', data=df_clean)
plt.title('Distribution of Age')
plt.show()

In [None]:
features_to_plot = ['energy', 'pitch_mean', 'spectral_centroid_mean', 'tempo', 'zcr_mean']

# One box plot per feature, split by label, placed on a 2x3 grid. Only five
# features are drawn, so the sixth grid position is left without an axes.
fig = plt.figure(figsize=(15, 10))
for position, feature in enumerate(features_to_plot, start=1):
    ax = fig.add_subplot(2, 3, position)
    sns.boxplot(data=df_clean, x='label', y=feature, ax=ax)
    ax.set_title(f'{feature} by Label')
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations, restricted to the numeric columns of df_clean
corr_matrix = (
    df_clean
    .select_dtypes(include=[np.number])
    .corr()
)

# Heatmap without cell annotations; the colour scale is centred on zero
fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()

In [None]:
# Audio descriptors used as model inputs, grouped by family.
_pitch_columns = ['pitch_mean', 'pitch_std']
_spectral_columns = [
    'spectral_centroid_mean',
    'spectral_bandwidth_mean',
    'spectral_rolloff_mean',
]
# MFCC columns are currently left out:
# 'mfcc_1','mfcc_2','mfcc_3','mfcc_4','mfcc_5','mfcc_6','mfcc_7','mfcc_8','mfcc_9','mfcc_10','mfcc_11','mfcc_12','mfcc_13'
selected_features = [*_pitch_columns, *_spectral_columns, 'energy']

# Categorical speaker attributes kept alongside the audio descriptors.
demographic_features = ['age', 'gender']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Target is the 'label' column; inputs are the audio descriptors plus the
# two demographic columns.
y = df_clean['label']
X = df_clean[selected_features + demographic_features]

# 70/30 split, stratified on the label so both parts keep the same class mix
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Column groups handled by the preprocessor
numeric_features = selected_features
categorical_features = demographic_features

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen at fit time ignored at transform time.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
X.head()


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Prepare data (using only audio features): the demographic columns are
# removed from the inputs and 'gender' becomes the target.
gender_features = X_train.drop(['age', 'gender'], axis=1)
gender_target = X_train['gender']

# Hold out 30% of the training split to score the gender classifier
X_train_gender, X_test_gender, y_train_gender, y_test_gender = train_test_split(
    gender_features, gender_target, test_size=0.3, random_state=42)

# Train model. The forest is seeded so the reported accuracy is reproducible
# from run to run, matching the seed used for the models in the later cell.
gender_model = RandomForestClassifier(random_state=42)
gender_model.fit(X_train_gender, y_train_gender)

# Evaluate on the held-out rows
print("Gender Accuracy:", gender_model.score(X_test_gender, y_test_gender))

In [None]:
# Same setup as the gender classifier: audio features only, 'age' as target.
age_features = X_train.drop(['age', 'gender'], axis=1)
age_target = X_train['age']

# Hold out 30% of the training split to score the age classifier
X_train_age, X_test_age, y_train_age, y_test_age = train_test_split(
    age_features, age_target, test_size=0.3, random_state=42)

# Seeded forest so the reported accuracy is reproducible from run to run
age_model = RandomForestClassifier(random_state=42)
age_model.fit(X_train_age, y_train_age)

print("Age Accuracy:", age_model.score(X_test_age, y_test_age))

In [None]:
# label_features

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load your dataset (replace with your actual data loading)
# df_clean = pd.read_csv("your_data.csv")

# Define features (modify as needed)
# selected_features = ['feature1', 'feature2', ...]  # Your audio features
# demographic_features = ['age', 'gender']          # Demographic features

# Separate features and target
X = df_clean[selected_features + demographic_features]
y = df_clean['label']

# Split data (stratify by label)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# ==============================================
# STEP 1: Train Demographic Predictors (Gender/Age)
# ==============================================

# Gender prediction model (using only audio features)
X_train_gender = X_train[selected_features]
y_train_gender = X_train['gender']

gender_model = RandomForestClassifier(random_state=42)
gender_model.fit(X_train_gender, y_train_gender)

# Age prediction model (using only audio features)
X_train_age = X_train[selected_features]
y_train_age = X_train['age']

age_model = RandomForestClassifier(random_state=42)
age_model.fit(X_train_age, y_train_age)

# Evaluate demographic models on the held-out split. The predictions are kept
# so they can be reused as inputs to the final model instead of being
# recomputed.
print("=== Gender Model Evaluation ===")
X_test_gender = X_test[selected_features]
test_pred_gender = gender_model.predict(X_test_gender)
print(classification_report(X_test['gender'], test_pred_gender))

print("\n=== Age Model Evaluation ===")
X_test_age = X_test[selected_features]
test_pred_age = age_model.predict(X_test_age)
print(classification_report(X_test['age'], test_pred_age))

# ==============================================
# STEP 2: Prepare Features for Final Model
# ==============================================

# Create encoders (fit only on training data)
gender_encoder = LabelEncoder().fit(X_train['gender'])
age_encoder = LabelEncoder().fit(X_train['age'])

# Training-set demographics are predicted out-of-fold: every row is labelled by
# a forest that did not see that row during fitting. Predicting the training
# rows with gender_model / age_model themselves would yield in-sample (overly
# accurate) demographics, so the final model would be trained on cleaner
# inputs than it receives at test / inference time.
# NOTE(review): df_clean was oversampled with replacement before the split, so
# exact duplicates of a row may still sit in different folds.
X_train_pred = X_train[selected_features].copy()
X_train_pred['pred_gender'] = cross_val_predict(
    RandomForestClassifier(random_state=42), X_train_gender, y_train_gender, cv=5)
X_train_pred['pred_age'] = cross_val_predict(
    RandomForestClassifier(random_state=42), X_train_age, y_train_age, cv=5)

# Test-set demographics come from the fully trained models, which is also what
# predict_label does at inference time.
X_test_pred = X_test[selected_features].copy()
X_test_pred['pred_gender'] = test_pred_gender
X_test_pred['pred_age'] = test_pred_age

# Encode predicted demographics as integers using the training-fit encoders
X_train_pred['pred_gender'] = gender_encoder.transform(X_train_pred['pred_gender'])
X_train_pred['pred_age'] = age_encoder.transform(X_train_pred['pred_age'])
X_test_pred['pred_gender'] = gender_encoder.transform(X_test_pred['pred_gender'])
X_test_pred['pred_age'] = age_encoder.transform(X_test_pred['pred_age'])

# ==============================================
# STEP 3: Train Final Model
# ==============================================

# Preprocessing pipeline: scale the audio features, one-hot encode the
# (integer-coded) predicted demographics.
numeric_features = selected_features
categorical_features = ['pred_gender', 'pred_age']

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Final model pipeline
final_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train
final_model.fit(X_train_pred, y_train)

# Evaluate
print("\n=== Final Model Evaluation ===")
y_pred = final_model.predict(X_test_pred)
print(classification_report(y_test, y_pred))

# ==============================================
# STEP 4: Prediction Function
# ==============================================

def predict_label(audio_features):
    """Predict the label for one sample using the full two-stage pipeline.

    Parameters
    ----------
    audio_features : pandas.DataFrame, or a single record
        Either a DataFrame containing at least the `selected_features`
        columns, or one record (a sequence of values in `selected_features`
        order, or a mapping keyed by those names) that is wrapped into a
        one-row DataFrame.

    Returns
    -------
    The label predicted by `final_model` for the first row of the input.
    When a DataFrame with several rows is passed, only the first row's
    prediction is returned.

    Raises
    ------
    KeyError
        If a DataFrame input lacks one of the `selected_features` columns.
    """
    # Convert to DataFrame if needed
    if not isinstance(audio_features, pd.DataFrame):
        audio_features = pd.DataFrame([audio_features], columns=selected_features)

    # Restrict to the training columns, in training order. The demographic
    # models were fitted on exactly these columns, so extra or reordered
    # columns in a caller-supplied DataFrame would otherwise be rejected.
    # The copy keeps the caller's frame unmodified.
    features = audio_features[selected_features].copy()

    # Predict demographics from the audio features alone
    pred_gender = gender_model.predict(features)
    pred_age = age_model.predict(features)

    # Encode demographics row by row with the training-fit encoders and
    # attach them under the column names the final model was trained with
    features['pred_gender'] = gender_encoder.transform(pred_gender)
    features['pred_age'] = age_encoder.transform(pred_age)

    # Make final prediction
    return final_model.predict(features)[0]