In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import MiniBatchNMF
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# configurations
# Path of the CSV read in chunks by process_big_data(); that function reads
# the 'lyrics' and 'tag' columns and filters on 'language'.
FILE_PATH = 'song_lyrics.csv' 
# Number of CSV rows loaded per chunk (passed to pd.read_csv as chunksize).
CHUNK_SIZE = 20000       
# process_big_data() stops reading further chunks once at least this many
# filtered rows have been processed (checked once per chunk, so the final
# count can exceed the limit by up to one chunk).
TRAIN_LIMIT = 500000      
# Number of NMF components, i.e. the number of feature columns produced per song.
N_TOPICS = 100             
# Maximum number of rows kept per tag when the dataset is balanced
# (passed to balance_dataset as samples_per_class).
SAMPLES_PER_GENRE = 10000  

# preprocessing function
def fast_clean(text):
    """Normalize one lyrics value into lowercase letters and whitespace.

    Steps, in order:
      1. Anything that is not a ``str`` (e.g. NaN) becomes the empty string.
      2. The text is lowercased and every character other than ``a-z`` or
         whitespace is deleted (not replaced by a space, so "don't" -> "dont").
      3. The standalone words ooh/yeah/ah/verse/chorus/intro/outro are deleted;
         the whitespace around them is left in place.

    Args:
        text: Raw lyrics value.

    Returns:
        The cleaned string; ``""`` for non-string input.
    """
    if isinstance(text, str):
        letters_only = re.sub(r'[^a-z\s]', '', text.lower())
        # Filler interjections and section labels carry no genre signal.
        return re.sub(r'\b(ooh|yeah|ah|verse|chorus|intro|outro)\b', '', letters_only)
    return ""

In [3]:
# data processing and feature extraction
def _prepare_chunk(chunk):
    """Filter one raw CSV chunk and add a 'clean_lyrics' column.

    Keeps English rows (only when a 'language' column exists) that have both
    'lyrics' and 'tag', and returns an independent copy so the column
    assignment does not write into a view of the caller's frame.
    """
    if 'language' in chunk.columns:
        chunk = chunk[chunk['language'] == 'en']
    chunk = chunk.dropna(subset=['lyrics', 'tag']).copy()
    chunk['clean_lyrics'] = chunk['lyrics'].apply(fast_clean)
    return chunk


def process_big_data():
    """Stream FILE_PATH in chunks and turn lyrics into NMF topic features.

    Pipeline:
      1. Fit the TF-IDF vocabulary on the first chunk that still has rows
         after filtering (the vocabulary is NOT refined by later chunks).
      2. Re-read the file chunk by chunk, vectorize each chunk and update the
         MiniBatchNMF model incrementally with ``partial_fit``, until at
         least TRAIN_LIMIT rows have been processed.
      3. Only after the model has seen every chunk, project all chunks with
         the *final* model. Projecting each chunk right after its own
         ``partial_fit`` would encode early rows with a barely trained model
         and later rows with a different one, making the feature columns
         incomparable across rows.

    The sparse TF-IDF matrices are held in memory between steps 2 and 3.

    Returns:
        (X, y): ``X`` is a 2-D array with N_TOPICS columns, ``y`` is a 1-D
        array of the matching 'tag' values. Both are empty 1-D arrays when
        no usable rows were found.
    """
    nmf_model = MiniBatchNMF(n_components=N_TOPICS, random_state=42)
    tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1, 2))

    # Pass 1: fit the vocabulary on the first chunk that has usable rows.
    vocabulary_fitted = False
    with pd.read_csv(FILE_PATH, chunksize=CHUNK_SIZE) as reader:
        for raw_chunk in reader:
            first_chunk = _prepare_chunk(raw_chunk)
            if first_chunk.empty:
                continue
            tfidf_vectorizer.fit(first_chunk['clean_lyrics'])
            vocabulary_fitted = True
            break

    if not vocabulary_fitted:
        print("error dataset is empty")
        return np.array([]), np.array([])

    # Pass 2: vectorize every chunk and train the NMF model incrementally.
    tfidf_chunks = []
    y_list = []
    total_processed = 0

    with pd.read_csv(FILE_PATH, chunksize=CHUNK_SIZE) as reader:
        for raw_chunk in reader:
            if total_processed >= TRAIN_LIMIT:
                break

            chunk = _prepare_chunk(raw_chunk)
            if chunk.empty:
                continue

            tfidf_matrix = tfidf_vectorizer.transform(chunk['clean_lyrics'])
            nmf_model.partial_fit(tfidf_matrix)

            tfidf_chunks.append(tfidf_matrix)
            y_list.extend(chunk['tag'].tolist())
            total_processed += len(chunk)

    if not tfidf_chunks:
        return np.array([]), np.array([])

    # Project everything with the fully trained model so all rows share one basis.
    X = np.vstack([nmf_model.transform(tfidf_matrix) for tfidf_matrix in tfidf_chunks])
    y = np.array(y_list)

    return X, y

In [4]:
def balance_dataset(X, y, samples_per_class, random_state=42):
    """Down-sample over-represented classes and shuffle the result.

    Every class with more than ``samples_per_class`` rows is randomly reduced
    to exactly that many rows (sampling without replacement); smaller classes
    are kept whole. No up-sampling is performed, so the output is only capped,
    not perfectly balanced.

    Args:
        X: NumPy array of features, indexed along axis 0 in step with ``y``.
        y: 1-D array-like of class labels.
        samples_per_class: Maximum number of rows to keep per class.
        random_state: Seed for the sampling and the final shuffle. Defaults
            to 42 so repeated runs select the same rows; pass ``None`` for
            non-deterministic sampling.

    Returns:
        (X_balanced, y_balanced): the selected rows in shuffled order, with
        features and labels still aligned. Empty input yields empty output.
    """
    rng = np.random.default_rng(random_state)
    y = np.asarray(y)

    selected_per_class = []
    for genre in np.unique(y):
        genre_indices = np.flatnonzero(y == genre)
        if len(genre_indices) > samples_per_class:
            genre_indices = rng.choice(genre_indices, size=samples_per_class, replace=False)
        selected_per_class.append(genre_indices)

    if selected_per_class:
        indices_to_keep = np.concatenate(selected_per_class)
    else:
        # Explicit integer dtype: an empty float array cannot be used as an index.
        indices_to_keep = np.empty(0, dtype=np.intp)

    rng.shuffle(indices_to_keep)

    return X[indices_to_keep], y[indices_to_keep]

In [None]:
# End-to-end run: extract topic features, balance genres, train three
# classifiers, report the one with the best weighted F1 and plot its
# confusion matrix.
if __name__ == "__main__":
    # Seed the global NumPy RNG so any sampling that relies on it is repeatable.
    np.random.seed(42)

    X, y = process_big_data()

    if len(X) == 0:
        print("no data to process")
    else:
        X, y = balance_dataset(X, y, samples_per_class=SAMPLES_PER_GENRE)

        # A stratified split needs at least two samples of every class;
        # drop singleton genres instead of letting train_test_split raise.
        genre_counts = pd.Series(y).value_counts()
        rare_genres = genre_counts[genre_counts < 2].index.tolist()
        if rare_genres:
            print(f"dropping genres with fewer than 2 samples: {rare_genres}")
            keep_mask = ~np.isin(y, rare_genres)
            X, y = X[keep_mask], y[keep_mask]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

        models = {
            "Logistic Regression": LogisticRegression(max_iter=1000, n_jobs=-1),
            "SVM (Linear)": LinearSVC(dual=False, max_iter=1000),
            # random_state pinned so the forest (and therefore the winner) is reproducible.
            "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=50, n_jobs=-1, random_state=42)
        }

        # name -> {"accuracy", "f1", "y_pred", "report"} for every model that trained successfully
        results = {}

        for name, model in models.items():
            print(f"\ntraining {name}")
            try:
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                acc = accuracy_score(y_test, y_pred)

                # Weighted averages account for any remaining class-size differences.
                precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted', zero_division=0)

                print(f"--> {name} Results:")
                print(f"    Accuracy:  {acc:.4f}")
                print(f"    Precision: {precision:.4f}")
                print(f"    Recall:    {recall:.4f}")
                print(f"    F1-Score:  {f1:.4f}")

                full_report = classification_report(y_test, y_pred, zero_division=0)
                results[name] = {"accuracy": acc, "f1": f1, "y_pred": y_pred, "report": full_report}

            except Exception as e:
                # Best effort: one failing model must not stop the comparison.
                print(f"error training {name}: {e}")

        if results:
            # NOTE: the winner is picked on the same test split that is reported,
            # so its score is an optimistic estimate.
            best_model_name = max(results, key=lambda x: results[x]['f1'])

            print(f"WINNER: {best_model_name}")
            print(f"Accuracy: {results[best_model_name]['accuracy']:.2%}")
            print(f"F1-Score: {results[best_model_name]['f1']:.4f}")

            print("\nDetailed Per-Genre Breakdown:")
            print(results[best_model_name]['report'])

            # confusion matrix plot
            try:
                # Order the axes by how often each genre occurs in the test set.
                top_genres = pd.Series(y_test).value_counts().index
                best_preds = results[best_model_name]['y_pred']

                cm = confusion_matrix(y_test, best_preds, labels=top_genres)

                fig, ax = plt.subplots(figsize=(12, 10))
                sns.heatmap(cm, annot=True, fmt='d', xticklabels=top_genres, yticklabels=top_genres, cmap='Blues', ax=ax)
                ax.set_title(f"Confusion Matrix (Balanced to {SAMPLES_PER_GENRE}/genre): {best_model_name}")
                ax.set_xlabel("Predicted")
                ax.set_ylabel("Actual")
                plt.show()
            except Exception as e:
                print(f"Could not plot matrix: {e}")