In [12]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from scipy.sparse import hstack
from xgboost import XGBClassifier


In [2]:
df = pd.read_csv("mus.csv")


In [3]:
numerical_features = [
    'len', 'dating', 'violence', 'world/life', 'night/time',
    'shake the audience', 'family/gospel', 'romantic',
    'communication', 'obscene', 'music', 'movement/places',
    'light/visual perceptions', 'family/spiritual', 'like/girls',
    'sadness', 'feelings', 'danceability', 'loudness',
    'acousticness', 'instrumentalness', 'valence', 'energy', 'age'
]


In [4]:
df = df[numerical_features + ['lyrics', 'genre']].dropna()

In [5]:
le = LabelEncoder()
y = le.fit_transform(df['genre'])


In [6]:
scaler = StandardScaler()
X_num = scaler.fit_transform(df[numerical_features])

In [7]:
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')  # ограничим фичи
X_text = tfidf.fit_transform(df['lyrics'])

In [8]:
from scipy.sparse import csr_matrix
X_combined = hstack([csr_matrix(X_num), X_text])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42, stratify=y
)


In [10]:
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [11]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le.classes_))

              precision    recall  f1-score   support

       blues       0.43      0.35      0.39       921
     country       0.53      0.65      0.59      1089
     hip hop       0.83      0.56      0.67       181
        jazz       0.55      0.43      0.49       769
         pop       0.44      0.55      0.49      1408
      reggae       0.56      0.54      0.55       500
        rock       0.51      0.41      0.45       807

    accuracy                           0.50      5675
   macro avg       0.55      0.50      0.52      5675
weighted avg       0.51      0.50      0.50      5675



In [13]:
joblib.dump(model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(tfidf, 'tfidf.pkl')
joblib.dump(le, 'label_encoder.pkl')
joblib.dump(numerical_features, 'features.pkl')

['features.pkl']