In [47]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [9]:
df = pd.read_csv('label_data.csv', delimiter=';')

In [10]:
# 1. Explore the data
df.head()
df.info()
df.describe()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9161 entries, 0 to 9160
Data columns (total 22 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Rank                              9161 non-null   int64  
 1   Title                             9161 non-null   object 
 2   Artists                           9161 non-null   object 
 3   Date                              9161 non-null   object 
 4   Danceability                      9161 non-null   float64
 5   Energy                            9161 non-null   float64
 6   Loudness                          9161 non-null   float64
 7   Speechiness                       9161 non-null   float64
 8   Acousticness                      9161 non-null   float64
 9   Instrumentalness                  9161 non-null   float64
 10  Valence                           9161 non-null   float64
 11  # of Artist                       9161 non-null   object 
 12  Artist

(9161, 22)

In [11]:
# 2. Check for missing values
df.isnull().sum()

# Handle missing values if any
# df.dropna(inplace=True)  # or
# df.fillna(method='ffill', inplace=True)

Rank                                0
Title                               0
Artists                             0
Date                                0
Danceability                        0
Energy                              0
Loudness                            0
Speechiness                         0
Acousticness                        0
Instrumentalness                    0
Valence                             0
# of Artist                         0
Artist (Ind.)                       0
# of Nationality                    0
Nationality                         0
Continent                           0
Points (Total)                      0
Points (Ind for each Artist/Nat)    0
id                                  0
Song URL                            0
Loudness_norm                       0
Popular                             0
dtype: int64

In [13]:
# Check column names
print(df.columns.tolist())

['Rank', 'Title', 'Artists', 'Date', 'Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence', '# of Artist', 'Artist (Ind.)', '# of Nationality', 'Nationality', 'Continent', 'Points (Total)', 'Points (Ind for each Artist/Nat)', 'id', 'Song URL', 'Loudness_norm', 'Popular']


In [15]:
# 4. Separate features (X) and target (y)
# Replace 'target_column' with your actual target column name
X = df.drop('Popular', axis=1)
y = df['Popular']

In [34]:
# 7. Split data into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [35]:
# K-Fold Cross-Validation
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=42, max_iter=1000)

# Perform 5-fold cross-validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Score: {cv_scores.mean():.4f}")
print(f"Std CV Score: {cv_scores.std():.4f}")

Cross-Validation Scores: [0.73813421 0.73799127 0.73799127 0.73853712 0.73853712]
Mean CV Score: 0.7382
Std CV Score: 0.0002


In [36]:
# Check data types of all columns
print(df.dtypes)
print("\n")
print(df.head())

Rank                                  int64
Title                                object
Artists                              object
Date                                 object
Danceability                        float64
Energy                              float64
Loudness                            float64
Speechiness                         float64
Acousticness                        float64
Instrumentalness                    float64
Valence                             float64
# of Artist                          object
Artist (Ind.)                        object
# of Nationality                     object
Nationality                          object
Continent                            object
Points (Total)                        int64
Points (Ind for each Artist/Nat)    float64
id                                   object
Song URL                             object
Loudness_norm                       float64
Popular                               int64
dtype: object


   Rank         

In [37]:
# Identify and handle categorical columns
# Option 1: Drop non-numeric columns (like song names, artist names)
# These are usually identifiers and don't help with prediction

# Get only numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print("Numeric columns:", numeric_cols)

# Remove target from numeric columns if present
if 'Popular' in numeric_cols:
    numeric_cols.remove('Popular')

X = df[numeric_cols]
y = df['Popular']

Numeric columns: ['Rank', 'Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence', 'Points (Total)', 'Points (Ind for each Artist/Nat)', 'Loudness_norm', 'Popular']


In [38]:
# Select specific features
feature_cols = ['Danceability', 'Energy', 'Loudness_norm', 'Speechiness', 
                'Acousticness', 'Instrumentalness', 'Valence']

# Check if all features exist in the dataframe
print("Available columns:", df.columns.tolist())
print("\nRequested features:", feature_cols)

# Verify which features are present
missing_cols = [col for col in feature_cols if col not in df.columns]
if missing_cols:
    print(f"\nWarning: These columns are missing: {missing_cols}")
else:
    print("\nAll requested features are available!")

# Create X with only the selected features
X = df[feature_cols]
y = df['Popular']

print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")

Available columns: ['Rank', 'Title', 'Artists', 'Date', 'Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence', '# of Artist', 'Artist (Ind.)', '# of Nationality', 'Nationality', 'Continent', 'Points (Total)', 'Points (Ind for each Artist/Nat)', 'id', 'Song URL', 'Loudness_norm', 'Popular']

Requested features: ['Danceability', 'Energy', 'Loudness_norm', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence']

All requested features are available!

X shape: (9161, 7)
y shape: (9161,)


In [39]:
# Now proceed with K-Fold Cross-Validation using selected features
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model = LogisticRegression(random_state=42, max_iter=1000)
cv_scores = cross_val_score(model, X_scaled, y, cv=5, scoring='accuracy')

print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Score: {cv_scores.mean():.4f}")
print(f"Std CV Score: {cv_scores.std():.4f}")

Cross-Validation Scores: [0.73813421 0.73799127 0.73799127 0.73853712 0.73853712]
Mean CV Score: 0.7382
Std CV Score: 0.0002


In [41]:
# Then train final model on full training set and evaluate on test set
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report
print(f"\nTest Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"\nClassification Report:\n{classification_report(y_test, y_pred)}")


Test Accuracy: 0.7352

Classification Report:
              precision    recall  f1-score   support

           0       0.74      1.00      0.85      2021
           1       0.00      0.00      0.00       728

    accuracy                           0.74      2749
   macro avg       0.37      0.50      0.42      2749
weighted avg       0.54      0.74      0.62      2749



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [42]:
# 1. Check class distribution
print("Class distribution:")
print(y.value_counts())
print("\nClass proportions:")
print(y.value_counts(normalize=True))

Class distribution:
Popular
0    6763
1    2398
Name: count, dtype: int64

Class proportions:
Popular
0    0.738238
1    0.261762
Name: proportion, dtype: float64


In [44]:
# 2. Handle class imbalance - Use class_weight parameter
from sklearn.linear_model import LogisticRegression

# Automatically balance classes
model = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f"\nTest Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"\nClassification Report:\n{classification_report(y_test, y_pred)}")


Test Accuracy: 0.5504

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.55      0.64      1686
           1       0.30      0.55      0.39       605

    accuracy                           0.55      2291
   macro avg       0.54      0.55      0.52      2291
weighted avg       0.65      0.55      0.58      2291



In [48]:
# 4. Try different models that handle imbalance better
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Random Forest with balanced class weights
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced', n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("Random Forest Results:")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(f"\nClassification Report:\n{classification_report(y_test, y_pred_rf)}")
print(confusion_matrix(y_test, y_pred_rf))

Random Forest Results:
Test Accuracy: 0.7460

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.96      0.85      1686
           1       0.57      0.16      0.25       605

    accuracy                           0.75      2291
   macro avg       0.66      0.56      0.55      2291
weighted avg       0.71      0.75      0.69      2291

[[1611   75]
 [ 507   98]]


In [33]:
# 5. Adjust decision threshold
from sklearn.metrics import roc_curve

# Get probability predictions
y_proba = model.predict_proba(X_test)[:, 1]

# Try different threshold (default is 0.5)
threshold = 0.3  # Lower threshold to predict more class 1
y_pred_adjusted = (y_proba >= threshold).astype(int)

print(f"\nWith threshold {threshold}:")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_adjusted):.4f}")
print(f"\nClassification Report:\n{classification_report(y_test, y_pred_adjusted)}")


With threshold 0.3:
Test Accuracy: 0.2649

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      1686
           1       0.26      1.00      0.42       605

    accuracy                           0.26      2291
   macro avg       0.63      0.50      0.21      2291
weighted avg       0.81      0.26      0.11      2291



In [50]:
# Try XGBoost with scale_pos_weight
!pip install xgboost
from xgboost import XGBClassifier

# Calculate the ratio for class imbalance
class_ratio = (y_train == 0).sum() / (y_train == 1).sum()
print(f"Class ratio (class 0 / class 1): {class_ratio:.2f}")

# Train XGBoost model
xgb_model = XGBClassifier(
    scale_pos_weight=class_ratio,  # Handle class imbalance
    random_state=42,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1
)

xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print("\nXGBoost Results:")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print(f"\nClassification Report:\n{classification_report(y_test, y_pred_xgb)}")
print(f"\nConfusion Matrix:\n{confusion_matrix(y_test, y_pred_xgb)}")

Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.1-py3-none-macosx_12_0_arm64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m11.3 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.1.1
Class ratio (class 0 / class 1): 2.83

XGBoost Results:
Test Accuracy: 0.6032

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.65      0.71      1686
           1       0.33      0.48      0.39       605

    accuracy                           0.60      2291
   macro avg       0.55      0.57      0.55      2291
weighted avg       0.66      0.60      0.62      2291


Confusion Matrix:
[[1089  597]
 [ 312  293]]
