In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [4]:
df = pd.read_csv("/content/dataset.csv")

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [7]:
df.track_genre.value_counts()

Unnamed: 0_level_0,count
track_genre,Unnamed: 1_level_1
acoustic,1000
afrobeat,1000
alt-rock,1000
alternative,1000
ambient,1000
...,...
techno,1000
trance,1000
trip-hop,1000
turkish,1000


In [8]:
y = (df['popularity'] > 0).astype(int)

In [9]:
X = df.select_dtypes(include=['number', 'bool'])

X.loc[:, 'explicit'] = X['explicit'].astype(int)

# droping populartity as we want to predict it
X = X.drop(columns=['popularity'])

  X.loc[:, 'explicit'] = X['explicit'].astype(int)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
scaler = StandardScaler()

In [12]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Model 1: Logistic Regression
logis_model = LogisticRegression(random_state=42, max_iter=1000)
logis_model.fit(X_train_scaled, y_train)
y_pred_logis = logis_model.predict(X_test_scaled)
acc_logis = accuracy_score(y_test, y_pred_logis)
print(f"Logistic Regression Accuracy: {acc_logis}")


Logistic Regression Accuracy: 0.8590350877192983


tried svc first but dataset is so huge taking more time
after searching for solution used linearsvc as it faster than that


In [14]:
from sklearn.svm import LinearSVC

In [15]:
# Model 2: Linear SVM (much faster)
lnr_svm_model = LinearSVC(random_state=42, max_iter=2000)

lnr_svm_model.fit(X_train_scaled, y_train)
y_pred_svm = lnr_svm_model.predict(X_test_scaled)

acc_svm = accuracy_score(y_test, y_pred_svm)
print(f"Linear SVM Accuracy: {acc_svm}")

Linear SVM Accuracy: 0.8590350877192983


In [17]:
# Model 3: Random Forest
rf_model = RandomForestClassifier(random_state=42)

rf_model.fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)

acc_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {acc_rf}")

Random Forest Accuracy: 0.9329824561403509


In [20]:
print(f"Logistic Regression: {acc_logis:.2%}")
print(f"Linear SVM:          {acc_svm:.2%}")
print(f"Random Forest:       {acc_rf:.2%}")

Logistic Regression: 85.90%
Linear SVM:          85.90%
Random Forest:       93.30%


In [21]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np # You'll need this for .mean() and .std()

In [22]:
rf_model_cv = RandomForestClassifier(random_state=42)

In [23]:
# cross-validation
scores = cross_val_score(rf_model_cv, X, y, cv=5, scoring='accuracy', n_jobs=-1)

print("Cross-validation finished.")

print(f"\nScores for each of the 5 folds: {scores}")
print(f"\nAverage Accuracy: {scores.mean():.2%}")
print(f"Standard Deviation: {scores.std():.4f}")

Cross-validation finished.

Scores for each of the 5 folds: [0.35166667 0.89276316 0.87785088 0.87644737 0.14298246]

Average Accuracy: 62.83%
Standard Deviation: 0.3181


In [24]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [25]:
rf_model_cv = RandomForestClassifier(random_state=42)

3. THE FIX: Create a Shuffled K-Fold
 This forces it to shuffle the data BEFORE splitting

In [38]:
from sklearn.model_selection import KFold

In [39]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(rf_model_cv, X, y, cv=kfold, scoring='accuracy', n_jobs=-1)

print("Cross-validation done")

print(f"\nScores for each of the 5 folds: {scores}")
print(f"\nAverage Accuracy: {scores.mean():.2%}")
print(f"Standard Deviation: {scores.std():.4f}")

Cross-validation done

Scores for each of the 5 folds: [0.93364035 0.93280702 0.93078947 0.9329386  0.93364035]

Average Accuracy: 93.28%
Standard Deviation: 0.0010


In [27]:
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.ensemble import RandomForestClassifier

 Create a Small sample
 because GridSearchCV takes a long time.
use train_test_split to get a random 10,000-row .


In [28]:
_, X_sample, _, y_sample = train_test_split(X, y, test_size=10000, random_state=42, stratify=y)

print(f"GridSearchCV on a sample:- {X_sample.shape[0]} rows")

GridSearchCV on a sample:- 10000 rows


In [29]:
rf = RandomForestClassifier(random_state=42)

In [30]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 30, None],
    'max_features': ['sqrt', 'log2']
}

 --- 3. Set up the Shuffled Cross-Validation ---
 We use the same shuffled k-fold as before

In [31]:

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

In [32]:

grid_search.fit(X_sample, y_sample)

print("Grid Search Complete")
print(f"Best Accuracy Score: {grid_search.best_score_:.2%}")
print("Best Parameters Found:")
print(grid_search.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Grid Search Complete
Best Accuracy Score: 88.94%
Best Parameters Found:
{'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 200}


In [33]:
grid_search.best_score_

np.float64(0.8894)

In [34]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier

Running RandomizedSearchCV on the FULL dataset

In [35]:
rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [10, 20, 30, 40, None],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4]
}

Set up the Shuffled Cross-Validation
 use 3 folds for speed

In [36]:
kfold = KFold(n_splits=3, shuffle=True, random_state=42)


random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=5,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

In [37]:
random_search.fit(X, y)
print("Random Search Complete")
print(f"Best Accuracy Score: {random_search.best_score_:.2%}")
print("Best Parameters Found:")
print(random_search.best_params_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Random Search Complete
Best Accuracy Score: 92.04%
Best Parameters Found:
{'n_estimators': 400, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 30}
