In [36]:
!pip install mamba-ssm



In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import random
import statistics

from pandas.plotting import scatter_matrix
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from scipy import stats
from statsmodels.discrete.discrete_model import Logit
from statsmodels.tools import add_constant
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from mamba_ssm import Mamba

In [38]:
### Define Global Variables ###
# These are plain module-level assignments: a `global` statement has no effect
# outside a function, so none is needed for the functions below to read them.

# string-valued identifier / label columns
object_cols = ['artist_name', 'track_id', 'track_name', 'key_notes', 'pop_cat']

# every numeric column, including the derived popularity columns
numeric_cols = ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
                'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence',
                'popularity', 'pop_frac', 'pop_bin']

# numeric columns that encode categories
categorical_cols = ['key', 'mode', 'time_signature']

# numeric_cols with the categorical columns removed
numeric_non_cat = ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness',
                   'loudness', 'speechiness', 'tempo', 'valence',
                   'popularity', 'pop_frac', 'pop_bin']

# columns that standardize_X() rescales to zero mean / unit standard deviation
cols_to_standardize = ['duration_ms', 'loudness', 'tempo']

In [39]:
### ALL FUNCTIONS DEFINED HERE ####
# load in main database of songs and attributes
def load_data(path=None):
    """Load the songs dataset from a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. When omitted, the module-level
        ``Dataset_path`` variable is used, so existing ``load_data()``
        calls keep working.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.

    Raises
    ------
    NameError
        If ``path`` is omitted and ``Dataset_path`` has not been defined yet.
    """
    if path is None:
        try:
            path = Dataset_path
        except NameError:
            raise NameError(
                "load_data() needs a `path` argument or a module-level "
                "`Dataset_path` variable"
            ) from None
    return pd.read_csv(path)

# set some display options so easier to view all columns at once
def set_view_options(max_cols=50, max_rows=50, max_colwidth=9, dis_width=250):
    """Configure pandas display limits so wide frames fit on screen.

    Sets the maximum number of displayed columns and rows, the maximum
    width of a single column, and the overall display width.
    """
    display_settings = (
        ("display.max_columns", max_cols),
        ("display.max_rows", max_rows),
        ("display.max_colwidth", max_colwidth),
        ("display.width", dis_width),
    )
    for option_name, option_value in display_settings:
        pd.set_option(option_name, option_value)

# allows for easier visualization of all columns at once in the terminal
def rename_columns(df):
    """Replace the column labels of ``df`` with short aliases, in place.

    The labels are assigned positionally, so ``df`` must have exactly 17
    columns. The same (mutated) DataFrame is returned.
    """
    short_labels = [
        'artist', 'trk_id', 'trk_name',
        'acous', 'dance', 'ms', 'energy', 'instr', 'key', 'live',
        'loud', 'mode', 'speech', 'tempo', 't_sig', 'val',
        'popularity',
    ]
    df.columns = short_labels
    return df

def get_df_info(df):
    """Print a quick textual overview of the songs DataFrame.

    Output, in order: the first rows, the column labels, the ``info()``
    report, the total null count, the mean popularity, the share of songs
    with popularity >= 50, and the artist count / per-artist song counts.
    """
    # first rows and column labels
    print(df.head())
    print("The columns are:")
    print(df.columns)

    # df.info() writes its own report and returns None, which is printed too
    print(df.info())

    # total number of missing cells across the whole frame
    print("Do we have any nulls?")
    null_total = df.isnull().sum().sum()
    print(f"Looks like we have {null_total} nulls")

    # average popularity score
    popularity = df['popularity']
    print(popularity.mean())

    # fraction of songs scoring 50 or more
    popular_count = popularity[popularity >= 50].count()
    print(popular_count / df.shape[0])

    # number of distinct artists, then songs per artist
    artists = df['artist_name']
    print(artists.unique().shape)
    print(artists.value_counts())

# nice way to truncate the column names to display easier
# can be used with various metrics
def describe_cols(df, L=10):
    '''Print ``df.describe()`` with column headers limited to ``L`` characters.

    Headers longer than ``L`` are cut and suffixed with ``...`` so that the
    shortened header is at most ``L`` characters long in total. The pandas
    ``display.max_colwidth`` option is set to ``L`` only while printing and
    is restored afterwards, even if an error is raised.
    '''
    def shorten(name):
        # leave short headers alone; otherwise reserve 3 characters for '...'
        if len(name) <= L:
            return name
        return name[:max(L - 3, 0)] + '...'

    # option_context restores the previous width on exit, including on error
    with pd.option_context("display.max_colwidth", L):
        print(df.rename(columns=shorten).describe())

# How many songs have a popularity score > 90??
# Let's list these songs
def most_popular_songs(df, threshold=90):
    """Print artist and popularity for songs scoring above ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``artist_name`` and ``popularity`` columns.
    threshold : number, optional
        Only rows with ``popularity`` strictly greater than this are listed
        (default 90).
    """
    is_hit = df['popularity'] > threshold
    print(df.loc[is_hit, ['artist_name', 'popularity']])

# plot a scatter plot
def scatter_plot(df, col_x, col_y):
    """Show a semi-transparent scatter plot of ``df[col_y]`` against ``df[col_x]``."""
    x_values = df[col_x]
    y_values = df[col_y]
    plt.scatter(x_values, y_values, alpha=0.2)
    plt.show()

def plot_scatter_matrix(df, num_rows):
    """Show a scatter matrix (KDE on the diagonal) for the first ``num_rows`` rows."""
    leading_rows = df[:num_rows]
    scatter_matrix(leading_rows, alpha=0.2, figsize=(6, 6), diagonal='kde')
    plt.show()

def calc_correlations(df, cutoff):
    """Print the pairwise correlations of ``df`` that exceed ``cutoff``.

    Only numeric (and boolean) columns take part; string columns such as
    artist or track names are skipped, because recent pandas versions raise
    when asked to correlate them. Entries at or below ``cutoff`` are shown
    as NaN.
    """
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    corr = numeric_df.corr()
    print(corr[corr > cutoff])

# get redundant pairs from DataFrame
def get_redundant_pairs(df):
    '''Return the diagonal and lower-triangle column pairs of a correlation matrix.

    Every pair appears twice in a correlation matrix; the returned set of
    ``(column_i, column_j)`` tuples with ``j <= i`` is the half to discard.
    Pairs involving an object-dtype column are left out.
    '''
    columns = df.columns
    # a column can be correlated only when it is not object-typed
    eligible = [df[name].dtype != 'object' for name in columns]
    return {
        (columns[row], columns[col])
        for row in range(len(columns))
        for col in range(row + 1)
        if eligible[row] and eligible[col]
    }

# get top absolute correlations
def get_top_abs_correlations(df, n=10):
    """Print and return the ``n`` largest absolute pairwise correlations.

    Only numeric (and boolean) columns are correlated, so frames that also
    hold string columns do not raise. Self-pairs and mirrored duplicates are
    removed via ``get_redundant_pairs`` before ranking.

    Returns
    -------
    pandas.Series
        Absolute correlations indexed by ``(column_a, column_b)``, sorted in
        descending order and truncated to ``n`` entries.
    """
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    au_corr = numeric_df.corr().abs().unstack()

    # drop the diagonal and one copy of every mirrored pair
    labels_to_drop = list(get_redundant_pairs(numeric_df))
    if labels_to_drop:
        au_corr = au_corr.drop(labels=labels_to_drop)
    top_corr = au_corr.sort_values(ascending=False).iloc[:n]

    print("The top absolute correlations are:")
    print(top_corr)
    return top_corr

# initial linear regression function, and plots
def linear_regression_initial(df):
    """Fit an OLS model of popularity on the audio features and plot diagnostics.

    A random 75% of the rows is used for fitting. The model summary is
    printed, followed by a QQ plot of the Pearson residuals and a scatter
    plot of those residuals against the fitted values.
    """
    feature_names = ['acousticness', 'danceability', 'duration_ms', 'energy',
                     'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
                     'speechiness', 'tempo', 'time_signature', 'valence']
    target_name = ['popularity']

    data = df.copy()
    features = data[feature_names]
    target = data[target_name]

    # only the training portion is used below; the held-out quarter is discarded
    train_features, _, train_target, _ = train_test_split(features, target, test_size=0.25)

    # add the intercept column expected by statsmodels
    design = sm.add_constant(train_features)

    # fit the model and collect residual diagnostics
    ols_fit = sm.OLS(train_target, design).fit()
    studentized = ols_fit.resid_pearson
    diagnostics = pd.DataFrame({
        'residuals': ols_fit.resid,
        'fitted_vals': ols_fit.predict(design),
        'stu_resid': studentized,
    })

    print(ols_fit.summary())

    # QQ plot of the residuals
    fig, ax = plt.subplots(figsize=(8, 5))
    plt.title("QQ Plot - Initial Linear Regression")
    sm.qqplot(studentized, line='45', fit=True, ax=ax)
    plt.show()

    # residuals against fitted values
    diagnostics.plot(kind='scatter', x='fitted_vals', y='stu_resid')
    plt.show()

# print count of all zeros within the dataset
def get_zeros(df):
    """Print how many rows have a popularity score of exactly zero."""
    is_zero = df['popularity'] == 0
    zero_count = df.loc[is_zero, 'popularity'].count()
    print(zero_count)

# plot the distribution of popularity scores
def plot_pop_dist(df):
    """Show the distribution of popularity scores rescaled to the 0-1 range."""
    sns.set_palette('muted')

    # new figure with a single axes, which becomes the current axes
    figure = plt.figure(figsize=(8,5))
    figure.add_subplot(111)

    scaled_popularity = df['popularity']/100
    dist_axes = sns.distplot(scaled_popularity, color='g', label="Popularity")
    dist_axes.set_title("Distribution of Popularity Scores - Entire Data Set")

    # axis labels
    plt.xlabel("Popularity")
    plt.ylabel("Density")

    plt.show()

# plot undersampling methodology
def undersample_plot(df):
    """Show an annotated popularity distribution illustrating the undersampling scheme.

    The area under the density curve to the right of 0.55 is shaded, and
    three text boxes describe the sampling steps.
    """
    sns.set_palette('muted')

    # density of the rescaled popularity scores
    fig = plt.figure(figsize=(8,5))
    ax = fig.add_subplot(111)
    dist_axes = sns.distplot(df['popularity']/100, color='g', label="Popularity")
    dist_axes.set_title("Illustration of Undersampling from Data Set")

    # shade under the most recently drawn line, to the right of 0.55
    curve_x, curve_y = ax.get_lines()[-1].get_data()
    right_of_cutoff = curve_x > 0.55
    ax.fill_between(curve_x[right_of_cutoff], y1=curve_y[right_of_cutoff],
                    alpha=0.5, facecolor='red')

    # both arrows share the same styling
    arrow_style = dict(
        facecolor="black", width=2,
        headwidth=4,connectionstyle='arc3,rad=0')

    # (text, arrow tip, text position, box colour) for the two arrowed labels
    callouts = (
        ("First, sample all songs in this range.\n Sample size is n. Cutoff is 0.5.",
         (0.6, 0.2), (0.5, 4), 'red'),
        ("Next, randomly sample \n n songs in this range",
         (0.2, 0.2), (0.1, 3), 'g'),
    )
    for text, arrow_tip, text_position, box_colour in callouts:
        plt.annotate(
            text, xy=arrow_tip,
            xytext=text_position,
            bbox=dict(boxstyle='round,pad=0.5', fc=box_colour, alpha=0.5),
            arrowprops=dict(arrow_style))

    # closing remark, drawn without an arrow
    plt.annotate(
        "Therefore, end up with a 50/50 \n split of Popular / Not Popular\n songs", xy=(0.6, 2),
        xytext=(0.62, 2),
        bbox=dict(boxstyle='round,pad=0.5', fc='b', alpha=0.5))

    # axis labels
    plt.xlabel("Popularity")
    plt.ylabel("Density")

    plt.show()

# calculate and print more stats from the df
def get_stats(df):
    """Print row, song, artist and popularity summary statistics for ``df``."""
    print(f"There are {df.shape[0]} rows")
    print(f"There are {df['track_id'].unique().shape} unique songs")
    print(f"There are {df['artist_name'].unique().shape} unique artists")

    popularity = df['popularity']

    def count_above(threshold):
        # number of songs scoring strictly above the threshold
        return popularity[popularity > threshold].count()

    print(f"There are {popularity.unique().shape} popularity scores")
    print(f"The mean popularity score is {popularity.mean()}")
    print(f"There are {count_above(55)} songs with a popularity score > 55")
    print(f"There are {count_above(75)} songs with a popularity score > 75")

    share_above_80 = (count_above(80) / df.shape[0])*100
    print(f"Only {share_above_80:.2f} % of songs have a popularity score > 80")

# plot univariate dists for several independent variables
def plot_univ_dists(df, cutoff):
    """Compare feature distributions of songs below and above ``cutoff``.

    Prints the mean danceability of both groups, then shows one figure each
    for danceability, valence and acousticness with the two groups overlaid.
    """
    below_cutoff = df[df['popularity'] < cutoff]
    above_cutoff = df[df['popularity'] > cutoff]

    print('Mean value for Danceability feature for Popular songs: {}'.format(above_cutoff['danceability'].mean()))
    print('Mean value for Danceability feature for Unpopular songs: {}'.format(below_cutoff['danceability'].mean()))

    for position, feature in enumerate(('danceability', 'valence', 'acousticness')):
        fig, _ = plt.subplots(1, 1, figsize=(8,5))
        # only the first figure carries the overall title
        if position == 0:
            fig.suptitle('Histograms and Univariate Distributions of Important Features')
        sns.distplot(below_cutoff[feature])
        sns.distplot(above_cutoff[feature])
        plt.show()

# plot violin plot for several independent variables
def plot_violin(df, cutoff):
    """Show violin plots of six audio features split by popularity.

    Songs with ``popularity > cutoff`` are labelled "Popular", all others
    "Not_Popular". Two figures with three panels each are shown, and the
    figure titles report the ``cutoff`` that was actually used.
    """
    df = df.copy()

    sns.set(style="whitegrid")
    df['pop_bin'] = np.where(df['popularity'] > cutoff, "Popular", "Not_Popular")

    title = f'Distributions of Selected Features at Popularity Score Cutoff of {cutoff}'
    feature_groups = (
        ('danceability', 'valence', 'acousticness'),
        ('energy', 'instrumentalness', 'liveness'),
    )

    # one figure per group, one panel per feature
    for features in feature_groups:
        fig, axes = plt.subplots(1, 3, sharey=True, figsize=(12,4))
        fig.suptitle(title)
        for panel, feature in zip(axes, features):
            sns.violinplot(x=df['pop_bin'], y=df[feature], ax=panel)
        plt.show()

# plot pairplot for subsection of df rows and columns
def plot_pairplot(df, rows, cutoff):
    """Show a seaborn pairplot of selected features, coloured by popularity class.

    Parameters
    ----------
    df : pandas.DataFrame
        Songs data containing the plotted feature columns and ``popularity``.
    rows : index label
        Rows up to and including this index label are plotted (``.loc``
        slicing is label-based and inclusive).
    cutoff : number
        Songs with ``popularity > cutoff`` are labelled "Popular".
    """
    # note: this renders much better in jupyter
    df = df.copy()

    df['pop_bin'] = np.where(df['popularity'] > cutoff, "Popular", "Not_Popular")

    cols_for_pp = ['danceability', 'energy', 'instrumentalness',
                   'loudness', 'valence', 'popularity', 'pop_bin']

    # `height` is the current name of the per-facet size argument (formerly `size`)
    sns.pairplot(df.loc[:rows, cols_for_pp], hue='pop_bin', height=2)

    plt.show()

# plot the key counts for popular and unpopular songs
def plot_keys(df, cutoff):
    """Show count plots of musical keys for popular and unpopular songs.

    Popular songs have ``popularity > cutoff`` and unpopular songs have
    ``popularity < cutoff``; songs exactly at the cutoff appear in neither
    plot. Bars are ordered by descending count.
    """
    key_mapping = {0.0: 'C', 1.0: 'C♯,D♭', 2.0: 'D', 3.0: 'D♯,E♭', 4.0: 'E', 5.0:
                  'F', 6.0: 'F♯,G♭', 7.0: 'G', 8.0: 'G♯,A♭', 9.0: 'A', 10.0: 'A♯,B♭',
                  11.0: 'B'}

    groups = (
        (df[df['popularity'] > cutoff].copy(), "Key Totals for Popular Songs"),
        # use the caller's cutoff here as well, not a fixed 55
        (df[df['popularity'] < cutoff].copy(), "Key Totals for Unpopular Songs"),
    )

    for subset, title in groups:
        fig, ax = plt.subplots(1, 1, sharey=True, figsize=(8,5))
        subset['key_val'] = subset['key'].map(key_mapping)
        sns.countplot(x='key_val', data=subset,
                      order=subset['key_val'].value_counts().index,
                      palette='muted', ax=ax)
        plt.title(title)
        plt.show()

# plot a heatmap of the correlations between features as well as dependent variable
def plot_heatmap(df):
    """Show an annotated heatmap of the correlations between numeric columns.

    String columns are excluded before correlating, because recent pandas
    versions raise when asked to correlate them.
    """
    # note this looks better in jupyter as well
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    plt.figure(figsize = (16,6))
    sns.heatmap(numeric_df.corr(), cmap="coolwarm", annot=True)
    plt.show()

# check that the differences in means between popular and unpopular songs are significant for selected features
def calc_ANOVA(df, cutoff, features=('danceability', 'loudness', 'valence', 'instrumentalness')):
    """Print group means and a one-way ANOVA p-value for each feature.

    Songs are split into popular (``popularity > cutoff``) and unpopular
    (``popularity < cutoff``) groups. For every feature a header line, the
    popular mean, the unpopular mean and the ANOVA p-value are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``popularity`` and every column named in ``features``.
    cutoff : number
        Popularity threshold separating the two groups.
    features : iterable of str, optional
        Columns to test; defaults to the four features analysed originally.
    """
    df_popular = df[df['popularity'] > cutoff]
    df_unpopular = df[df['popularity'] < cutoff]

    for feature in features:
        label = feature.capitalize()
        popular_values = df_popular[feature]
        unpopular_values = df_unpopular[feature]

        print("Popular and Unpopular {} Means:".format(label))
        print(popular_values.mean())
        print(unpopular_values.mean())

        f_val, p_val = stats.f_oneway(popular_values, unpopular_values)
        print("{} One-way ANOVA P ={}".format(label, p_val))

# randomly sample data below cutoff after choosing a cutoff so have a 50/50 split
# of popular/unpopular target variable values.
def random_under_sampler(df, cutoff, rand=None):
    """Build a balanced popular / unpopular sample by undersampling.

    All rows with ``popularity > cutoff`` are kept and labelled "Popular".
    An equal number of the remaining rows (labelled "Not_Popular") is drawn
    at random without replacement, so no song is picked twice and no popular
    row is overwritten. If there are fewer unpopular than popular rows, all
    unpopular rows are used.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``popularity`` and ``danceability`` columns.
    cutoff : number
        Popularity threshold separating the two classes.
    rand : int or None, optional
        Seed passed to the random draw; ``None`` gives a different draw on
        each call.

    Returns
    -------
    pandas.DataFrame
        The popular rows followed by the sampled unpopular rows, keeping
        their original index labels, with an added ``pop_bin`` column.
    """
    df_original = df.copy()
    df_original['pop_bin'] = np.where(df_original['popularity'] > cutoff, "Popular", "Not_Popular")

    is_popular = df_original['pop_bin'] == "Popular"
    df_popular = df_original[is_popular]
    df_unpopular = df_original[~is_popular]

    # draw as many unpopular rows as there are popular ones (or all of them)
    n_to_draw = min(len(df_popular), len(df_unpopular))
    if n_to_draw > 0:
        df_drawn = df_unpopular.sample(n=n_to_draw, random_state=rand)
    else:
        df_drawn = df_unpopular.iloc[0:0]

    df_samples_added = pd.concat([df_popular, df_drawn])

    popular_dance = df_samples_added.loc[df_samples_added['pop_bin'] == 'Popular', 'danceability']
    unpopular_dance = df_samples_added.loc[df_samples_added['pop_bin'] == 'Not_Popular', 'danceability']

    # print some stats on the undersampled df
    print("Size checks for new df:")
    print("Shape of new undersampled df: {}".format(df_samples_added.shape))
    print(df_samples_added['pop_bin'].value_counts())
    print(popular_dance.mean())
    print(unpopular_dance.mean())
    print(popular_dance.count())
    print(unpopular_dance.count())

    # the ANOVA is only defined when both groups contain data
    if len(popular_dance) > 0 and len(unpopular_dance) > 0:
        f_val, p_val = stats.f_oneway(popular_dance, unpopular_dance)
    else:
        p_val = float('nan')

    print("One-way ANOVA P ={}".format(p_val))

    return df_samples_added

# plot histograms of metrics for popular and unpopular songs
def plot_hist(sampled_df):
    """Show per-column histograms, first for popular songs and then for the rest.

    ``sampled_df`` must carry a ``pop_bin`` column holding the string
    "Popular" for popular songs.
    """
    popular_rows = sampled_df[sampled_df['pop_bin'] == "Popular"]
    popular_rows.hist(figsize=(8, 8))
    plt.show()

    remaining_rows = sampled_df[sampled_df['pop_bin'] != "Popular"]
    remaining_rows.hist(figsize=(8, 8))
    plt.show()

# return records that contain strings of artist and track names
def search_artist_track_name(df, artist, track, regex=True):
    """Print the rows whose artist and track names contain the given text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``artist_name`` and ``track_name`` columns.
    artist, track : str
        Text to look for in the artist name and the track name.
    regex : bool, optional
        When True (default) both patterns are regular expressions. Pass
        False to match literally, e.g. for artists such as "A$AP Rocky"
        whose name contains the regex metacharacter ``$``.
    """
    # this displays much better in jupyter
    # na=False treats missing names as non-matching
    artist_match = df['artist_name'].str.contains(artist, regex=regex, na=False)
    track_match = df['track_name'].str.contains(track, regex=regex, na=False)
    print(df[artist_match & track_match])

# add important columns to dataframe
def add_cols(df, cutoff=55):
    """Return a copy of ``df`` with key-name and popularity helper columns.

    Added columns: ``key_notes`` (note name for the numeric key),
    ``pop_frac`` (popularity divided by 100), ``pop_cat`` ("Popular" /
    "Not_Popular") and ``pop_bin`` (1 / 0). A song is popular when its
    popularity is strictly greater than ``cutoff``.
    """
    # note names in pitch-class order; the position is the numeric key value
    note_names = ('C', 'C♯,D♭', 'D', 'D♯,E♭',
                  'E', 'F', 'F♯,G♭', 'G',
                  'G♯,A♭', 'A', 'A♯,B♭', 'B')
    key_mapping = {float(number): name for number, name in enumerate(note_names)}

    enriched = df.copy()
    enriched['key_notes'] = enriched['key'].map(key_mapping)

    # popularity-derived columns
    enriched['pop_frac'] = enriched['popularity'] / 100
    is_popular = enriched['popularity'] > cutoff
    enriched['pop_cat'] = np.where(is_popular, "Popular", "Not_Popular")
    enriched['pop_bin'] = np.where(is_popular, 1, 0)

    return enriched

def return_X_y_logistic_more_cols(df):
    """Split ``df`` into features ``X`` and the binary target ``y``.

    ``X`` holds the three identifier columns followed by the thirteen audio
    features; ``y`` is a one-column DataFrame with the 1/0 ``pop_bin`` label.
    """
    identifier_cols = ['artist_name', 'track_id', 'track_name']
    audio_cols = ['acousticness', 'danceability', 'duration_ms', 'energy',
                  'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
                  'speechiness', 'tempo', 'time_signature', 'valence']

    frame = df.copy()

    # 1's and 0's are used as the logistic target
    features = frame[identifier_cols + audio_cols]
    target = frame[['pop_bin']]

    return features, target

def return_X_y_mamba_more_cols(df):
    """Split ``df`` into features ``X`` and the raw popularity target ``y``.

    ``X`` holds the three identifier columns followed by the thirteen audio
    features; ``y`` is a one-column DataFrame with the ``popularity`` score.
    """
    identifier_cols = ['artist_name', 'track_id', 'track_name']
    audio_cols = ['acousticness', 'danceability', 'duration_ms', 'energy',
                  'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
                  'speechiness', 'tempo', 'time_signature', 'valence']

    frame = df.copy()

    features = frame[identifier_cols + audio_cols]
    target = frame[['popularity']]

    return features, target

# choose cutoff, sample popular data, randomly sample unpopular data, and combine the dfs
def split_sample_combine(df, cutoff=55, col='popularity', rand=None):
    """Build a shuffled, balanced frame of popular and unpopular songs.

    Rows with ``df[col] > cutoff`` are all kept. From the rows with
    ``df[col] < cutoff`` the same number is drawn at random without
    replacement (all of them if there are fewer unpopular than popular
    rows). Rows exactly equal to ``cutoff`` are in neither group. The
    combined frame is shuffled and gets ``key_notes``, ``pop_frac``,
    ``pop_cat`` and ``pop_bin`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col``, ``key`` and ``popularity`` columns.
    cutoff : number, optional
        Threshold separating popular from unpopular rows (default 55).
    col : str, optional
        Column compared against ``cutoff`` (default ``'popularity'``).
    rand : int or None, optional
        Seed for both the random draw and the final shuffle.

    Returns
    -------
    pandas.DataFrame
        The balanced, shuffled frame with a fresh 0..n-1 index.
    """
    # split out popular rows above the popularity cutoff
    split_pop_df = df[df[col] > cutoff].copy()

    # get the leftover rows, the 'unpopular' songs
    df_leftover = df[df[col] < cutoff].copy()

    # draw exactly as many unpopular rows as there are popular ones; an integer
    # count avoids float-rounding surprises and the failure that a fractional
    # size >= 1 caused when popular rows outnumber the leftover rows
    n_to_add = min(len(split_pop_df), len(df_leftover))
    if n_to_add > 0:
        unpop_df_to_add = df_leftover.sample(n=n_to_add, random_state=rand)
    else:
        unpop_df_to_add = df_leftover.iloc[0:0]

    # combine the dataframes; ssc stands for "split_sample_combine"
    ssc_df = pd.concat([split_pop_df, unpop_df_to_add], ignore_index=True)

    # shuffle the df
    ssc_df = ssc_df.sample(frac=1, random_state=rand).reset_index(drop=True)

    # add key_notes mapping key num vals to notes
    key_mapping = {0.0: 'C', 1.0: 'C♯,D♭', 2.0: 'D', 3.0: 'D♯,E♭',
                   4.0: 'E', 5.0: 'F', 6.0: 'F♯,G♭', 7.0: 'G',
                   8.0: 'G♯,A♭', 9.0: 'A', 10.0: 'A♯,B♭', 11.0: 'B'}
    ssc_df['key_notes'] = ssc_df['key'].map(key_mapping)

    # add columns relating to popularity
    ssc_df['pop_frac'] = ssc_df['popularity'] / 100
    ssc_df['pop_cat'] = np.where(ssc_df['popularity'] > cutoff, "Popular", "Not_Popular")
    ssc_df['pop_bin'] = np.where(ssc_df['popularity'] > cutoff, 1, 0)

    return ssc_df

def standardize_X(X):
    """Return the model feature columns with the wide-range ones standardized.

    Each column listed in the module-level ``cols_to_standardize`` gets a
    ``<name>_std`` counterpart, computed as (value - mean) / std using the
    statistics of the frame passed in. The input is not modified.
    """
    scaled = X.copy()

    # only the columns not already between 0 and 1 are rescaled
    for name in cols_to_standardize:
        column = scaled[name]
        centered = column - column.mean()
        scaled[name + "_std"] = centered / column.std()

    model_cols = ['acousticness', 'danceability', 'duration_ms_std', 'energy',
                  'instrumentalness', 'key', 'liveness', 'loudness_std', 'mode',
                  'speechiness', 'tempo_std', 'time_signature', 'valence']

    # keep only the model features, in a fixed order
    return scaled[model_cols]

def evaluation_metric(y_test,y_hat):
    """Print MSE, RMSE, MAE and R2 of predictions ``y_hat`` against ``y_test``."""
    mse = mean_squared_error(y_test, y_hat)
    scores = (
        mse,
        mse**0.5,  # RMSE
        mean_absolute_error(y_test, y_hat),
        r2_score(y_test, y_hat),
    )
    print('MSE: %.4f, RMSE: %.4f, MAE: %.4f, R2: %.4f' % scores)

In [40]:
def logistic_regression_final(df, plot_the_roc=True):
    """Train and evaluate a logistic regression at a popularity cutoff of 80.

    The data is balanced with ``split_sample_combine``, split 80/20 into
    train and test sets, standardized, and fitted once. Accuracy, recall,
    precision and AUC are printed for both sets, and the ROC curves are
    plotted when ``plot_the_roc`` is true.

    Side effects: sets the module-level globals ``df_train_results_log80``,
    ``df_test_results_log80``, ``sanity_check``, ``conf_matrix_log80_train``,
    ``conf_matrix_log80_test``, ``final_coefs`` and ``final_intercept``.
    """
    global df_train_results_log80
    global df_test_results_log80
    global sanity_check
    global conf_matrix_log80_train
    global conf_matrix_log80_test
    global final_coefs
    global final_intercept

    df = df.copy()
    cutoff = 80

    X, y = return_X_y_logistic_more_cols(split_sample_combine(df, cutoff=cutoff, rand=2))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2)

    df_train_results_log80 = X_train.join(y_train)
    df_test_results_log80 = X_test.join(y_test)

    # standardize X_train and X_test
    # NOTE(review): each split is scaled with its own mean/std, so the test
    # set is not transformed with the training statistics - confirm intended.
    X_train = standardize_X(X_train).values
    X_test = standardize_X(X_test).values

    y_train = y_train.values.ravel()
    y_test = y_test.values.ravel()

    sanity_check = X_test

    # fit once; every prediction and coefficient below comes from this fit
    classifier = LogisticRegression(max_iter = 1000)
    classifier.fit(X_train, y_train)

    # class predictions and probability of the positive class, training set
    y_predict_train = classifier.predict(X_train)
    y_prob_P_train = classifier.predict_proba(X_train)[:, 1]

    # class predictions and probability of the positive class, test set
    y_predict_test = classifier.predict(X_test)
    y_prob_P_test = classifier.predict_proba(X_test)[:, 1]

    # calculate metrics needed to use for ROC curve below
    fpr_train, tpr_train, thresholds_train = metrics.roc_curve(y_train, y_prob_P_train, pos_label=1)
    auc_train = metrics.roc_auc_score(y_train, y_prob_P_train)  # scored on the training data

    fpr_test, tpr_test, thresholds_test = metrics.roc_curve(y_test, y_prob_P_test, pos_label=1)
    auc_test = metrics.roc_auc_score(y_test, y_prob_P_test)  # scored on the held-out test data

    # print some metrics
    print("Train accuracy: {:.2f}".format(accuracy_score(y_train, y_predict_train)))
    print("Test accuracy: {:.2f}".format(accuracy_score(y_test, y_predict_test)))

    print("Train recall: {:.2f}".format(recall_score(y_train, y_predict_train)))
    print("Test recall: {:.2f}".format(recall_score(y_test, y_predict_test)))

    print("Train precision: {:.2f}".format(precision_score(y_train, y_predict_train)))
    print("Test precision: {:.2f}".format(precision_score(y_test, y_predict_test)))

    print("Train auc: {:.2f}".format(auc_train))
    print("Test auc: {:.2f}".format(auc_test))

    conf_matrix_log80_train = confusion_matrix(y_train, y_predict_train)
    conf_matrix_log80_test = confusion_matrix(y_test, y_predict_test)

    final_coefs = classifier.coef_
    final_intercept = classifier.intercept_

    # attach the predicted probabilities so the metrics can be checked by hand
    df_train_results_log80 = df_train_results_log80.reset_index(drop=True)
    df_train_results_log80['pop_predict'] = y_prob_P_train

    df_test_results_log80 = df_test_results_log80.reset_index(drop=True)
    df_test_results_log80['pop_predict'] = y_prob_P_test

    df_train_results_log80['pop_predict_bin'] = np.where(df_train_results_log80['pop_predict'] >= 0.5, 1, 0)
    df_test_results_log80['pop_predict_bin'] = np.where(df_test_results_log80['pop_predict'] >= 0.5, 1, 0)

    if plot_the_roc:
        # Plot the ROC
        fig = plt.figure(figsize=(10, 8))
        ax = fig.add_subplot(111)
        ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
                label='Luck')
        ax.plot(fpr_train, tpr_train, color='b', lw=2, label='Model_Train')
        ax.plot(fpr_test, tpr_test, color='r', lw=2, label='Model_Test')
        ax.set_xlabel("False Positive Rate", fontsize=20)
        ax.set_ylabel("True Positive Rate", fontsize=20)
        ax.set_title("ROC curve - Cutoff: " + str(cutoff), fontsize=24)
        ax.text(0.05, 0.95, " ".join(["AUC_train:", str(auc_train.round(3))]), fontsize=20)
        ax.text(0.32, 0.7, " ".join(["AUC_test:", str(auc_test.round(3))]), fontsize=20)
        ax.legend(fontsize=24)
        plt.show()

In [41]:
def preprocess_data(df):
    """Make every column of ``df`` numeric, dropping those that cannot be.

    Columns that are already numeric are left untouched. Other columns are
    converted with ``pd.to_numeric``; a column whose conversion fails
    (``ValueError`` for unparsable text, ``TypeError`` for unsupported
    objects) is dropped. ``df`` is modified in place and also returned.
    """
    non_convertible = []
    for column in list(df.columns):
        if pd.api.types.is_numeric_dtype(df[column]):
            continue
        try:
            df[column] = pd.to_numeric(df[column])
        except (ValueError, TypeError):
            non_convertible.append(column)

    # drop after the loop so the frame is not reshaped while it is scanned
    if non_convertible:
        df.drop(columns=non_convertible, inplace=True)
    return df

class MambaModel(nn.Module):
    """A Mamba block followed by a linear head and a sigmoid.

    ``forward`` returns the sigmoid outputs flattened to one dimension, so
    every value lies between 0 and 1.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # the Mamba block works directly on the raw feature width
        mamba_config = dict(d_model=input_dim, d_state=16, d_conv=4, expand=2)
        self.mamba = Mamba(**mamba_config)
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        hidden = self.mamba(x)
        scores = torch.sigmoid(self.linear(hidden))
        return scores.flatten()

def PredictWithData(X_train, y_train, X_test, learning_rate=0.01, num_epochs=1000):
    """Train a ``MambaModel`` regressor and predict scores for ``X_test``.

    The model's sigmoid output is multiplied by 100 and fitted to
    ``y_train`` with a mean-squared-error loss. The loss is printed every
    100 epochs.

    Parameters
    ----------
    X_train, X_test : numpy.ndarray
        2-D feature arrays (rows are samples); each is fed to the model as a
        single batch of shape (1, n_samples, n_features).
    y_train : numpy.ndarray
        1-D array of training targets.
    learning_rate : float, optional
        Adam learning rate (default 0.01).
    num_epochs : int, optional
        Number of full-batch training steps (default 1000).

    Returns
    -------
    numpy.ndarray
        Integer predictions for ``X_test``: the model output times 100,
        rounded.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model = MambaModel(input_dim = X_train.shape[1], output_dim = 1).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
    criterion = nn.MSELoss() # For Regression

    # add a leading batch dimension: the whole data set is one sequence
    X_tr = torch.from_numpy(X_train).float().unsqueeze(0).to(device)
    X_te = torch.from_numpy(X_test).float().unsqueeze(0).to(device)
    y_tr = torch.from_numpy(y_train).float().to(device)

    for e in range(1, num_epochs+1):
        model.train()
        # shuffle samples and targets with the same permutation
        permutation = torch.randperm(X_tr.size(1))
        X_tr = X_tr[:,permutation,:]
        y_tr = y_tr[permutation]
        z = 100*model(X_tr) # For Regression
        loss = criterion(z, y_tr)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if e%100 == 0:
            print('Epoch %d | Lossp: %.4f' % (e, loss.item()))

    # inference needs no gradients, so skip building the autograd graph
    model.eval()
    with torch.no_grad():
        yhat = model(X_te).cpu().numpy().flatten()
    yhat = np.round(100*yhat)
    yhat = yhat.astype(int)
    return yhat

def ourModel(df):
    """Train the Mamba regressor and report its popular / unpopular accuracy.

    The cutoff is the median popularity of ``df``. Predicted and true scores
    of the test split are binarized with ``>= cutoff``; accuracy, recall and
    precision are printed and the binarized predictions are returned.
    """
    # cutoff = median popularity, taken from a numeric-only copy of the data
    numeric_copy = preprocess_data(df.copy())
    cutoff = statistics.median(numeric_copy['popularity'])
    print("cutoff:", cutoff)

    # balance the classes, then split into train and test sets
    balanced = split_sample_combine(df.copy(), cutoff=cutoff, rand=2)
    features, target = return_X_y_mamba_more_cols(balanced)
    train_X, test_X, train_y, test_y = train_test_split(
        features, target, test_size=0.20, random_state=2)

    # standardize both splits and convert everything to plain arrays
    train_X = standardize_X(train_X).values
    test_X = standardize_X(test_X).values
    train_y = train_y.values.ravel()
    test_y = test_y.values.ravel()

    # train and predict
    y_pred = PredictWithData(train_X, train_y, test_X)

    # binarize predictions and ground truth at the cutoff
    y_pred_bin = (y_pred >= cutoff).astype(int)
    y_test_bin = (test_y >= cutoff).astype(int)

    # compute all metrics first, then print them
    reports = (
        ("Accuracy: {:.2f}", accuracy_score(y_test_bin, y_pred_bin)),
        ("Recall: {:.2f}", recall_score(y_test_bin, y_pred_bin)),
        ("Precision: {:.2f}", precision_score(y_test_bin, y_pred_bin)),
    )
    for template, value in reports:
        print(template.format(value))

    return y_pred_bin

In [42]:
if __name__ == "__main__":
    # Load data
    # Dataset_path is a module-level name that load_data() reads to find the CSV
    Dataset_path = '/kaggle/input/spotifydataset/SpotifyDataset.csv'
    df = load_data()

    # train and evaluate the Mamba model; prints the cutoff, the training loss
    # every 100 epochs, and the accuracy / recall / precision on the test split
    ourModel(df)
    #logistic_regression_final(df, plot_the_roc=False)

cutoff: 67
Epoch 100 | Lossp: 113.5432
Epoch 200 | Lossp: 109.2435
Epoch 300 | Lossp: 104.5987
Epoch 400 | Lossp: 102.5124
Epoch 500 | Lossp: 100.3509
Epoch 600 | Lossp: 96.8474
Epoch 700 | Lossp: 94.2266
Epoch 800 | Lossp: 90.9605
Epoch 900 | Lossp: 89.8394
Epoch 1000 | Lossp: 87.8450
Accuracy: 0.63
Recall: 0.62
Precision: 0.65
