In [1990]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer

# Upload Data
Downloaded Spotify Data from Kaggle to a CSV of 520 entries
. Then, uploaded CSV

In [1991]:
file_path = Path("../Spotify/Resources/Spotify_data.csv")  # Update path if needed
df_spotify = pd.read_csv(file_path)

In [1992]:
# Review the DataFrame
print("First 5 rows of the dataset:")
df_spotify.head(10)

First 5 rows of the dataset:


Unnamed: 0,Age,Gender,spotify_usage_period,spotify_listening_device,spotify_subscription_plan,premium_sub_willingness,preffered_premium_plan,preferred_listening_content,fav_music_genre,music_time_slot,music_Influencial_mood,music_lis_frequency,music_expl_method,music_recc_rating,pod_lis_frequency,fav_pod_genre,preffered_pod_format,pod_host_preference,preffered_pod_duration,pod_variety_satisfaction
0,20-35,Female,More than 2 years,Smart speakers or voice assistants,Free (ad-supported),Yes,Family Plan-Rs 179/month,Podcast,Melody,Night,Sadness or melancholy,leisure time,Playlists,3,Daily,Comedy,Interview,Both,Both,Ok
1,12-20,Male,More than 2 years,Computer or laptop,Free (ad-supported),Yes,Individual Plan- Rs 119/ month,Podcast,Rap,Afternoon,Social gatherings or parties,Workout session,Playlists,2,Several times a week,Comedy,Interview,Both,,Satisfied
2,35-60,Others,6 months to 1 year,Smart speakers or voice assistants,Free (ad-supported),Yes,Student Plan-Rs 59/month,Podcast,Pop,Night,Relaxation and stress relief,"Study Hours, While Traveling",Playlists,4,Once a week,Sports,Interview,,Both,Satisfied
3,20-35,Female,1 year to 2 years,"Smartphone, Smart speakers or voice assistants",Free (ad-supported),No,,Music,Melody,Night,"Relaxation and stress relief, Social gathering...","Office hours, Workout session, leisure time","recommendations, Playlists",4,Never,,,,,Ok
4,20-35,Female,1 year to 2 years,Smartphone,Free (ad-supported),No,,Music,Melody,Night,Relaxation and stress relief,leisure time,"recommendations, Playlists",4,Rarely,Lifestyle and Health,Story telling,Well known individuals,Both,Ok
5,20-35,Male,More than 2 years,Smartphone,Free (ad-supported),No,,Music,Pop,Night,Uplifting and motivational,Workout session,Others,3,Never,,,,,Ok
6,20-35,Female,1 year to 2 years,Smartphone,Free (ad-supported),No,,Music,Melody,Morning,Relaxation and stress relief,Office hours,recommendations,3,Never,,,,,Ok
7,20-35,Female,Less than 6 months,Smartphone,Free (ad-supported),No,,Music,Melody,Night,Social gatherings or parties,leisure time,recommendations,5,Several times a week,Lifestyle and Health,Conversational,Both,Longer,Satisfied
8,20-35,Female,Less than 6 months,Smartphone,Free (ad-supported),Yes,Individual Plan- Rs 119/ month,Music,Melody,Afternoon,"Relaxation and stress relief, Uplifting and mo...","While Traveling, leisure time","Playlists, Radio",4,Rarely,Comedy,Story telling,Well known individuals,Shorter,Satisfied
9,20-35,Female,More than 2 years,Smartphone,Free (ad-supported),No,,Music,Melody,Night,Relaxation and stress relief,"Office hours, While Traveling, Workout session",recommendations,4,Never,,,,,Ok


## ETL
### Transforming Data from Qualitative to Quantitative

### STEP 1: Remove Unrelavant Columns

In [1993]:
# Removed the following Columns that are irrelevant to the study or that go into specifics of podcasts: preffered_premium_plan, music_expl_method,"pod_lis_frequency","fav_pod_genre",
# "preffered_pod_format","pod_host_preference","preffered_pod_duration","pod_variety_satisfaction"
columns = [
    "Age",
    "Gender",
    "spotify_usage_period",
    "spotify_listening_device",
    "spotify_subscription_plan",
    "premium_sub_willingness",
    "preferred_listening_content",
    "fav_music_genre",
    "music_time_slot",
    "music_Influencial_mood",
    "music_lis_frequency",
    "music_expl_method",
    "music_recc_rating",
]
relevant_df = df_spotify[columns]
relevant_df 

Unnamed: 0,Age,Gender,spotify_usage_period,spotify_listening_device,spotify_subscription_plan,premium_sub_willingness,preferred_listening_content,fav_music_genre,music_time_slot,music_Influencial_mood,music_lis_frequency,music_expl_method,music_recc_rating
0,20-35,Female,More than 2 years,Smart speakers or voice assistants,Free (ad-supported),Yes,Podcast,Melody,Night,Sadness or melancholy,leisure time,Playlists,3
1,12-20,Male,More than 2 years,Computer or laptop,Free (ad-supported),Yes,Podcast,Rap,Afternoon,Social gatherings or parties,Workout session,Playlists,2
2,35-60,Others,6 months to 1 year,Smart speakers or voice assistants,Free (ad-supported),Yes,Podcast,Pop,Night,Relaxation and stress relief,"Study Hours, While Traveling",Playlists,4
3,20-35,Female,1 year to 2 years,"Smartphone, Smart speakers or voice assistants",Free (ad-supported),No,Music,Melody,Night,"Relaxation and stress relief, Social gathering...","Office hours, Workout session, leisure time","recommendations, Playlists",4
4,20-35,Female,1 year to 2 years,Smartphone,Free (ad-supported),No,Music,Melody,Night,Relaxation and stress relief,leisure time,"recommendations, Playlists",4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,20-35,Female,More than 2 years,"Smartphone, Computer or laptop, Wearable devices",Free (ad-supported),Yes,Music,Pop,Morning,"Uplifting and motivational, Sadness or melancholy",While Traveling,recommendations,5
516,20-35,Female,More than 2 years,"Smartphone, Computer or laptop",Free (ad-supported),Yes,Music,Pop,Night,"Relaxation and stress relief, Uplifting and mo...","While Traveling, Workout session, leisure time",Others,2
517,20-35,Female,More than 2 years,Smartphone,Free (ad-supported),No,Music,Pop,Night,Relaxation and stress relief,leisure time,recommendations,3
518,20-35,Female,6 months to 1 year,Smartphone,Premium (paid subscription),Yes,Music,Melody,Night,"Relaxation and stress relief, Uplifting and mo...","Workout session, leisure time","recommendations, Others",2


### STEP 2: Create a Scale for the spotify_usage_period column

In [1994]:
# Cleaning Favorite Music Genre Column
# Get the unique values of fav_music_genre
unique_periods = relevant_df['spotify_usage_period'].unique()
unique_periods

array(['More than 2 years', '6 months to 1 year', '1 year to 2 years',
       'Less than 6 months'], dtype=object)

In [1995]:
# Create a Scale for the spotify_usage_period column
relevant_df['spotify_usage_period'] = pd.Categorical(

relevant_df['spotify_usage_period'],
    categories=['Less than 6 months', '6 months to 1 year', '1 year to 2 years','More than 2 years' ],  # Specify order
    ordered=True
).codes
relevant_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_df['spotify_usage_period'] = pd.Categorical(


Unnamed: 0,Age,Gender,spotify_usage_period,spotify_listening_device,spotify_subscription_plan,premium_sub_willingness,preferred_listening_content,fav_music_genre,music_time_slot,music_Influencial_mood,music_lis_frequency,music_expl_method,music_recc_rating
0,20-35,Female,3,Smart speakers or voice assistants,Free (ad-supported),Yes,Podcast,Melody,Night,Sadness or melancholy,leisure time,Playlists,3
1,12-20,Male,3,Computer or laptop,Free (ad-supported),Yes,Podcast,Rap,Afternoon,Social gatherings or parties,Workout session,Playlists,2
2,35-60,Others,1,Smart speakers or voice assistants,Free (ad-supported),Yes,Podcast,Pop,Night,Relaxation and stress relief,"Study Hours, While Traveling",Playlists,4
3,20-35,Female,2,"Smartphone, Smart speakers or voice assistants",Free (ad-supported),No,Music,Melody,Night,"Relaxation and stress relief, Social gathering...","Office hours, Workout session, leisure time","recommendations, Playlists",4
4,20-35,Female,2,Smartphone,Free (ad-supported),No,Music,Melody,Night,Relaxation and stress relief,leisure time,"recommendations, Playlists",4


### STEP 3: Tackle the Listening Device Column (Multiple Cell Entries)

In [1996]:
# Tackle the Listening Device Column
#Create lists out of the cell data to handle multiple cell data 
mlb = MultiLabelBinarizer()
device_data = relevant_df['spotify_listening_device'].str.split(', ')
encoded_device = pd.DataFrame(mlb.fit_transform(device_data), columns=mlb.classes_)
index=relevant_df.index         # Keep original DataFrame index
device_data.head()

# Merge back into the DataFrame:
relevant_df = pd.concat([relevant_df, encoded_device], axis=1)
relevant_df.head(50) 

# Count the number of listening devices to create the device variety_column
relevant_df['number_of_devices'] = relevant_df['spotify_listening_device'].apply(
    lambda x: float(x.count(',') + 1)  # Count commas and add 1
)
#Delete the Original Spotify Listening Devie Column
relevant_df = relevant_df.drop('spotify_listening_device', axis=1)
# Verify the results
relevant_df.head(10)

Unnamed: 0,Age,Gender,spotify_usage_period,spotify_subscription_plan,premium_sub_willingness,preferred_listening_content,fav_music_genre,music_time_slot,music_Influencial_mood,music_lis_frequency,music_expl_method,music_recc_rating,Computer or laptop,Smart speakers or voice assistants,Smartphone,Wearable devices,number_of_devices
0,20-35,Female,3,Free (ad-supported),Yes,Podcast,Melody,Night,Sadness or melancholy,leisure time,Playlists,3,0,1,0,0,1.0
1,12-20,Male,3,Free (ad-supported),Yes,Podcast,Rap,Afternoon,Social gatherings or parties,Workout session,Playlists,2,1,0,0,0,1.0
2,35-60,Others,1,Free (ad-supported),Yes,Podcast,Pop,Night,Relaxation and stress relief,"Study Hours, While Traveling",Playlists,4,0,1,0,0,1.0
3,20-35,Female,2,Free (ad-supported),No,Music,Melody,Night,"Relaxation and stress relief, Social gathering...","Office hours, Workout session, leisure time","recommendations, Playlists",4,0,1,1,0,2.0
4,20-35,Female,2,Free (ad-supported),No,Music,Melody,Night,Relaxation and stress relief,leisure time,"recommendations, Playlists",4,0,0,1,0,1.0
5,20-35,Male,3,Free (ad-supported),No,Music,Pop,Night,Uplifting and motivational,Workout session,Others,3,0,0,1,0,1.0
6,20-35,Female,2,Free (ad-supported),No,Music,Melody,Morning,Relaxation and stress relief,Office hours,recommendations,3,0,0,1,0,1.0
7,20-35,Female,0,Free (ad-supported),No,Music,Melody,Night,Social gatherings or parties,leisure time,recommendations,5,0,0,1,0,1.0
8,20-35,Female,0,Free (ad-supported),Yes,Music,Melody,Afternoon,"Relaxation and stress relief, Uplifting and mo...","While Traveling, leisure time","Playlists, Radio",4,0,0,1,0,1.0
9,20-35,Female,3,Free (ad-supported),No,Music,Melody,Night,Relaxation and stress relief,"Office hours, While Traveling, Workout session",recommendations,4,0,0,1,0,1.0


### STEP 4 Tackle the Favorite Music Genre Column

In [1997]:
# Cleaning Favorite Music Genre Column
# Get the unique values of fav_music_genre
unique_genres = relevant_df['fav_music_genre'].unique()
unique_genres

array(['Melody', 'Rap', 'Pop', 'Classical & melody, dance', 'classical',
       'Rock', 'Old songs', 'Electronic/Dance', 'All', 'Kpop',
       'trending songs random'], dtype=object)

In [1998]:
# Tackle the 'Classical and melody' entry
# Tackle the 'Electronic/Dance' entry
# Tackle the 'All' entry

# Define the replacement rules
def clean_music_genre(genre):
    if isinstance(genre, str):  # Only process string entries (skip NaN/missing values)
        genre = genre.replace(' & m', ', M')
        genre = genre.replace('/D', ', d')
        
        # Standardize 'classical' to 'Classical'
        genre = genre.replace('classical', 'Classical')
        
        # Case-insensitive replacement for 'all' or 'All'
        if 'all' in genre.lower():
            genre = 'Electronic, dance, Kpop, Melody, Old songs, Pop, Rap, Rock, Classical, trending songs random'
    
    return genre

# Apply the cleaning function to the column
relevant_df['fav_music_genre'] = relevant_df['fav_music_genre'].apply(clean_music_genre)

# Show results
unique_genres = relevant_df['fav_music_genre'].unique()
unique_genres


array(['Melody', 'Rap', 'Pop', 'Classical, Melody, dance', 'Classical',
       'Rock', 'Old songs', 'Electronic, dance',
       'Electronic, dance, Kpop, Melody, Old songs, Pop, Rap, Rock, Classical, trending songs random',
       'Kpop', 'trending songs random'], dtype=object)

In [1999]:
#Create lists out of the fav_music_genre data to handle multiple cell data 
mlb = MultiLabelBinarizer()
fav_music_data = relevant_df['fav_music_genre'].str.split(', ')
fav_music_data = pd.DataFrame(mlb.fit_transform(fav_music_data), columns=mlb.classes_)
index=relevant_df.index         # Keep original DataFrame index
fav_music_data.head()

# Merge back into the DataFrame:
relevant_df = pd.concat([relevant_df, fav_music_data], axis=1)
relevant_df.head(50) 

# Count the number of favorite music data to create the fav_music_variety_column
relevant_df['fav_music_genre'] = relevant_df['fav_music_genre'].apply(
    lambda x: float(x.count(',') + 1)  # Count commas and add 1
)
# rename the fav_music_genre column to music_genre_variety
relevant_df = relevant_df.rename(columns={'fav_music_genre': 'music_genre_variety'})
pd.set_option('display.max_columns', None)
# Verify the results by displaying all columns
relevant_df.columns.tolist()

['Age',
 'Gender',
 'spotify_usage_period',
 'spotify_subscription_plan',
 'premium_sub_willingness',
 'preferred_listening_content',
 'music_genre_variety',
 'music_time_slot',
 'music_Influencial_mood',
 'music_lis_frequency',
 'music_expl_method',
 'music_recc_rating',
 'Computer or laptop',
 'Smart speakers or voice assistants',
 'Smartphone',
 'Wearable devices',
 'number_of_devices',
 'Classical',
 'Electronic',
 'Kpop',
 'Melody',
 'Old songs',
 'Pop',
 'Rap',
 'Rock',
 'dance',
 'trending songs random']

In [2000]:
relevant_df.head(30)

Unnamed: 0,Age,Gender,spotify_usage_period,spotify_subscription_plan,premium_sub_willingness,preferred_listening_content,music_genre_variety,music_time_slot,music_Influencial_mood,music_lis_frequency,music_expl_method,music_recc_rating,Computer or laptop,Smart speakers or voice assistants,Smartphone,Wearable devices,number_of_devices,Classical,Electronic,Kpop,Melody,Old songs,Pop,Rap,Rock,dance,trending songs random
0,20-35,Female,3,Free (ad-supported),Yes,Podcast,1.0,Night,Sadness or melancholy,leisure time,Playlists,3,0,1,0,0,1.0,0,0,0,1,0,0,0,0,0,0
1,12-20,Male,3,Free (ad-supported),Yes,Podcast,1.0,Afternoon,Social gatherings or parties,Workout session,Playlists,2,1,0,0,0,1.0,0,0,0,0,0,0,1,0,0,0
2,35-60,Others,1,Free (ad-supported),Yes,Podcast,1.0,Night,Relaxation and stress relief,"Study Hours, While Traveling",Playlists,4,0,1,0,0,1.0,0,0,0,0,0,1,0,0,0,0
3,20-35,Female,2,Free (ad-supported),No,Music,1.0,Night,"Relaxation and stress relief, Social gathering...","Office hours, Workout session, leisure time","recommendations, Playlists",4,0,1,1,0,2.0,0,0,0,1,0,0,0,0,0,0
4,20-35,Female,2,Free (ad-supported),No,Music,1.0,Night,Relaxation and stress relief,leisure time,"recommendations, Playlists",4,0,0,1,0,1.0,0,0,0,1,0,0,0,0,0,0
5,20-35,Male,3,Free (ad-supported),No,Music,1.0,Night,Uplifting and motivational,Workout session,Others,3,0,0,1,0,1.0,0,0,0,0,0,1,0,0,0,0
6,20-35,Female,2,Free (ad-supported),No,Music,1.0,Morning,Relaxation and stress relief,Office hours,recommendations,3,0,0,1,0,1.0,0,0,0,1,0,0,0,0,0,0
7,20-35,Female,0,Free (ad-supported),No,Music,1.0,Night,Social gatherings or parties,leisure time,recommendations,5,0,0,1,0,1.0,0,0,0,1,0,0,0,0,0,0
8,20-35,Female,0,Free (ad-supported),Yes,Music,1.0,Afternoon,"Relaxation and stress relief, Uplifting and mo...","While Traveling, leisure time","Playlists, Radio",4,0,0,1,0,1.0,0,0,0,1,0,0,0,0,0,0
9,20-35,Female,3,Free (ad-supported),No,Music,1.0,Night,Relaxation and stress relief,"Office hours, While Traveling, Workout session",recommendations,4,0,0,1,0,1.0,0,0,0,1,0,0,0,0,0,0


### STEP 5 :  Enumerate the music_Influencial_mood column and create a variety column

In [2001]:
# Cleaning, checking for unique values of music_Influencial_mood	
unique_moods = relevant_df['music_Influencial_mood'].unique()
unique_moods

array(['Sadness or melancholy', 'Social gatherings or parties',
       'Relaxation and stress relief',
       'Relaxation and stress relief, Social gatherings or parties',
       'Uplifting and motivational',
       'Relaxation and stress relief, Uplifting and motivational',
       'Relaxation and stress relief, Uplifting and motivational, Sadness or melancholy, Social gatherings or parties',
       'Relaxation and stress relief, Sadness or melancholy',
       'Relaxation and stress relief, Uplifting and motivational, Social gatherings or parties',
       'Relaxation and stress relief, Uplifting and motivational, Sadness or melancholy',
       'Uplifting and motivational, Sadness or melancholy',
       'Relaxation and stress relief, Sadness or melancholy, Social gatherings or parties',
       'Sadness or melancholy, Social gatherings or parties',
       'Uplifting and motivational, Sadness or melancholy, Social gatherings or parties',
       'Uplifting and motivational, Social gatherin

In [2002]:
#Create lists out of the music_Influencial_mood' data to handle multiple cell data 
mlb = MultiLabelBinarizer()
fav_music_data = relevant_df['music_Influencial_mood'].str.split(', ')
fav_music_data = pd.DataFrame(mlb.fit_transform(fav_music_data), columns=mlb.classes_)
index=relevant_df.index         # Keep original DataFrame index
fav_music_data.head()

# Merge back into the DataFrame:
relevant_df = pd.concat([relevant_df, fav_music_data], axis=1)
relevant_df.head(50) 

# Count the number of influential data to create the music mood variety column
relevant_df['music_Influencial_mood'] = relevant_df['music_Influencial_mood'].apply(
    lambda x: float(x.count(',') + 1)  # Count commas and add 1
)
# rename the music_Influencial_mood'column to music_genre_variety
relevant_df = relevant_df.rename(columns={'music_Influencial_mood': 'music_mood_variety'})

#Display all the rows
pd.set_option('display.max_columns', None)

# Verify the results by displaying all columns
relevant_df 

Unnamed: 0,Age,Gender,spotify_usage_period,spotify_subscription_plan,premium_sub_willingness,preferred_listening_content,music_genre_variety,music_time_slot,music_mood_variety,music_lis_frequency,music_expl_method,music_recc_rating,Computer or laptop,Smart speakers or voice assistants,Smartphone,Wearable devices,number_of_devices,Classical,Electronic,Kpop,Melody,Old songs,Pop,Rap,Rock,dance,trending songs random,Relaxation and stress relief,Sadness or melancholy,Social gatherings or parties,Uplifting and motivational
0,20-35,Female,3,Free (ad-supported),Yes,Podcast,1.0,Night,1.0,leisure time,Playlists,3,0,1,0,0,1.0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
1,12-20,Male,3,Free (ad-supported),Yes,Podcast,1.0,Afternoon,1.0,Workout session,Playlists,2,1,0,0,0,1.0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
2,35-60,Others,1,Free (ad-supported),Yes,Podcast,1.0,Night,1.0,"Study Hours, While Traveling",Playlists,4,0,1,0,0,1.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
3,20-35,Female,2,Free (ad-supported),No,Music,1.0,Night,2.0,"Office hours, Workout session, leisure time","recommendations, Playlists",4,0,1,1,0,2.0,0,0,0,1,0,0,0,0,0,0,1,0,1,0
4,20-35,Female,2,Free (ad-supported),No,Music,1.0,Night,1.0,leisure time,"recommendations, Playlists",4,0,0,1,0,1.0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,20-35,Female,3,Free (ad-supported),Yes,Music,1.0,Morning,2.0,While Traveling,recommendations,5,1,0,1,1,3.0,0,0,0,0,0,1,0,0,0,0,0,1,0,1
516,20-35,Female,3,Free (ad-supported),Yes,Music,1.0,Night,4.0,"While Traveling, Workout session, leisure time",Others,2,1,0,1,0,2.0,0,0,0,0,0,1,0,0,0,0,1,1,1,1
517,20-35,Female,3,Free (ad-supported),No,Music,1.0,Night,1.0,leisure time,recommendations,3,0,0,1,0,1.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
518,20-35,Female,1,Premium (paid subscription),Yes,Music,1.0,Night,2.0,"Workout session, leisure time","recommendations, Others",2,0,0,1,0,1.0,0,0,0,1,0,0,0,0,0,0,1,0,0,1


### STEP 6 :  Enumerate the music_lis_frequency column and create a variety column

In [2003]:
# Cleaning, checking for unique values of music_lis_frequency	
unique_lis_frequency = relevant_df['music_lis_frequency'].unique()
unique_lis_frequency

array(['leisure time', 'Workout session', 'Study Hours, While Traveling',
       'Office hours, Workout session, leisure time', 'Office hours',
       'While Traveling, leisure time',
       'Office hours, While Traveling, Workout session',
       'Office hours, While Traveling',
       'Office hours, While Traveling, leisure time',
       'Social gatherings ', 'While Traveling',
       'While Traveling, Workout session, leisure time', 'Study Hours',
       'Study Hours, leisure time', 'While Traveling, Workout session',
       'Study Hours, Workout session, leisure time',
       'Office hours, While Traveling, ', 'Workout session, leisure time',
       'While Traveling, Workout session, leisure time, Night time, when cooking',
       'Office hours, While Traveling, Workout session, leisure time',
       'Study Hours, While Traveling, Workout session, leisure time',
       'Office hours, Study Hours, While Traveling, Workout session, leisure time',
       'Office hours, leisure time', 

In [2004]:
# Tackle this error Office hours,Study Hours - change it to this: Office hours, Study Hours
# Remove comma after the last leisure time entry
# remove comma after While Traveling,␣' 

# Define the replacement rules
def clean_music_frequency(frequency):
    if isinstance(frequency, str):  # Only process string entries (skip NaN/missing values)
        frequency = frequency.replace('hours,Study', 'hours, Study')
        frequency = frequency.replace('hours,Study', 'hours, Study')

    return frequency

# Apply the function to the column (corrected function name)
relevant_df['music_lis_frequency'] = relevant_df['music_lis_frequency'].apply(clean_music_frequency)

# Check unique values to verify changes
unique_lis_frequency = relevant_df['music_lis_frequency'].unique()
print(unique_lis_frequency)


['leisure time' 'Workout session' 'Study Hours, While Traveling'
 'Office hours, Workout session, leisure time' 'Office hours'
 'While Traveling, leisure time'
 'Office hours, While Traveling, Workout session'
 'Office hours, While Traveling'
 'Office hours, While Traveling, leisure time' 'Social gatherings '
 'While Traveling' 'While Traveling, Workout session, leisure time'
 'Study Hours' 'Study Hours, leisure time'
 'While Traveling, Workout session'
 'Study Hours, Workout session, leisure time'
 'Office hours, While Traveling, ' 'Workout session, leisure time'
 'While Traveling, Workout session, leisure time, Night time, when cooking'
 'Office hours, While Traveling, Workout session, leisure time'
 'Study Hours, While Traveling, Workout session, leisure time'
 'Office hours, Study Hours, While Traveling, Workout session, leisure time'
 'Office hours, leisure time' 'Study Hours, Workout session'
 'Office hours, Study Hours, While Traveling, leisure time'
 'Study Hours, While Traveli

In [2005]:
relevant_df

Unnamed: 0,Age,Gender,spotify_usage_period,spotify_subscription_plan,premium_sub_willingness,preferred_listening_content,music_genre_variety,music_time_slot,music_mood_variety,music_lis_frequency,music_expl_method,music_recc_rating,Computer or laptop,Smart speakers or voice assistants,Smartphone,Wearable devices,number_of_devices,Classical,Electronic,Kpop,Melody,Old songs,Pop,Rap,Rock,dance,trending songs random,Relaxation and stress relief,Sadness or melancholy,Social gatherings or parties,Uplifting and motivational
0,20-35,Female,3,Free (ad-supported),Yes,Podcast,1.0,Night,1.0,leisure time,Playlists,3,0,1,0,0,1.0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
1,12-20,Male,3,Free (ad-supported),Yes,Podcast,1.0,Afternoon,1.0,Workout session,Playlists,2,1,0,0,0,1.0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
2,35-60,Others,1,Free (ad-supported),Yes,Podcast,1.0,Night,1.0,"Study Hours, While Traveling",Playlists,4,0,1,0,0,1.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
3,20-35,Female,2,Free (ad-supported),No,Music,1.0,Night,2.0,"Office hours, Workout session, leisure time","recommendations, Playlists",4,0,1,1,0,2.0,0,0,0,1,0,0,0,0,0,0,1,0,1,0
4,20-35,Female,2,Free (ad-supported),No,Music,1.0,Night,1.0,leisure time,"recommendations, Playlists",4,0,0,1,0,1.0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,20-35,Female,3,Free (ad-supported),Yes,Music,1.0,Morning,2.0,While Traveling,recommendations,5,1,0,1,1,3.0,0,0,0,0,0,1,0,0,0,0,0,1,0,1
516,20-35,Female,3,Free (ad-supported),Yes,Music,1.0,Night,4.0,"While Traveling, Workout session, leisure time",Others,2,1,0,1,0,2.0,0,0,0,0,0,1,0,0,0,0,1,1,1,1
517,20-35,Female,3,Free (ad-supported),No,Music,1.0,Night,1.0,leisure time,recommendations,3,0,0,1,0,1.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
518,20-35,Female,1,Premium (paid subscription),Yes,Music,1.0,Night,2.0,"Workout session, leisure time","recommendations, Others",2,0,0,1,0,1.0,0,0,0,1,0,0,0,0,0,0,1,0,0,1


In [2006]:
# Cleaning listening frequency column Define a function to remove trailing comma (to remove the empty space column)
def remove_trailing_comma(text):
    if isinstance(text, str) and text.endswith(', '):
        return text[:-1]  # Slice off last character
    return text

# Apply to the column
relevant_df['music_lis_frequency'] = relevant_df['music_lis_frequency'].apply(remove_trailing_comma)
relevant_df

Unnamed: 0,Age,Gender,spotify_usage_period,spotify_subscription_plan,premium_sub_willingness,preferred_listening_content,music_genre_variety,music_time_slot,music_mood_variety,music_lis_frequency,music_expl_method,music_recc_rating,Computer or laptop,Smart speakers or voice assistants,Smartphone,Wearable devices,number_of_devices,Classical,Electronic,Kpop,Melody,Old songs,Pop,Rap,Rock,dance,trending songs random,Relaxation and stress relief,Sadness or melancholy,Social gatherings or parties,Uplifting and motivational
0,20-35,Female,3,Free (ad-supported),Yes,Podcast,1.0,Night,1.0,leisure time,Playlists,3,0,1,0,0,1.0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
1,12-20,Male,3,Free (ad-supported),Yes,Podcast,1.0,Afternoon,1.0,Workout session,Playlists,2,1,0,0,0,1.0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
2,35-60,Others,1,Free (ad-supported),Yes,Podcast,1.0,Night,1.0,"Study Hours, While Traveling",Playlists,4,0,1,0,0,1.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
3,20-35,Female,2,Free (ad-supported),No,Music,1.0,Night,2.0,"Office hours, Workout session, leisure time","recommendations, Playlists",4,0,1,1,0,2.0,0,0,0,1,0,0,0,0,0,0,1,0,1,0
4,20-35,Female,2,Free (ad-supported),No,Music,1.0,Night,1.0,leisure time,"recommendations, Playlists",4,0,0,1,0,1.0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,20-35,Female,3,Free (ad-supported),Yes,Music,1.0,Morning,2.0,While Traveling,recommendations,5,1,0,1,1,3.0,0,0,0,0,0,1,0,0,0,0,0,1,0,1
516,20-35,Female,3,Free (ad-supported),Yes,Music,1.0,Night,4.0,"While Traveling, Workout session, leisure time",Others,2,1,0,1,0,2.0,0,0,0,0,0,1,0,0,0,0,1,1,1,1
517,20-35,Female,3,Free (ad-supported),No,Music,1.0,Night,1.0,leisure time,recommendations,3,0,0,1,0,1.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
518,20-35,Female,1,Premium (paid subscription),Yes,Music,1.0,Night,2.0,"Workout session, leisure time","recommendations, Others",2,0,0,1,0,1.0,0,0,0,1,0,0,0,0,0,0,1,0,0,1


In [2007]:
#Create lists out of the music_lis_frequency' data to handle multiple cell data 
mlb = MultiLabelBinarizer()
list_frequency_data = relevant_df['music_lis_frequency'].str.split(', ')
list_frequency_data = pd.DataFrame(mlb.fit_transform(list_frequency_data), columns=mlb.classes_)
index=relevant_df.index         # Keep original DataFrame index
list_frequency_data.head()

# Merge back into the DataFrame:
relevant_df = pd.concat([relevant_df, list_frequency_data], axis=1)
relevant_df.head(50) 

# Count the number of favorite music data to create the fav_music_variety_column
relevant_df['music_lis_frequency'] = relevant_df['music_lis_frequency'].apply(
    lambda x: float(x.count(',') + 1)  # Count commas and add 1
)
# rename the music_lis_frequency column to music_genre_variety
relevant_df = relevant_df.rename(columns={'music_lis_frequency': 'listening_frequency_variety'})

#Display all the rows
pd.set_option('display.max_columns', None)

# Verify the results by displaying all columns
relevant_df.head() 

Unnamed: 0,Age,Gender,spotify_usage_period,spotify_subscription_plan,premium_sub_willingness,preferred_listening_content,music_genre_variety,music_time_slot,music_mood_variety,listening_frequency_variety,music_expl_method,music_recc_rating,Computer or laptop,Smart speakers or voice assistants,Smartphone,Wearable devices,number_of_devices,Classical,Electronic,Kpop,Melody,Old songs,Pop,Rap,Rock,dance,trending songs random,Relaxation and stress relief,Sadness or melancholy,Social gatherings or parties,Uplifting and motivational,Before bed,Night time,Office hours,Random,Social gatherings,Study Hours,While Traveling,"While Traveling,",Workout session,leisure time,"leisure time,",when cooking
0,20-35,Female,3,Free (ad-supported),Yes,Podcast,1.0,Night,1.0,1.0,Playlists,3,0,1,0,0,1.0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,12-20,Male,3,Free (ad-supported),Yes,Podcast,1.0,Afternoon,1.0,1.0,Playlists,2,1,0,0,0,1.0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
2,35-60,Others,1,Free (ad-supported),Yes,Podcast,1.0,Night,1.0,2.0,Playlists,4,0,1,0,0,1.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
3,20-35,Female,2,Free (ad-supported),No,Music,1.0,Night,2.0,3.0,"recommendations, Playlists",4,0,1,1,0,2.0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0
4,20-35,Female,2,Free (ad-supported),No,Music,1.0,Night,1.0,1.0,"recommendations, Playlists",4,0,0,1,0,1.0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


### STEP 7: Music Exploration Method 

### STEP 8: combine spotify_subscription_plan	premium_sub_willingness  and create two tables 

### STEP 9: create two tables 


SUBSCRIBER VS NON SUBSCRIBER - GET RID OF THE WILLINGNESS COLUMN
do 4 way and compare willingness with resistant 
Create 3 tables on demographics, interaction, and content - comparing subscribers with non-subscribers 

### STEP 10: Get Dummies

spotify_subscription_plan	premium_sub_willingness

In [2008]:
### Step 2: Create the labels (`y`) from "premium_sub_willingness" and features (`X`) from the remaining columns.
# Separate the y variable (target: 'premium_sub_willingness')
y = df_spotify["premium_sub_willingness"].map({'Yes': 1, 'No': 0})  # Convert to binary

# Separate the X variable (features)
X = df_spotify.drop(columns=["premium_sub_willingness", "preffered_premium_plan"])  # Drop irrelevant columns

# Review the features
print("\nFeatures (X):")
print(X.head())


Features (X):
     Age  Gender spotify_usage_period  \
0  20-35  Female    More than 2 years   
1  12-20    Male    More than 2 years   
2  35-60  Others   6 months to 1 year   
3  20-35  Female    1 year to 2 years   
4  20-35  Female    1 year to 2 years   

                         spotify_listening_device spotify_subscription_plan  \
0              Smart speakers or voice assistants       Free (ad-supported)   
1                              Computer or laptop       Free (ad-supported)   
2              Smart speakers or voice assistants       Free (ad-supported)   
3  Smartphone, Smart speakers or voice assistants       Free (ad-supported)   
4                                      Smartphone       Free (ad-supported)   

  preferred_listening_content fav_music_genre music_time_slot  \
0                     Podcast          Melody           Night   
1                     Podcast             Rap       Afternoon   
2                     Podcast             Pop           Night   
3  

In [2009]:
### Step 3: Preprocess the data (encode categorical variables and scale numerical ones)
# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])


In [2010]:
## Create a Logistic Regression Model
### Step 1: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [2011]:
### Step 2: Build a pipeline with preprocessing and logistic regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=1))
])

In [2012]:
# Fit the model
pipeline.fit(X_train, y_train)

In [2013]:
### Step 3: Make predictions
y_pred = pipeline.predict(X_test)

In [2014]:
### Step 4: Evaluate the model
# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[51 18]
 [ 9 26]]

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.74      0.79        69
           1       0.59      0.74      0.66        35

    accuracy                           0.74       104
   macro avg       0.72      0.74      0.72       104
weighted avg       0.76      0.74      0.75       104



In [2015]:
## Interpret Results
**Question:** How well does the model predict subscription willingness (`1` = Yes, `0` = No)?

**Answer:**  
- The classification report shows precision, recall, and F1-score for both classes.  
- Focus on the `1` (Yes) class to identify users likely to convert.  
- Example interpretation:  
  - **Recall (Sensitivity)**: If high (e.g., 0.85), the model captures 85% of potential subscribers.  
  - **Precision**: If high (e.g., 0.90), 90% of predicted "Yes" cases are correct.  
  - **F1-Score**: Balances precision and recall (aim for >0.7).  

# ---
## Key Adaptations for Your Dataset:
1. **Target Variable**: Binary `premium_sub_willingness` (Yes/No → 1/0).  
2. **Dropped Irrelevant Columns**: Removed `preffered_premium_plan` (leakage risk).  
3. **Preprocessing**:  
   - One-hot encoded categorical variables (e.g., `Gender`, `fav_music_genre`).  
   - Scaled numerical features (if any).  
4. **Pipeline**: Combined preprocessing + logistic regression for robustness.  

---

### **Next Steps**:
1. **Improve Model**: Try `RandomForestClassifier` or `XGBoost` for better performance.  
2. **Feature Importance**: Use `pipeline.named_steps['classifier'].coef_` to analyze key drivers.  
3. **Visualizations**: Plot a confusion matrix or ROC curve (use `sklearn.metrics.plot_roc_curve`).  

Let me know if you'd like help extending this (e.g., adding visualizations or deploying the model)!

SyntaxError: invalid character '→' (U+2192) (2864305138.py, line 14)