In [7]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [8]:
## Split the Data into Training and Testing Sets
### Step 1: Read the `Spotify_data.csv` data into a Pandas DataFrame.
file_path = Path("../Spotify/Resources/Spotify_data.csv")  # Update path if needed
df_spotify = pd.read_csv(file_path)

In [6]:
# Review the DataFrame
print("First 5 rows of the dataset:")
df_spotify.head()

First 5 rows of the dataset:


Unnamed: 0,Age,Gender,spotify_usage_period,spotify_listening_device,spotify_subscription_plan,premium_sub_willingness,preffered_premium_plan,preferred_listening_content,fav_music_genre,music_time_slot,music_Influencial_mood,music_lis_frequency,music_expl_method,music_recc_rating,pod_lis_frequency,fav_pod_genre,preffered_pod_format,pod_host_preference,preffered_pod_duration,pod_variety_satisfaction
0,20-35,Female,More than 2 years,Smart speakers or voice assistants,Free (ad-supported),Yes,Family Plan-Rs 179/month,Podcast,Melody,Night,Sadness or melancholy,leisure time,Playlists,3,Daily,Comedy,Interview,Both,Both,Ok
1,12-20,Male,More than 2 years,Computer or laptop,Free (ad-supported),Yes,Individual Plan- Rs 119/ month,Podcast,Rap,Afternoon,Social gatherings or parties,Workout session,Playlists,2,Several times a week,Comedy,Interview,Both,,Satisfied
2,35-60,Others,6 months to 1 year,Smart speakers or voice assistants,Free (ad-supported),Yes,Student Plan-Rs 59/month,Podcast,Pop,Night,Relaxation and stress relief,"Study Hours, While Traveling",Playlists,4,Once a week,Sports,Interview,,Both,Satisfied
3,20-35,Female,1 year to 2 years,"Smartphone, Smart speakers or voice assistants",Free (ad-supported),No,,Music,Melody,Night,"Relaxation and stress relief, Social gathering...","Office hours, Workout session, leisure time","recommendations, Playlists",4,Never,,,,,Ok
4,20-35,Female,1 year to 2 years,Smartphone,Free (ad-supported),No,,Music,Melody,Night,Relaxation and stress relief,leisure time,"recommendations, Playlists",4,Rarely,Lifestyle and Health,Story telling,Well known individuals,Both,Ok


In [9]:
### Step 2: Create the labels (`y`) from "premium_sub_willingness" and features (`X`) from the remaining columns.
# Separate the y variable (target: 'premium_sub_willingness')
y = df_spotify["premium_sub_willingness"].map({'Yes': 1, 'No': 0})  # Convert to binary

# Separate the X variable (features)
X = df_spotify.drop(columns=["premium_sub_willingness", "preffered_premium_plan"])  # Drop irrelevant columns

# Review the features
print("\nFeatures (X):")
print(X.head())


Features (X):
     Age  Gender spotify_usage_period  \
0  20-35  Female    More than 2 years   
1  12-20    Male    More than 2 years   
2  35-60  Others   6 months to 1 year   
3  20-35  Female    1 year to 2 years   
4  20-35  Female    1 year to 2 years   

                         spotify_listening_device spotify_subscription_plan  \
0              Smart speakers or voice assistants       Free (ad-supported)   
1                              Computer or laptop       Free (ad-supported)   
2              Smart speakers or voice assistants       Free (ad-supported)   
3  Smartphone, Smart speakers or voice assistants       Free (ad-supported)   
4                                      Smartphone       Free (ad-supported)   

  preferred_listening_content fav_music_genre music_time_slot  \
0                     Podcast          Melody           Night   
1                     Podcast             Rap       Afternoon   
2                     Podcast             Pop           Night   
3  

In [10]:
### Step 3: Preprocess the data (encode categorical variables and scale numerical ones)
# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])


In [11]:
## Create a Logistic Regression Model
### Step 1: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [12]:
### Step 2: Build a pipeline with preprocessing and logistic regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=1))
])

In [13]:
# Fit the model
pipeline.fit(X_train, y_train)

In [14]:
### Step 3: Make predictions
y_pred = pipeline.predict(X_test)

In [15]:
### Step 4: Evaluate the model
# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[51 18]
 [ 9 26]]

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.74      0.79        69
           1       0.59      0.74      0.66        35

    accuracy                           0.74       104
   macro avg       0.72      0.74      0.72       104
weighted avg       0.76      0.74      0.75       104



In [16]:
## Interpret Results
**Question:** How well does the model predict subscription willingness (`1` = Yes, `0` = No)?

**Answer:**  
- The classification report shows precision, recall, and F1-score for both classes.  
- Focus on the `1` (Yes) class to identify users likely to convert.  
- Example interpretation:  
  - **Recall (Sensitivity)**: If high (e.g., 0.85), the model captures 85% of potential subscribers.  
  - **Precision**: If high (e.g., 0.90), 90% of predicted "Yes" cases are correct.  
  - **F1-Score**: Balances precision and recall (aim for >0.7).  

# ---
## Key Adaptations for Your Dataset:
1. **Target Variable**: Binary `premium_sub_willingness` (Yes/No → 1/0).  
2. **Dropped Irrelevant Columns**: Removed `preffered_premium_plan` (leakage risk).  
3. **Preprocessing**:  
   - One-hot encoded categorical variables (e.g., `Gender`, `fav_music_genre`).  
   - Scaled numerical features (if any).  
4. **Pipeline**: Combined preprocessing + logistic regression for robustness.  

---

### **Next Steps**:
1. **Improve Model**: Try `RandomForestClassifier` or `XGBoost` for better performance.  
2. **Feature Importance**: Use `pipeline.named_steps['classifier'].coef_` to analyze key drivers.  
3. **Visualizations**: Plot a confusion matrix or ROC curve (use `sklearn.metrics.plot_roc_curve`).  

Let me know if you'd like help extending this (e.g., adding visualizations or deploying the model)!

SyntaxError: invalid character '→' (U+2192) (2864305138.py, line 14)