In [2]:
import pandas as pd
import numpy as np



# 🎵 **Spotify Feature Engineering** 🎵
## 📜 Explanation of Selected Feature Columns  

### 1️⃣ **`duration_min`**  
This feature converts the song duration from **seconds to minutes** (`duration / 60`). It helps in analyzing **average song length** and trends over time. Some genres may have longer or shorter songs, influencing user listening behavior.

### 2️⃣ **`popularity_category`**  
The `popularity` column is categorized into three levels:  
- **Low (0-30)**  
- **Medium (31-70)**  
- **High (71-100)**  

This transformation groups songs based on their popularity trends, making it easier to **compare top hits with lesser-known tracks**. It is especially useful in **trend analysis and recommendation systems**.

### 3️⃣ **`log_streams`**  
Applying a **log transformation** (`log(1 + streams)`, so songs with zero streams are handled safely) to the number of streams helps handle **highly skewed data**, making patterns clearer. Since a few songs receive significantly more streams than others, **compressing the scale this way allows fairer comparisons** and reduces the influence of extreme values.

### 4️⃣ **`song_age`**  
This feature calculates the **age of the song** (in years) from its release year using:  
`current_year - release_year`, where `current_year` is fixed at 2024 in this notebook.  

It helps analyze whether **older songs still receive streams** or if **recent releases dominate**. This is useful for identifying **long-term trends** in music consumption.




In [9]:

# Read the cleaned Spotify songs CSV into the working DataFrame `df`.
source_csv = "new_cleanedd_spotify_songs.csv"  # Replace with your actual file name
df = pd.read_csv(source_csv)


In [10]:
# Derive the track length in minutes from the `duration` column
# (assumed to be in seconds, per the feature description above).
duration_seconds = df["duration"]
df["duration_min"] = duration_seconds / 60




In [11]:
# Bucket popularity scores into Low (<= 30), Medium (> 30 and <= 70) and High (> 70).
# Rows that match none of the masks (e.g. NaN scores, whose comparisons are all
# False) receive the "Unknown" default.
popularity = df["popularity"]
is_low = popularity <= 30
is_medium = (popularity > 30) & (popularity <= 70)
is_high = popularity > 70

conditions = [is_low, is_medium, is_high]
categories = ["Low", "Medium", "High"]
df["popularity_category"] = np.select(conditions, categories, default="Unknown")



In [12]:
# Compress the heavily skewed stream counts with log(1 + x);
# log1p keeps zero-stream rows finite (log1p(0) == 0) instead of -inf.
stream_counts = df["stream"]
df["log_streams"] = np.log1p(stream_counts)


In [13]:
# Extract the release year from `release_date` (parsed as datetimes);
# this is the input to the song-age feature.
release_dates = pd.to_datetime(df["release_date"])
df["release_year"] = release_dates.dt.year


In [14]:
# Song age in years, measured against a fixed reference year so that
# re-running the notebook reproduces the same values.
REFERENCE_YEAR = 2024  # Assuming the current year is 2024
df["song_age"] = REFERENCE_YEAR - df["release_year"]


In [20]:
# Replace missing `duration` values with the column median
# (median() skips missing values by default).
duration_median = df['duration'].median()
df['duration'] = df['duration'].fillna(duration_median)


In [21]:



# Replace missing `duration_min` values with the column median
# (median() skips missing values by default).
duration_min_median = df['duration_min'].median()
df['duration_min'] = df['duration_min'].fillna(duration_min_median)



In [23]:
# Fill missing `language` values with the most frequent language.
# Series.mode() returns a Series (one entry per tied mode, indexed 0..k-1).
# Passing that Series straight to fillna() aligns it on index labels, so only
# rows whose label matches a mode index would be filled. Use the first mode as
# a scalar so every missing row is filled.
language_modes = df['language'].mode()
if not language_modes.empty:  # mode() is empty when every value is missing
    df['language'] = df['language'].fillna(language_modes.iloc[0])


In [24]:
# Preview the first five rows to sanity-check the engineered columns.
df.head(n=5)

Unnamed: 0,song_id,song_title,artist,album,genre,release_date,duration,popularity,stream,language,explicit_content,label,composer,producer,duration_min,popularity_category,log_streams,release_year,song_age
0,SP0001,space executive series.,sydney clark,what.,electronic,1997-11-08,282.0,42.0,35055874.0,english,Yes,def jam,amy hatfield,jeffrey weaver,4.7,Medium,17.372454,1997,27
1,SP0002,price last painting.,connor peters dds,nature politics.,electronic,2015-05-10,127.0,50.0,9249527.0,english,Yes,universal music,jason gregory,kenneth white,2.116667,Medium,16.040083,2015,9
2,SP0003,piece.,anna keith,visit.,pop,2024-07-08,240.0,10.0,76669110.0,english,Yes,universal music,rachel lopez,jason barnes,4.0,Low,18.155009,2024,0
3,SP0004,power industry your.,zachary simpson,behavior evening.,hip-hop,2022-08-15,214.0,86.0,34732016.0,english,No,sony music,thomas li,mrs. becky palmer,3.566667,High,17.363172,2022,2
4,SP0005,food animal second.,christopher mcgee,front.,pop,2023-03-05,273.0,63.0,96649372.0,english,Yes,def jam,adam wagner,beverly baker,4.55,Medium,18.3866,2023,1


In [25]:
# Save the processed dataset
# Write the feature-engineered frame to CSV; index=False omits the row index.
output_csv = "new_spotify_featured_data.csv"
df.to_csv(output_csv, index=False)


In [26]:
# Show the first five rows of the frame that was just written to disk.
df.head(n=5)

Unnamed: 0,song_id,song_title,artist,album,genre,release_date,duration,popularity,stream,language,explicit_content,label,composer,producer,duration_min,popularity_category,log_streams,release_year,song_age
0,SP0001,space executive series.,sydney clark,what.,electronic,1997-11-08,282.0,42.0,35055874.0,english,Yes,def jam,amy hatfield,jeffrey weaver,4.7,Medium,17.372454,1997,27
1,SP0002,price last painting.,connor peters dds,nature politics.,electronic,2015-05-10,127.0,50.0,9249527.0,english,Yes,universal music,jason gregory,kenneth white,2.116667,Medium,16.040083,2015,9
2,SP0003,piece.,anna keith,visit.,pop,2024-07-08,240.0,10.0,76669110.0,english,Yes,universal music,rachel lopez,jason barnes,4.0,Low,18.155009,2024,0
3,SP0004,power industry your.,zachary simpson,behavior evening.,hip-hop,2022-08-15,214.0,86.0,34732016.0,english,No,sony music,thomas li,mrs. becky palmer,3.566667,High,17.363172,2022,2
4,SP0005,food animal second.,christopher mcgee,front.,pop,2023-03-05,273.0,63.0,96649372.0,english,Yes,def jam,adam wagner,beverly baker,4.55,Medium,18.3866,2023,1
