In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif


In [2]:
df = pd.read_csv("/content/avenged_songs.csv")


In [3]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   album           132 non-null    object 
 1   name            132 non-null    object 
 2   popularity      132 non-null    int64  
 3   release_date    132 non-null    object 
 4   track_number    132 non-null    int64  
 5   energy          132 non-null    float64
 6   tempo           132 non-null    float64
 7   time_signature  132 non-null    int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 8.4+ KB


In [4]:
df.head()


Unnamed: 0,album,name,popularity,release_date,track_number,energy,tempo,time_signature
0,Life Is But a Dream,Game Over,70,6/2/2023,1,0.841,87.01,4
1,Life Is But a Dream,Mattel,70,6/2/2023,2,0.822,127.1,4
2,Life Is But a Dream,Nobody,70,6/2/2023,3,0.677,73.762,4
3,Life Is But a Dream,We Love You,67,6/2/2023,4,0.813,140.034,4
4,Life Is But a Dream,Cosmic,69,6/2/2023,5,0.722,127.02,4


In [5]:
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['year'] = df['release_date'].dt.year
df.head()


Unnamed: 0,album,name,popularity,release_date,track_number,energy,tempo,time_signature,year
0,Life Is But a Dream,Game Over,70,2023-06-02,1,0.841,87.01,4,2023
1,Life Is But a Dream,Mattel,70,2023-06-02,2,0.822,127.1,4,2023
2,Life Is But a Dream,Nobody,70,2023-06-02,3,0.677,73.762,4,2023
3,Life Is But a Dream,We Love You,67,2023-06-02,4,0.813,140.034,4,2023
4,Life Is But a Dream,Cosmic,69,2023-06-02,5,0.722,127.02,4,2023


In [6]:
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")

df[num_cols] = num_imputer.fit_transform(df[num_cols])
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

df.isnull().sum()


Unnamed: 0,0
album,0
name,0
popularity,0
release_date,0
track_number,0
energy,0
tempo,0
time_signature,0
year,0


In [7]:
df = pd.get_dummies(df, columns=['album', 'name'], drop_first=True)
df.head()


Unnamed: 0,popularity,release_date,track_number,energy,tempo,time_signature,year,album_City of Evil,album_Diamonds in the Rough,album_Hail to the King,...,name_Victim,name_Waking The Fallen: Resurrected,name_Waking the Fallen,name_Walk,name_Walk - Live,name_Warmness On The Soul,name_We Come Out At Night,name_We Love You,name_Welcome to the Family,name_Wish You Were Here
0,70.0,2023-06-02,1.0,0.841,87.01,4.0,2023,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,70.0,2023-06-02,2.0,0.822,127.1,4.0,2023,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,70.0,2023-06-02,3.0,0.677,73.762,4.0,2023,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,67.0,2023-06-02,4.0,0.813,140.034,4.0,2023,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,69.0,2023-06-02,5.0,0.722,127.02,4.0,2023,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
df[num_cols].head()


Unnamed: 0,popularity,track_number,energy,tempo,time_signature
0,1.569673,-1.408483,-0.228956,-1.501347,0.254
1,1.569673,-1.192548,-0.368977,-0.130626,0.254
2,1.569673,-0.976613,-1.437551,-1.954311,0.254
3,1.291432,-0.760679,-0.435302,0.311601,0.254
4,1.476926,-0.544744,-1.105925,-0.133362,0.254


In [9]:
df['era'] = df['year'].apply(lambda x: 0 if x <= 2013 else 1)
y = df['era']
X = df.drop(['era', 'release_date'], axis=1)


In [10]:
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X)
print("PCA variance explained:", pca.explained_variance_ratio_)


PCA variance explained: [0.88805926 0.02757742 0.02347585 0.01647667 0.01077004]


In [11]:
selector = SelectKBest(score_func=f_classif, k=5)
X_kbest = selector.fit_transform(X, y)
print("Selected feature indices:", selector.get_support(indices=True))


Selected feature indices: [ 0  5  7 10 13]


In [12]:
print("Original Shape:", df.shape)
print("After PCA:", X_pca.shape)
print("After SelectKBest:", X_kbest.shape)


Original Shape: (132, 149)
After PCA: (132, 5)
After SelectKBest: (132, 5)
