# Data Preprocessing

In [701]:
import pandas as pd
import numpy as np
import glob
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
import warnings
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split,cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

# Silence only FutureWarning messages that mention `use_inf_as_na`;
# every other warning category/message is still shown.
warnings.filterwarnings("ignore", category=FutureWarning, message=".*use_inf_as_na.*")  

#### Read training and testing datasets

In [702]:
# Load the training data and the test instances (paths are relative to the
# notebook's working directory).
df, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('Combined_train.csv', 'testing-instances.csv')
)

#### Encoding the Time Signature column correctly for EDA

In [703]:
def encode_time_signature(date_format):
    """Map a date-like time-signature string to its numeric value.

    Known strings such as '04-Apr' are translated to the fraction they stand
    for (4/4 -> 1.0, 3/4 -> 0.75, ...). Presumably these strings come from a
    tool that auto-formatted '4/4' as a date -- confirm against the raw data.

    Parameters
    ----------
    date_format : object
        Raw cell value from the `time_signature` column.

    Returns
    -------
    float or object
        The numeric time signature for a recognised string; otherwise the
        input value is returned unchanged.
    """
    # Lookup table: raw string -> numeric time signature.
    time_signature_correction = {
        '04-Apr': 4/4,
        '03-Apr': 3/4,
        '01-Apr': 1/4,
        '05-Apr': 5/4,
        '0/4' : 0/4
    }
    # Unknown values fall through untouched.
    return time_signature_correction.get(date_format, date_format)

# Decode the time_signature column of both frames and store it as a numeric dtype.
for frame in (df, df_test):
    decoded_signature = frame['time_signature'].apply(encode_time_signature)
    frame['time_signature'] = pd.to_numeric(decoded_signature)

#### Handling the missing values

In [704]:
# Coerce tempo to a numeric dtype; entries that cannot be parsed become NaN.
df['tempo'] = pd.to_numeric(df['tempo'], errors='coerce')
# A duration of -1 is treated as a missing value.
df['duration_ms'] = df['duration_ms'].replace(to_replace=-1, value=np.nan)
# Disabled: df['artist_name'] = df['artist_name'].replace('empty_field', np.nan)

# Per-column count of missing values after the conversions above.
missing_values = df.isna().sum()
print("Missing Values:\n", missing_values)

Missing Values:
 instance_id             0
artist_name             0
track_name              0
track_id                0
popularity              0
acousticness            0
danceability            0
duration_ms         10114
energy                  0
instrumentalness        0
key                     0
liveness                0
loudness                0
mode                    0
speechiness             0
tempo                7501
time_signature          0
valence                 0
genre                   0
dtype: int64


In [705]:
# Independent copy of the training frame for the imputation step.
df_new = df.copy()
# Column names split by dtype: numeric columns go through the imputer,
# object columns are carried over unchanged.
numeric_cols = df_new.select_dtypes(include='number').columns
categorical_cols = df_new.select_dtypes(include='object').columns

In [706]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Fit the iterative imputer on the numeric training columns and fill their gaps.
# NOTE(review): `numeric_cols` also includes identifier-like columns such as
# `instance_id`, which therefore act as predictors -- confirm this is intended.
imputer = IterativeImputer()
# The imputer returns a bare array; re-attach the column names AND the original
# row index so the column-wise concat with the categorical columns lines up row
# by row even when `df_new` does not carry a default RangeIndex.
numeric_data_imputed = pd.DataFrame(
    imputer.fit_transform(df_new[numeric_cols]),
    columns=numeric_cols,
    index=df_new.index,
)
categorical_data = df_new[categorical_cols]

In [707]:
# Put the imputed numeric columns and the object columns side by side
# (numeric columns first).
df_new_comp = pd.concat((numeric_data_imputed, categorical_data), axis='columns')

# Report how many missing values remain in each column of the combined frame.
print("The number of missing values:\n", df_new_comp.isna().sum())

The number of missing values:
 instance_id         0
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
time_signature      0
valence             0
artist_name         0
track_name          0
track_id            0
key                 0
mode                0
genre               0
dtype: int64


In [708]:
# Same preparation as for the training frame: tempo becomes numeric
# (unparseable -> NaN) and a duration of -1 is treated as missing.
df_test['tempo'] = pd.to_numeric(df_test['tempo'], errors='coerce')
df_test['duration_ms'] = df_test['duration_ms'].replace(to_replace=-1, value=np.nan)

# Per-column count of missing values in the test frame.
missing_values = df_test.isna().sum()
print("Missing Values:\n", missing_values)

Missing Values:
 instance_id            0
artist_name            0
track_name             0
track_id               0
popularity             0
acousticness           0
danceability           0
duration_ms         2515
energy                 0
instrumentalness       0
key                    0
liveness               0
loudness               0
mode                   0
speechiness            0
tempo               1874
time_signature         0
valence                0
dtype: int64


In [710]:
# Re-derive the dtype split from the test frame (overwrites the lists built
# from the training frame).
numeric_cols = df_test.select_dtypes(include='number').columns
categorical_cols = df_test.select_dtypes(include='object').columns

In [711]:
# Fill the test gaps with the imputer that was fitted on the training data
# (transform only -- no refit on the test set).
# Re-attach column names and the original row index so the column-wise concat
# below lines up row by row even if `df_test` has a non-default index.
numeric_data_imputed = pd.DataFrame(
    imputer.transform(df_test[numeric_cols]),
    columns=numeric_cols,
    index=df_test.index,
)
categorical_data = df_test[categorical_cols]
df_test_comp = pd.concat([numeric_data_imputed, categorical_data], axis=1)

# Report how many missing values remain in each column of the combined frame.
print("The number of missing values:\n", df_test_comp.isnull().sum())

The number of missing values:
 instance_id         0
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
time_signature      0
valence             0
artist_name         0
track_name          0
track_id            0
key                 0
mode                0
dtype: int64


#### Writing the cleaned data to files

In [713]:
# Persist the imputed train/test frames (row index is not written).
for cleaned_frame, csv_name in (
    (df_new_comp, 'Cleaned_data.csv'),
    (df_test_comp, 'Cleaned_test_data.csv'),
):
    cleaned_frame.to_csv(csv_name, index=False)

In [672]:
# Reload the cleaned files. Until this cell runs, `df_new` is still the
# pre-imputation copy of `df` and `df_test` is the pre-imputation test frame,
# so the feature-engineering cells below rely on this cell being executed
# AFTER the cell that writes the cleaned CSV files.
df_new, df_test = map(pd.read_csv, ('Cleaned_data.csv', 'Cleaned_test_data.csv'))

#### Feature engineering based on the insights gained

##### Bimodal nature of Instrumentalness and Acousticness

In [714]:
from sklearn.mixture import GaussianMixture
# Training rows that have an instrumentalness value, kept as a one-column frame
# because the mixture model is fitted on a 2-D feature matrix.
instrumentalness = df_new[['instrumentalness']].dropna()

# Fit Gaussian Mixture Model (two components, fixed seed)
gmm = GaussianMixture(n_components=2, random_state=42)
gmm.fit(instrumentalness)

# The index a mixture model gives to each component is arbitrary, so component 0
# is not guaranteed to be the low-valued mode. Sort the fitted components by
# their mean so that 0 always means "lower mean" and 1 "higher mean"; every
# later `gmm.predict` call then uses the same convention.
# All listed arrays are indexed by component along axis 0 for the default
# covariance_type='full' used above.
instrumentalness_component_order = np.argsort(gmm.means_[:, 0])
for fitted_param in ('weights_', 'means_', 'covariances_', 'precisions_', 'precisions_cholesky_'):
    setattr(gmm, fitted_param, getattr(gmm, fitted_param)[instrumentalness_component_order])

# Component membership per row, then split the feature into two columns:
# each keeps the original value for rows of "its" component and 0 elsewhere.
df_new['instrumentalness_mode'] = gmm.predict(instrumentalness)
df_new['instrumentalness_low'] = df_new['instrumentalness'].where(df_new['instrumentalness_mode'] == 0, 0)
df_new['instrumentalness_high'] = df_new['instrumentalness'].where(df_new['instrumentalness_mode'] == 1, 0)
# The original column is fully represented by the two split columns.
df_new.drop(columns=['instrumentalness'], inplace=True)

In [715]:
# Assign each test row to a component of the mixture fitted on the training data.
instrumentalness_test = df_test[['instrumentalness']].dropna()
instrumentalness_mode_test = gmm.predict(instrumentalness_test)
df_test['instrumentalness_mode'] = instrumentalness_mode_test

# Split the feature by component: keep the value for member rows, 0 otherwise.
for component_id, split_column in ((0, 'instrumentalness_low'), (1, 'instrumentalness_high')):
    in_component = df_test['instrumentalness_mode'].eq(component_id)
    df_test[split_column] = df_test['instrumentalness'].where(in_component, other=0)

# Remove the original feature now that the split columns exist.
df_test.drop(columns=['instrumentalness'], inplace=True)

In [716]:

# Training rows that have an acousticness value, kept as a one-column frame
# because the mixture model is fitted on a 2-D feature matrix.
acousticness = df_new[['acousticness']].dropna()
# Fit Gaussian Mixture Model for acousticness (two components, fixed seed)
gmm_acousticness = GaussianMixture(n_components=2, random_state=42)
gmm_acousticness.fit(acousticness)

# The index a mixture model gives to each component is arbitrary, so component 0
# is not guaranteed to be the low-valued mode. Sort the fitted components by
# their mean so that 0 always means "lower mean" and 1 "higher mean" for both
# the train and the test predictions below.
# All listed arrays are indexed by component along axis 0 for the default
# covariance_type='full' used above.
acousticness_component_order = np.argsort(gmm_acousticness.means_[:, 0])
for fitted_param in ('weights_', 'means_', 'covariances_', 'precisions_', 'precisions_cholesky_'):
    setattr(
        gmm_acousticness,
        fitted_param,
        getattr(gmm_acousticness, fitted_param)[acousticness_component_order],
    )

# Predict and create new columns based on the mixture components:
# each split column keeps the value for rows of its component and 0 elsewhere.
df_new['acousticness_mode'] = gmm_acousticness.predict(acousticness)
df_new['acousticness_low'] = df_new['acousticness'].where(df_new['acousticness_mode'] == 0, 0)
df_new['acousticness_high'] = df_new['acousticness'].where(df_new['acousticness_mode'] == 1, 0)

# Drop the original acousticness column; it is represented by the split columns
df_new.drop(columns=['acousticness'], inplace=True)

# Handle the acousticness column for test data with the mixture fitted on train
acousticness_test = df_test[['acousticness']].dropna()
acousticness_mode_test = gmm_acousticness.predict(acousticness_test)

df_test['acousticness_mode'] = acousticness_mode_test
df_test['acousticness_low'] = df_test['acousticness'].where(df_test['acousticness_mode'] == 0, 0)
df_test['acousticness_high'] = df_test['acousticness'].where(df_test['acousticness_mode'] == 1, 0)

# Drop the original acousticness column in the test set
df_test.drop(columns=['acousticness'], inplace=True)

##### Skewness and Kurtosis

In [717]:
from sklearn.preprocessing import PowerTransformer

# Box-Cox transformer; output is left un-standardized.
pt = PowerTransformer(method='box-cox', standardize=False)

# Features to transform and the names of their shifted helper columns.
boxcox_features = ['liveness', 'loudness', 'speechiness']
boxcox_shifted_cols = [feature + '_shifted' for feature in boxcox_features]

# When a feature has a minimum <= 0, move it so the minimum becomes 1;
# otherwise leave it unshifted.
for feature, shifted_col in zip(boxcox_features, boxcox_shifted_cols):
    feature_min = df_new[feature].min()
    df_new[shifted_col] = df_new[feature] + ((1 - feature_min) if feature_min <= 0 else 0)

# Estimate the transformation on the shifted training columns ...
pt.fit(df_new[boxcox_shifted_cols])

# ... and store the transformed training values under bc_* names.
df_new[['bc_liveness', 'bc_loudness', 'bc_speechiness']] = pt.transform(df_new[boxcox_shifted_cols])


In [718]:
# Apply to the test set exactly the shift that the training data received.
# Both the decision to shift and the shift amount must come from the TRAINING
# minimum: deciding from the test minimum could leave test unshifted (or
# shifted) while the transformer was fitted on the opposite, putting train and
# test on different scales.
# NOTE(review): Box-Cox needs strictly positive input, so a test value at or
# below (training minimum - 1) still cannot be transformed -- decide whether
# such rows should be clipped or rejected.
for feature in ('liveness', 'loudness', 'speechiness'):
    train_min = df_new[feature].min()
    train_shift = (1 - train_min) if train_min <= 0 else 0
    df_test[feature + '_shifted'] = df_test[feature] + train_shift

# Transform with the parameters estimated on the training data (no refit).
df_test[['bc_liveness', 'bc_loudness', 'bc_speechiness']] = pt.transform(df_test[['liveness_shifted', 'loudness_shifted', 'speechiness_shifted']])


In [719]:
# The shifted helper columns and the raw features are no longer needed once
# the bc_* columns exist; remove them from both frames.
boxcox_obsolete_cols = [
    'liveness_shifted', 'loudness_shifted', 'speechiness_shifted',
    'liveness', 'loudness', 'speechiness',
]
for frame in (df_new, df_test):
    frame.drop(columns=boxcox_obsolete_cols, inplace=True)

#### Writing Final Transformed Data into Files

In [720]:
# Persist the fully transformed train/test frames (row index is not written).
for final_frame, csv_name in (
    (df_new, 'Cleaned_final_data.csv'),
    (df_test, 'Cleaned_final_test_data.csv'),
):
    final_frame.to_csv(csv_name, index=False)