In [1]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
import custom_library as cl
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

In [2]:
# Load the CSV file into a pandas DataFrame
enlace_drive = "https://drive.google.com/uc?id=1c1l3OMUFjQWcydV0srKe95LLEfkBMKia"
original_data = pd.read_csv(enlace_drive)

# Independent copy of the raw frame, so the loaded data itself is never mutated
modified_data = original_data.copy()

# Columns that the 'removal' step drops before preprocessing and that the
# 'addition' step concatenates back, untouched, at the end of the pipeline
columns_to_keep = [
    'label',
    'acousticness',
    'instrumentalness',
    'key',
    'mode',
    'time_signature',
]

In [3]:
# Each step wraps a callable in a FunctionTransformer so it can be chained in a
# sklearn Pipeline. The lambdas resolve `cl` and `columns_to_keep` when called.

# Step 1: Binarize variables (delegates to cl.binarize_variables)
binarization = FunctionTransformer(
    lambda frame: cl.binarize_variables(frame)
)

# Step 2: Drop the columns listed in columns_to_keep; despite the name, they are
# set aside here and re-attached by the 'addition' step
removal = FunctionTransformer(
    lambda frame: frame.drop(columns=columns_to_keep)
)

# Step 3: Handle outliers and missing data
handle_outliers = FunctionTransformer(
    lambda frame: cl.handle_outliers(
        frame,
        method='Imputacion',
        imputation_method='MICE',
        winsorization_rate=None,
    )
)
imputation = FunctionTransformer(
    lambda frame: cl.compute_missing_data(
        frame,
        method='KNN',
        testing=True,
        percentage=0.05,
    )
)

# Step 4: Transformation of variables
normalization = FunctionTransformer(
    lambda frame: cl.transform_data(
        frame,
        method='yeo-johnson',
        p_thres=0.05,
    )
)

# Step 5: Dimensionality reduction
# NOTE(review): this transformer is defined but does not appear in the pipeline
# steps assembled later in this notebook; confirm whether that is intentional.
dimensionality = FunctionTransformer(
    lambda frame: cl.reduce_dimensionality(
        frame,
        method='Projection',
        corr_thres=0.8,
        var_thres=0.01,
        normality_thres=0.05,
        explained_var=0.93,
        do_ica=False,
    )
)
# Step 6: Re-attach the columns that were set aside before preprocessing
def add_columns_back(modified_data, original_data, columns_to_keep):
    """Concatenate the processed features with the untouched original columns.

    Parameters
    ----------
    modified_data : array-like or pandas.DataFrame
        Processed values for every column of ``original_data`` that is not in
        ``columns_to_keep``, in the same column order as ``original_data``.
    original_data : pandas.DataFrame
        Frame that supplies both the processed column names and the columns
        to re-attach.
    columns_to_keep : iterable of str
        Names of the ``original_data`` columns to append unchanged.

    Returns
    -------
    pandas.DataFrame
        Processed columns followed by the kept columns, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If the processed data and ``original_data`` differ in row count.
    """
    kept_names = list(columns_to_keep)
    processed_names = [col for col in original_data.columns if col not in kept_names]

    processed = pd.DataFrame(modified_data, columns=processed_names)
    kept = original_data[kept_names]

    # concat(axis=1) aligns on the index: with unequal lengths it would pad the
    # shorter side with NaN and silently pair features with the wrong rows.
    if len(processed) != len(kept):
        raise ValueError(
            "Row count mismatch: processed data has %d rows but original_data has %d"
            % (len(processed), len(kept))
        )

    # Both sides get a fresh positional index so rows are paired by position.
    return pd.concat(
        [processed.reset_index(drop=True), kept.reset_index(drop=True)],
        axis=1,
    )
# Wrap add_columns_back as a pipeline step. `original_data` and `columns_to_keep`
# are read from the notebook globals each time the step runs.
# NOTE(review): the kept columns therefore always come from the global
# `original_data`, not from whatever frame is passed to the pipeline.
addition = FunctionTransformer(
    lambda frame: add_columns_back(frame, original_data, columns_to_keep),
    validate=False,
)

In [4]:
# Chain the preprocessing transformers in execution order: the kept columns are
# dropped right after binarization and re-attached by the final step.
pipeline = Pipeline(
    steps=[
        ("binarization", binarization),
        ("removal", removal),
        ("handle_outliers", handle_outliers),
        ("imputation", imputation),
        ("normalization", normalization),
        ("addition", addition),
    ]
)

In [5]:
# Fit every pipeline step on the full dataset and keep the transformed result.
transformed_data = pipeline.fit_transform(original_data)
# Summary statistics of the transformed frame, shown as the cell output.
transformed_data.describe()

Unnamed: 0,danceability,duration,energy,liveness,loudness,speechiness,tempo,valence,label,acousticness,instrumentalness,key,mode,time_signature
count,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0
mean,-1.077656e-16,-2.806644e-16,-3.481659e-16,5.37644e-16,2.984279e-16,-2.19084e-16,-3.718507e-16,8.408089e-17,0.602667,0.357394,0.100245,4.829333,0.741333,3.902667
std,1.000667,1.000667,1.000667,1.000667,1.000667,1.000667,1.000667,1.000667,0.489673,0.338405,0.259921,3.636001,0.438194,0.400091
min,-2.456003,-2.751513,-1.983064,-2.533777,-2.17356,-1.723473,-2.678377,-2.103318,0.0,1e-06,0.0,0.0,0.0,1.0
25%,-0.6877006,-0.6467967,-0.7418552,-0.7570056,-0.710768,-0.8646402,-0.7278264,-0.828533,0.0,0.03715,0.0,1.0,0.0,4.0
50%,0.01444718,0.01878049,0.04908488,-0.1506445,0.04245734,-0.1658211,0.05406492,0.005530534,1.0,0.2445,1e-05,5.0,1.0,4.0
75%,0.6473217,0.5726352,0.8144655,0.7538507,0.7282859,0.7703461,0.6956556,0.7894804,1.0,0.6785,0.002245,8.0,1.0,4.0
max,2.593526,2.700316,1.863444,2.033538,3.401593,1.987191,2.45496,1.947572,1.0,0.994,0.967,11.0,1.0,5.0


In [6]:
# One seed for both the split and the forest, so the metrics and the saved
# model are reproducible across Restart & Run All.
SEED = 42

model = RandomForestClassifier(max_depth=5, n_estimators=25, random_state=SEED)

# Features are every transformed column except the target.
x = transformed_data.drop(columns=['label'])
y = transformed_data['label']

# NOTE(review): the preprocessing pipeline was fit on the full dataset before
# this split, so the held-out metrics may be optimistic; consider splitting first.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=SEED
)

model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Dict form gives programmatic access to the metrics; text form is for display.
report = classification_report(y_test, y_pred, output_dict=True)
accuracy = report['accuracy']
print(classification_report(y_test, y_pred))

# Persist the fitted classifier; the returned file list is the cell output.
joblib.dump(model, 'random_forest_model.pkl')

              precision    recall  f1-score   support

           0       0.71      0.78      0.74        54
           1       0.87      0.82      0.84        96

    accuracy                           0.81       150
   macro avg       0.79      0.80      0.79       150
weighted avg       0.81      0.81      0.81       150



['random_forest_model.pkl']