In [19]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
import custom_library as cl
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

In [20]:
# Location of the dataset: a CSV file shared through Google Drive
enlace_drive = "https://drive.google.com/uc?id=1c1l3OMUFjQWcydV0srKe95LLEfkBMKia"

# Columns that the `removal` step drops before the numeric preprocessing and
# that the `addition` step concatenates back onto the result afterwards.
columns_to_keep = ['acousticness', 'instrumentalness', 'key', 'mode', 'time_signature']

# Load the CSV file into a pandas DataFrame (comma is read_csv's default separator)
original_data = pd.read_csv(enlace_drive)

# Independent copy of the freshly loaded frame
modified_data = original_data.copy()

In [21]:
# Every preprocessing stage wraps a callable in a FunctionTransformer.
# The callables are passed directly, with their options in `kw_args`, rather
# than through lambdas: lambdas cannot be pickled, which would prevent saving
# the fitted pipeline with joblib next to the model. (Loading such a pickle
# still requires `custom_library` to be importable.)

# Step 1: Binarize variables (delegates to cl.binarize_variables)
binarization = FunctionTransformer(cl.binarize_variables)

# Step 2: Drop the columns listed in `columns_to_keep` so the following steps
# do not process them; they are re-attached by the `addition` step.
# NOTE: the list object is bound here, when the transformer is built.
removal = FunctionTransformer(pd.DataFrame.drop, kw_args={'columns': columns_to_keep})

# Step 3: Handle outliers and missing data
handle_outliers = FunctionTransformer(
    cl.handle_outliers,
    kw_args={'method': 'Imputacion', 'imputation_method': 'MICE', 'winsorization_rate': None},
)
imputation = FunctionTransformer(
    cl.compute_missing_data,
    kw_args={'method': 'KNN', 'testing': False, 'percentage': 0.05},
)

# Step 4: Transformation of variables
normalization = FunctionTransformer(
    cl.transform_data,
    kw_args={'method': 'yeo-johnson', 'p_thres': 0.05},
)

# Step 5: Dimensionality reduction
dimensionality = FunctionTransformer(
    cl.reduce_dimensionality,
    kw_args={
        'method': 'Projection',
        'corr_thres': 0.8,
        'var_thres': 0.01,
        'normality_thres': 0.05,
        'explained_var': 0.93,
        'do_ica': False,
    },
)

# Step 6: Re-attach the columns that were set aside before preprocessing
def add_columns_back(modified_data, original_data, columns_to_keep):
    """Concatenate the processed columns with the set-aside original columns.

    Parameters
    ----------
    modified_data : array-like or DataFrame
        Output of the preceding preprocessing steps. It is wrapped in a
        DataFrame whose columns are the names of ``original_data`` that are
        not in ``columns_to_keep``, in their original order (for a DataFrame
        input, pandas selects those columns by name).
    original_data : DataFrame
        Frame supplying both the column names and the set-aside columns.
    columns_to_keep : list
        Names of the columns taken unchanged from ``original_data``.

    Returns
    -------
    DataFrame
        Processed columns first, then ``columns_to_keep``; both parts have
        their index reset, so rows are paired by position.
    """
    processed_names = []
    for name in original_data.columns:
        if name not in columns_to_keep:
            processed_names.append(name)

    processed = pd.DataFrame(modified_data, columns=processed_names).reset_index(drop=True)
    set_aside = original_data[columns_to_keep].reset_index(drop=True)
    return pd.concat([processed, set_aside], axis=1)
# Transformer for the final stage: concatenates the set-aside columns back.
# NOTE(review): the lambda reads the global `original_data`, not the frame
# passed to the pipeline, so transforming any other dataset would attach the
# set-aside columns of `original_data` — confirm this is intended.
addition = FunctionTransformer(
    func=lambda processed: add_columns_back(processed, original_data, columns_to_keep),
    validate=False,
)

In [22]:
# Assemble the preprocessing stages in the order they are applied.
# NOTE(review): the `dimensionality` transformer is defined earlier but is not
# one of the steps here — confirm that leaving it out is intentional.
pipeline = Pipeline([
    ('binarization', binarization),        # binarize variables
    ('removal', removal),                  # set aside `columns_to_keep`
    ('handle_outliers', handle_outliers),  # outlier treatment
    ('imputation', imputation),            # missing-data imputation
    ('normalization', normalization),      # variable transformation
    ('addition', addition),                # re-attach the set-aside columns
])

In [26]:
# Run every pipeline stage on the loaded dataset and keep the result for modelling.
transformed_data = pipeline.fit_transform(original_data)
# Bare last expression so the notebook renders the resulting frame.
transformed_data

Debug stage 1
     acousticness  danceability  duration  energy  instrumentalness  key  \
0               1         0.514    100125   0.521                 1    8   
1               1         0.714    207019   0.614                 0    4   
2               1         0.630    216200   0.455                 0    5   
3               1         0.810    136413   0.221                 1    5   
4               1         0.465    181440   0.459                 0    6   
..            ...           ...       ...     ...               ...  ...   
745             0         0.374    333827   0.943                 0    6   
746             0         0.487    213000   0.867                 1   10   
747             1         0.605    125867   0.314                 0    0   
748             1         0.700    249493   0.823                 0    3   
749             0         0.477    276720   0.776                 0    1   

     liveness  loudness  mode  speechiness    tempo  time_signature  vale

Unnamed: 0,danceability,duration,energy,liveness,loudness,speechiness,tempo,valence,label,acousticness,instrumentalness,key,mode,time_signature
0,-0.533664,-2.682773,-0.397190,-0.441309,-1.687074,-0.428796,0.062657,-1.533119,0.811968,1,1,8,0,4
1,0.664566,-0.182082,-0.023569,1.160620,0.137475,-1.075377,0.210820,0.387298,0.811968,1,0,4,1,4
2,0.143027,0.036650,-0.645119,-0.203368,-0.508985,-1.237129,0.768346,-1.269981,0.811968,1,0,5,1,4
3,1.293054,-1.844918,-1.411903,0.512934,-0.914543,-0.498306,-0.306779,1.234688,0.811968,1,1,5,1,4
4,-0.804346,-0.788675,-0.630496,-1.271493,-0.207710,-0.894482,-1.067132,-0.863948,0.811968,1,0,6,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,-1.283168,-0.005367,1.532728,-0.233723,1.134141,0.030935,-0.226404,-0.641899,-1.231576,0,0,6,0,4
746,-0.683929,-0.039646,1.140103,-0.708464,1.495796,-0.017464,1.430271,-0.359265,-1.231576,0,1,10,0,4
747,-0.007079,-2.089811,-1.127818,1.629320,-0.066691,-1.029080,0.707596,1.378464,0.811968,1,0,0,1,4
748,0.575804,0.833875,0.921989,-0.280025,0.150510,-0.776551,0.907656,1.031632,0.811968,1,0,3,1,4


In [6]:
# Train and evaluate a random-forest classifier on the preprocessed features.
# random_state is fixed so the fitted forest and the report are reproducible.
model = RandomForestClassifier(max_depth=5, n_estimators=25, random_state=42)

# Features: everything the pipeline produced except the target column.
x = transformed_data.drop(columns=['label'])

# Target: the pipeline also transforms the 'label' column (the displayed
# output shows it as 0.811968 / -1.231576), and a classifier rejects such
# continuous targets. Use the class labels from the input frame instead;
# rows are paired by position, exactly as add_columns_back pairs them.
y = original_data['label'].reset_index(drop=True)
assert len(x) == len(y), "pipeline output and original labels must have the same number of rows"

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Keep the metrics as a dict for programmatic access, and print the text table.
report = classification_report(y_test, y_pred, output_dict=True)
accuracy = report['accuracy']
print(classification_report(y_test, y_pred))

# Persist the fitted model; it expects features preprocessed by `pipeline`.
joblib.dump(model, 'random_forest_model.pkl')

              precision    recall  f1-score   support

           0       0.71      0.78      0.74        54
           1       0.87      0.82      0.84        96

    accuracy                           0.81       150
   macro avg       0.79      0.80      0.79       150
weighted avg       0.81      0.81      0.81       150



['random_forest_model.pkl']