In [28]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
import custom_library as cl
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

In [38]:
# Direct-download link (Google Drive) to the CSV file used by this notebook.
enlace_drive = "https://drive.google.com/uc?id=1c1l3OMUFjQWcydV0srKe95LLEfkBMKia"

# Load the CSV file into a pandas DataFrame.
original_data = pd.read_csv(enlace_drive, sep=",")

# Independent copy of the raw frame; `addition` reads the kept columns from it.
unmodified_data = original_data.copy()

# Columns dropped by the `removal` step and re-attached by the `addition` step.
columns_to_keep = [
    "label",
    "acousticness",
    "instrumentalness",
    "key",
    "mode",
    "time_signature",
]

In [42]:
# Each step wraps a `custom_library` function in a FunctionTransformer.
# The function and its keyword arguments are passed explicitly (func + kw_args)
# instead of through a lambda: lambdas cannot be pickled, which would prevent
# the transformers (and any Pipeline containing them) from being saved.

# Step 1: Binarize variables (delegates to cl.binarize_variables)
binarization = FunctionTransformer(cl.binarize_variables)

# Step 2: Drop the columns listed in `columns_to_keep` so the steps below do
# not process them; they are re-attached in step 6.
removal = FunctionTransformer(pd.DataFrame.drop, kw_args={'columns': columns_to_keep})

# Step 3: Handle outliers and missing data
handle_outliers = FunctionTransformer(
    cl.handle_outliers,
    kw_args={'method': 'Imputacion', 'imputation_method': 'MICE', 'winsorization_rate': None},
)
imputation = FunctionTransformer(
    cl.compute_missing_data,
    kw_args={'method': 'KNN', 'testing': False, 'percentage': 0.05},
)

# Step 4: Transformation of variables
normalization = FunctionTransformer(
    cl.transform_data,
    kw_args={'method': 'yeo-johnson', 'p_thres': 0.05},
)

# Step 5: Dimensionality reduction
# NOTE(review): `dimensionality` is defined here but is not listed in the
# pipeline steps further down -- confirm whether that omission is intended.
dimensionality = FunctionTransformer(
    cl.reduce_dimensionality,
    kw_args={
        'method': 'Projection',
        'corr_thres': 0.8,
        'var_thres': 0.01,
        'normality_thres': 0.05,
        'explained_var': 0.93,
        'do_ica': False,
    },
)

# Step 6: Re-attach the columns that were removed in step 2
def add_columns_back(modified_data, original_data, columns_to_keep):
    """Re-attach the kept columns to the processed data and drop the label.

    Parameters
    ----------
    modified_data : pandas.DataFrame or 2-D array-like
        Processed data. A DataFrame is re-selected by column name using the
        columns of ``original_data`` that are not in ``columns_to_keep``;
        unlabeled array data gets those names assigned positionally.
    original_data : pandas.DataFrame
        Frame that provides both the column order and the kept columns.
    columns_to_keep : list of str
        Names of the columns to copy over from ``original_data``.

    Returns
    -------
    pandas.DataFrame
        Processed columns followed by the kept columns, aligned by row
        position (both sides get a fresh 0..n-1 index), without a ``'label'``
        column.

    Raises
    ------
    KeyError
        If a name in ``columns_to_keep`` is missing from ``original_data``.
    """
    processed_columns = [col for col in original_data.columns if col not in columns_to_keep]
    processed = pd.DataFrame(modified_data, columns=processed_columns)

    # Reset both indexes so the concat pairs rows by position, not by label.
    kept = original_data[columns_to_keep].reset_index(drop=True)
    combined = pd.concat([processed.reset_index(drop=True), kept], axis=1)

    # The label is the prediction target, not a feature. errors='ignore' keeps
    # the function usable when 'label' was never among the columns.
    return combined.drop(columns='label', errors='ignore')
# The function and its arguments are passed via kw_args instead of a lambda so
# that the transformer stays picklable.
# NOTE(review): the kept columns are read from `unmodified_data`, the copy made
# before the pipeline ran. Any change the binarization step made to those
# columns is therefore not reflected in the output, and the same rows are
# re-attached no matter which data is passed to the pipeline -- confirm intent.
addition = FunctionTransformer(
    add_columns_back,
    kw_args={'original_data': unmodified_data, 'columns_to_keep': columns_to_keep},
    validate=False,
)

In [43]:
# Preprocessing pipeline; steps run in the order listed.
# `removal` takes the kept columns out before the outlier / imputation /
# normalization steps, and `addition` puts them back at the end.
pipeline = Pipeline([
    ("binarization", binarization),
    ("removal", removal),
    ("handle_outliers", handle_outliers),
    ("imputation", imputation),
    ("normalization", normalization),
    ("addition", addition),
])

In [44]:
# Run all pipeline steps on the loaded data and keep the resulting frame.
transformed_data = pipeline.fit_transform(original_data)
# Bare last expression: displays the resulting column index as the cell output.
transformed_data.columns

Debug stage 1
     acousticness  danceability  duration  energy  instrumentalness  key  \
0               1         0.514    100125   0.521                 1    8   
1               1         0.714    207019   0.614                 0    4   
2               1         0.630    216200   0.455                 0    5   
3               1         0.810    136413   0.221                 1    5   
4               1         0.465    181440   0.459                 0    6   
..            ...           ...       ...     ...               ...  ...   
745             0         0.374    333827   0.943                 0    6   
746             0         0.487    213000   0.867                 1   10   
747             1         0.605    125867   0.314                 0    0   
748             1         0.700    249493   0.823                 0    3   
749             0         0.477    276720   0.776                 0    1   

     liveness  loudness  mode  speechiness    tempo  time_signature  vale

Index(['danceability', 'duration', 'energy', 'liveness', 'loudness',
       'speechiness', 'tempo', 'valence', 'acousticness', 'instrumentalness',
       'key', 'mode', 'time_signature'],
      dtype='object')

In [6]:
# Features: the pipeline output. `add_columns_back` already removes 'label',
# so drop it only if it is present instead of failing with a KeyError.
x = transformed_data.drop(columns=['label'], errors='ignore')

# Target: read from the untouched copy of the raw data. The pipeline output
# carries a fresh 0..n-1 index, so the labels are aligned by row position.
y = unmodified_data['label'].reset_index(drop=True)
assert len(x) == len(y), "features and labels must have the same number of rows"

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fixed random_state so the fitted forest (and the report below) is reproducible.
model = RandomForestClassifier(max_depth=5, n_estimators=25, random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Dict form of the report gives programmatic access to the metrics.
report = classification_report(y_test, y_pred, output_dict=True)
accuracy = report['accuracy']
print(classification_report(y_test, y_pred))

# Persist the fitted model; the returned list of file names is the cell output.
joblib.dump(model, 'random_forest_model.pkl')

              precision    recall  f1-score   support

           0       0.71      0.78      0.74        54
           1       0.87      0.82      0.84        96

    accuracy                           0.81       150
   macro avg       0.79      0.80      0.79       150
weighted avg       0.81      0.81      0.81       150



['random_forest_model.pkl']