# Model Prediction: Random Forest using clusters for purchases and hospitals

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

In [2]:
# Read the file into a pandas DataFrame
df = pd.read_csv('consumo_material_clusters.csv')  
df.head(5)

Unnamed: 0,CODIGO,NUMERO,REFERENCIA,CANTIDADCOMPRA,UNIDADESCONSUMOCONTENIDAS,PRECIO,IMPORTELINEA,PRODUCTO,DAY,MONTH,...,MONTH_NAME,TIPOCOMPRA_Compra menor,TIPOCOMPRA_Concurso,TGL_ALMACENABLE,TGL_TRANSITO,REGION,HOSPITAL,DEPARTMENT,PUR_CLUSTER,HOSP_CLUSTER
0,37,1595724/23,16,60,10,62.59,375.54,APOSITO DE FIBRAS DE POLIACRILATO C/PLATA-3,1,1,...,1-January,1,0,0,1,1,2,60,3,0
1,1,72714/16,38,40,10,102.803729,411.214916,SOLUCION P/ LIMPIEZA Y DESCONTAMINACION DE HER...,1,2,...,2-February,1,0,1,0,0,10,1,1,1
2,5,71961/16,40,20,5,12.1,48.4,APOSITO DE HIDROFIBRA / CINTA-18,1,2,...,2-February,1,0,0,1,0,4,111,3,0
3,19,72773/16,18,100,50,215.325,430.65,APOSITO DE ESPUMA POLIURETANO / SACRO-11,1,2,...,2-February,0,1,1,0,0,10,1,2,1
4,14,86159/17,19,300,300,792.0,792.0,APOSITO C/ CARBON Y PLATA-6,1,2,...,2-February,0,1,1,0,0,6,1,2,1


In [3]:
# Assuming df is your DataFrame
# Combine 'DAY', 'MONTH', and 'YEAR' columns into a new 'date' column
df['DATE'] = df['DAY'].astype(str) + '-' + df['MONTH'].astype(str) + '-' + df['YEAR'].astype(str)

In [4]:
df.head(5)

Unnamed: 0,CODIGO,NUMERO,REFERENCIA,CANTIDADCOMPRA,UNIDADESCONSUMOCONTENIDAS,PRECIO,IMPORTELINEA,PRODUCTO,DAY,MONTH,...,TIPOCOMPRA_Compra menor,TIPOCOMPRA_Concurso,TGL_ALMACENABLE,TGL_TRANSITO,REGION,HOSPITAL,DEPARTMENT,PUR_CLUSTER,HOSP_CLUSTER,DATE
0,37,1595724/23,16,60,10,62.59,375.54,APOSITO DE FIBRAS DE POLIACRILATO C/PLATA-3,1,1,...,1,0,0,1,1,2,60,3,0,1-1-23
1,1,72714/16,38,40,10,102.803729,411.214916,SOLUCION P/ LIMPIEZA Y DESCONTAMINACION DE HER...,1,2,...,1,0,1,0,0,10,1,1,1,1-2-16
2,5,71961/16,40,20,5,12.1,48.4,APOSITO DE HIDROFIBRA / CINTA-18,1,2,...,1,0,0,1,0,4,111,3,0,1-2-16
3,19,72773/16,18,100,50,215.325,430.65,APOSITO DE ESPUMA POLIURETANO / SACRO-11,1,2,...,0,1,1,0,0,10,1,2,1,1-2-16
4,14,86159/17,19,300,300,792.0,792.0,APOSITO C/ CARBON Y PLATA-6,1,2,...,0,1,1,0,0,6,1,2,1,1-2-17


In [5]:
# Define the list of clusters
pur_clusters = df['PUR_CLUSTER'].unique()
hosp_clusters = df['HOSP_CLUSTER'].unique()

In [6]:
# Initialize an empty list to store predictions
predictions_list = []

# Open a text file for writing
with open('results.txt', 'w') as file:
    for pur_cluster in pur_clusters:
        for hosp_cluster in hosp_clusters:
            subset_df = df[(df['PUR_CLUSTER'] == pur_cluster) & (df['HOSP_CLUSTER'] == hosp_cluster)]

            # Check if there are enough samples to split into training and testing sets
            if len(subset_df) < 2:
                print(f"\nClusters: PUR_CLUSTER={pur_cluster}, HOSP_CLUSTER={hosp_cluster}", file=file)
                print("Insufficient data for splitting into training and testing sets.", file=file)
                continue

            X = subset_df[['CANTIDADCOMPRA', 'UNIDADESCONSUMOCONTENIDAS', 'DAY', 'MONTH', 'YEAR', 'TIPOCOMPRA_Compra menor',
                           'TIPOCOMPRA_Concurso', 'TGL_ALMACENABLE', 'TGL_TRANSITO', 'REGION', 'DEPARTMENT', 'PUR_CLUSTER', 'HOSP_CLUSTER']]
            y = subset_df['PRECIO']

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
            rf_regressor.fit(X_train, y_train)
            y_pred = rf_regressor.predict(X_test)

            mse = mean_squared_error(y_test, y_pred)
            r_squared = r2_score(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            max_price = subset_df['PRECIO'].max()
            min_price = subset_df['PRECIO'].min()

            # Append results to the predictions list
            predictions_list.append({
                'PUR_CLUSTER': pur_cluster,
                'HOSP_CLUSTER': hosp_cluster,
                'MSE': mse,
                'R_squared': r_squared,
                'MAE': mae,
                'Max_Price': max_price,
                'Min_Price': min_price,
                'y_test': y_test.tolist(),
            })

            # Write results to the text file
            print(f"\nClusters: PUR_CLUSTER={pur_cluster}, HOSP_CLUSTER={hosp_cluster}", file=file)
            print("Number of samples in training set:", len(X_train), file=file)
            print("Number of samples in test set:", len(X_test), file=file)
            print(f'Mean Squared Error on test set: {mse}', file=file)
            print(f'R-squared on test set: {r_squared}', file=file)
            print(f'Mean Absolute Error on test set: {mae}', file=file)
            print('Max Price: ' + str(max_price), file=file)
            print('Min Price: ' + str(min_price), file=file)
            print('y_test:', y_test.tolist(), file=file)

# Create a DataFrame from the predictions list
predictions_df = pd.DataFrame(predictions_list)

In [7]:
predictions_df.head(8)

Unnamed: 0,PUR_CLUSTER,HOSP_CLUSTER,MSE,R_squared,MAE,Max_Price,Min_Price,y_test
0,3,0,34280.281486,0.578793,51.655941,2458.5,9.88899,"[9.9, 12.1, 9.9, 324.148, 77.88, 62.7, 12.1, 1..."
1,1,1,2122.956779,0.887314,17.661955,792.0,0.989571,"[34.98, 5.24537, 20.9, 47.19, 17.27, 42.9, 1.2..."
2,2,0,0.37039,0.0,0.351373,62.595,9.88899,"[62.595, 62.595, 62.595]"
3,2,1,6121.741059,0.852036,26.869802,792.0,0.989571,"[354.754125, 539.0055, 27.280155, 5.3416, 5.65..."
4,0,1,3.043046,0.994852,0.378733,334.62,10.461204,"[60.69888, 48.4, 78.0032, 10.461204, 14.1, 10...."


In [8]:
# Save the predictions DataFrame to a CSV file
predictions_df.to_csv('predictions.csv', index=False)