# RANDOM FOREST REGRESSION.

### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import f1_score, mean_squared_error, mean_absolute_error
from pandas import DataFrame
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

## Funciones Auxiliares

In [2]:
# Función para dividir el conjunto de datos
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    strat = df[stratify] if stratify else None
    train_set, test_set = train_test_split(
        df, test_size=0.4, random_state=rstate, shuffle=shuffle, stratify=strat)
    strat = test_set[stratify] if stratify else None
    val_set, test_set = train_test_split(
        test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)
    return train_set, val_set, test_set

In [3]:
# Función para separar características y etiquetas
def remove_labels(df, label_name):
    X = df.drop(label_name, axis=1)
    y = df[label_name].copy()
    return X, y

## Lectura del DataSet

In [4]:
df = pd.read_csv("AndroidAdware2017/TotalFeatures-ISCXFlowMeter.csv")

### Exploracion del DataSet.

In [5]:
df.head()

Unnamed: 0,duration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,min_fpktl,min_bpktl,max_fpktl,max_bpktl,mean_fpktl,...,mean_idle,max_idle,std_idle,FFNEPD,Init_Win_bytes_forward,Init_Win_bytes_backward,RRT_samples_clnt,Act_data_pkt_forward,min_seg_size_forward,calss
0,1020586,668,1641,35692,2276876,52,52,679,1390,53.431138,...,0.0,-1,0.0,2,4194240,1853440,1640,668,32,benign
1,80794,1,1,75,124,75,124,75,124,75.0,...,0.0,-1,0.0,2,0,0,0,1,0,benign
2,998,3,0,187,0,52,-1,83,-1,62.333333,...,0.0,-1,0.0,4,101888,-1,0,3,32,benign
3,189868,9,9,1448,6200,52,52,706,1390,160.888889,...,0.0,-1,0.0,2,4194240,2722560,8,9,32,benign
4,110577,4,6,528,1422,52,52,331,1005,132.0,...,0.0,-1,0.0,2,155136,31232,5,4,32,benign


In [6]:
df.describe()

Unnamed: 0,duration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,min_fpktl,min_bpktl,max_fpktl,max_bpktl,mean_fpktl,...,min_idle,mean_idle,max_idle,std_idle,FFNEPD,Init_Win_bytes_forward,Init_Win_bytes_backward,RRT_samples_clnt,Act_data_pkt_forward,min_seg_size_forward
count,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,...,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0
mean,21952450.0,6.728514,10.431934,954.0172,12060.42,141.475727,44.357688,263.675901,183.248084,174.959706,...,19973270.0,20312280.0,20752380.0,466387.5,2.360896,962079.6,310451.9,9.733144,6.72471,19.965713
std,190057800.0,174.161354,349.424019,82350.4,482471.6,157.68088,89.099554,289.644383,371.863224,162.024811,...,189798600.0,189790200.0,189972100.0,6199704.0,3.04181,1705655.0,664795.6,347.877923,174.13813,14.914261
min,-18.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0,...,-1.0,0.0,-1.0,0.0,2.0,-1.0,-1.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,69.0,0.0,52.0,-1.0,52.0,-1.0,52.0,...,-1.0,0.0,-1.0,0.0,2.0,0.0,-1.0,0.0,1.0,0.0
50%,24450.0,1.0,0.0,184.0,0.0,52.0,-1.0,83.0,-1.0,83.0,...,-1.0,0.0,-1.0,0.0,2.0,87616.0,-1.0,0.0,1.0,32.0
75%,1759751.0,3.0,1.0,427.0,167.0,108.0,52.0,421.0,115.0,356.0,...,1013498.0,1291379.0,1306116.0,0.0,2.0,304640.0,90496.0,1.0,3.0,32.0
max,44310760000.0,48255.0,74768.0,40496440.0,103922200.0,1390.0,1390.0,1500.0,1390.0,1390.0,...,44310720000.0,44300000000.0,44310720000.0,847000000.0,2269.0,4194240.0,4194240.0,74524.0,48255.0,44.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631955 entries, 0 to 631954
Data columns (total 80 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   duration                 631955 non-null  int64  
 1   total_fpackets           631955 non-null  int64  
 2   total_bpackets           631955 non-null  int64  
 3   total_fpktl              631955 non-null  int64  
 4   total_bpktl              631955 non-null  int64  
 5   min_fpktl                631955 non-null  int64  
 6   min_bpktl                631955 non-null  int64  
 7   max_fpktl                631955 non-null  int64  
 8   max_bpktl                631955 non-null  int64  
 9   mean_fpktl               631955 non-null  float64
 10  mean_bpktl               631955 non-null  float64
 11  std_fpktl                631955 non-null  float64
 12  std_bpktl                631955 non-null  float64
 13  total_fiat               631955 non-null  int64  
 14  tota

In [8]:
# Visualización y descripción del conjunto de datos
print("Longitud del DataSet:", len(df))
print("Número de características del DataSet:", len(df.columns))

Longitud del DataSet: 631955
Número de características del DataSet: 80


## Coversion de etiquetas de la variable objetivo

In [9]:
# Transformar la variable objetivo (etiquetas) a numérica
df["calss"] = df["calss"].factorize()[0]

## Division del DataSet.

In [10]:
train_set, val_set, test_set = train_val_test_split(df, stratify="calss")

In [11]:
# Separación de características y etiquetas para cada conjunto
X_train, y_train = remove_labels(train_set, "calss")
X_val, y_val = remove_labels(val_set, "calss")
X_test, y_test = remove_labels(test_set, "calss")

## Escalado de caracteristicas

In [12]:
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

## Conjunto Datos a DF.

In [13]:
X_train_scaled = DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_val_scaled = DataFrame(X_val_scaled, columns=X_val.columns, index=X_val.index)
X_test_scaled = DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

# Modelo Random Forest para clasificación sin escalado

In [14]:
clf_rnd = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf_rnd.fit(X_train, y_train)

# Modelo Random Forest para clasificación con escalado

In [15]:
clf_rnd_scaled = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf_rnd_scaled.fit(X_train_scaled, y_train)

# Evaluación de los modelos de clasificación

In [16]:
def evaluate_result(y_pred, y, y_prep_pred, y_prep, metric):
    print(metric.__name__, "SIN escalado:", metric(y_pred, y, average="weighted"))
    print(metric.__name__, "CON escalado:", metric(y_prep_pred, y_prep, average="weighted"))

# Predicciones y evaluación de clasificación

In [17]:
y_train_pred = clf_rnd.predict(X_train)
y_train_prep_pred = clf_rnd_scaled.predict(X_train_scaled)

In [18]:
from sklearn.metrics import f1_score, mean_absolute_error, mean_squared_error
# Evaluación de clasificación sin escalado y con escalado
evaluate_result(y_train_pred, y_train, y_train_prep_pred, y_train, f1_score)

f1_score SIN escalado: 0.9812305622683857
f1_score CON escalado: 0.9811846879189869


In [19]:
y_val_pred = clf_rnd.predict(X_val)
y_val_prep_pred = clf_rnd_scaled.predict(X_val_scaled)

In [20]:
# Evaluación para el conjunto de validación
evaluate_result(y_val_pred, y_val, y_val_prep_pred, y_val, f1_score)

f1_score SIN escalado: 0.9329344485207943
f1_score CON escalado: 0.9325035661187642


In [21]:
from sklearn.ensemble import RandomForestRegressor

# Modelo de regresión con Random Forest (sin escalado)
clf_rnd_reg = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
clf_rnd_reg.fit(X_train, y_train)

In [22]:
# Modelo de regresión con Random Forest (con escalado)
clf_rnd_reg_scaled = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
clf_rnd_reg_scaled.fit(X_train_scaled, y_train)


In [23]:
# Predicciones de regresión para el conjunto de entrenamiento
y_train_reg_pred = clf_rnd_reg.predict(X_train)
y_train_scaled_reg_pred = clf_rnd_reg_scaled.predict(X_train_scaled)

In [24]:
# Predicciones de regresión para el conjunto de validación
y_val_reg_pred = clf_rnd_reg.predict(X_val)
y_val_scaled_reg_pred = clf_rnd_reg_scaled.predict(X_val_scaled)

In [25]:
# Cálculo de métricas de regresión (RMSE y MAE)
train_rmse = mean_squared_error(y_train, y_train_reg_pred, squared=False)
val_rmse = mean_squared_error(y_val, y_val_reg_pred, squared=False)

train_mae = mean_absolute_error(y_train, y_train_reg_pred)
val_mae = mean_absolute_error(y_val, y_val_reg_pred)

train_scaled_rmse = mean_squared_error(y_train, y_train_scaled_reg_pred, squared=False)
val_scaled_rmse = mean_squared_error(y_val, y_val_scaled_reg_pred, squared=False)

train_scaled_mae = mean_absolute_error(y_train, y_train_scaled_reg_pred)
val_scaled_mae = mean_absolute_error(y_val, y_val_scaled_reg_pred)

## Validaciones

In [26]:
print("\nSin Escalado:")
print("RMSE (Train Set):", train_rmse)
print("RMSE (Validation Set):", val_rmse)
print("MAE (Train Set):", train_mae)
print("MAE (Validation Set):", val_mae)

print("\nCon Escalado:")
print("RMSE (Train Set):", train_scaled_rmse)
print("RMSE (Validation Set):", val_scaled_rmse)
print("MAE (Train Set):", train_scaled_mae)
print("MAE (Validation Set):", val_scaled_mae)


Sin Escalado:
RMSE (Train Set): 0.14133010539665553
RMSE (Validation Set): 0.2369317508275916
MAE (Train Set): 0.04830799212691874
MAE (Validation Set): 0.08776182722366305

Con Escalado:
RMSE (Train Set): 0.14130703896691513
RMSE (Validation Set): 0.2369438902524357
MAE (Train Set): 0.04829978426252633
MAE (Validation Set): 0.08772944379396007


In [27]:
# Predicciones en el conjunto de prueba (test set)
y_test_pred = clf_rnd.predict(X_test)
y_test_prep_pred = clf_rnd_scaled.predict(X_test_scaled)

In [28]:
# Evaluación final en el conjunto de prueba para clasificación
evaluate_result(y_test_pred, y_test, y_test_prep_pred, y_test, f1_score)

f1_score SIN escalado: 0.9348159654880637
f1_score CON escalado: 0.9344953918990812


In [29]:
# Predicciones de regresión para el conjunto de prueba
y_test_reg_pred = clf_rnd_reg.predict(X_test)
y_test_scaled_reg_pred = clf_rnd_reg_scaled.predict(X_test_scaled)

In [30]:
# Cálculo de métricas de regresión para el conjunto de prueba
test_rmse = mean_squared_error(y_test, y_test_reg_pred, squared=False)
test_scaled_rmse = mean_squared_error(y_test, y_test_scaled_reg_pred, squared=False)

In [31]:
test_mae = mean_absolute_error(y_test, y_test_reg_pred)
test_scaled_mae = mean_absolute_error(y_test, y_test_scaled_reg_pred)

## RESULTADOS

In [32]:
print("\nResultados de Regresión en el Test Set:")
print("RMSE (Test Set):", test_rmse)
print("RMSE (Test Set) con Escalado:", test_scaled_rmse)
print("MAE (Test Set):", test_mae)
print("MAE (Test Set) con Escalado:", test_scaled_mae)


Resultados de Regresión en el Test Set:
RMSE (Test Set): 0.23305325274113609
RMSE (Test Set) con Escalado: 0.2331455193577875
MAE (Test Set): 0.08564740459238591
MAE (Test Set) con Escalado: 0.08569339266087475
