In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Exercise Classification

 - Import aps_failure_training_set.csv file and aps_failure_test_set.csv

 - Using training dataset, train a NN to predict the class. It indicates if there is failure in the trucks.

 - When using common accuracy metrics we know if our model is good, but is it good enough? We will define a new specific metric for this problem: **Total cost**. For each truck we say it fails, the company sends a mechanic to review the truck, which supposes a cost of 10. On the other hand, if we say there is not failure when they actually are, the truck breakdowns, which supposes a cost of 500. In summary, False positives cost 10, and False negatives cost 500.

 - Train several NN and keep the one with less total costs. Your goal is to achieve a Total cost lower than 1

 - The evaluation phase (Total cost calculation) must be done using the test dataset (aps_failure_test_set.csv)

 - Below some pieces of code that can help you complete the exercise, specially the last one, where the definition of the Total cost is

In [2]:
import plotly
import plotly.graph_objs as go
import pandas as pd

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler

In [3]:
data1 = pd.read_csv('/content/drive/MyDrive/Ironhack/25octubre/aps_failure_training_set.csv')
data1.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,,2130706000.0,280.0,0.0,0.0,0.0,0.0,0.0,...,1240520.0,493384.0,721044.0,469792.0,339156.0,157956.0,73224.0,0.0,0.0,0.0
1,neg,33058,,0.0,,0.0,0.0,0.0,0.0,0.0,...,421400.0,178064.0,293306.0,245416.0,133654.0,81140.0,97576.0,1500.0,0.0,0.0
2,neg,41040,,228.0,100.0,0.0,0.0,0.0,0.0,0.0,...,277378.0,159812.0,423992.0,409564.0,320746.0,158022.0,95128.0,514.0,0.0,0.0
3,neg,12,0.0,70.0,66.0,0.0,10.0,0.0,0.0,0.0,...,240.0,46.0,58.0,44.0,10.0,0.0,0.0,0.0,4.0,32.0
4,neg,60874,,1368.0,458.0,0.0,0.0,0.0,0.0,0.0,...,622012.0,229790.0,405298.0,347188.0,286954.0,311560.0,433954.0,1218.0,0.0,0.0


In [4]:
data2 = pd.read_csv('/content/drive/MyDrive/Ironhack/25octubre/aps_failure_test_set.csv')
data2.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,60,0.0,20.0,12.0,0.0,0.0,0.0,0.0,0.0,...,1098.0,138.0,412.0,654.0,78.0,88.0,0.0,0.0,0.0,0.0
1,neg,82,0.0,68.0,40.0,0.0,0.0,0.0,0.0,0.0,...,1068.0,276.0,1620.0,116.0,86.0,462.0,0.0,0.0,0.0,0.0
2,neg,66002,2.0,212.0,112.0,0.0,0.0,0.0,0.0,0.0,...,495076.0,380368.0,440134.0,269556.0,1315022.0,153680.0,516.0,0.0,0.0,0.0
3,neg,59816,,1010.0,936.0,0.0,0.0,0.0,0.0,0.0,...,540820.0,243270.0,483302.0,485332.0,431376.0,210074.0,281662.0,3232.0,0.0,0.0
4,neg,1814,,156.0,140.0,0.0,0.0,0.0,0.0,0.0,...,7646.0,4144.0,18466.0,49782.0,3176.0,482.0,76.0,0.0,0.0,0.0


In [5]:
# Código para pintar gráfico con el porcentaje de valores perdidos por variable
NULL_RATIO_TRHESHOLD = 0 # Set the null ratio threshold required


null_ratios = (data1.isnull().sum() / data1.shape[0])
null_ratios_over_threshold = null_ratios[null_ratios > NULL_RATIO_TRHESHOLD].sort_values(ascending=False)

data_go = [
    go.Bar(
        x=null_ratios_over_threshold.index,
        y=null_ratios_over_threshold
    )
]

fig = go.Figure(data=data_go, layout={
    "title": "Null Ratio for Features with Null Ratio Exceeding {}".format(NULL_RATIO_TRHESHOLD)
})

plotly.offline.iplot(fig)

In [6]:
null_ratios_over_threshold = null_ratios[null_ratios > NULL_RATIO_TRHESHOLD].sort_values(ascending=False)
nan_columns = list(null_ratios_over_threshold[null_ratios_over_threshold>0.1].index)

In [7]:
# eliminamos las variables con demaisados missing, y corregimos por la media los que solo tienen hasta un 10% de valores perdidos
# 1. Manejo de columnas no numéricas con valores faltantes
non_numeric_columns = data1.select_dtypes(exclude=['number']).columns
for column in non_numeric_columns:
    data1[column].fillna('desconocido', inplace=True)  # Reemplaza los valores faltantes con "desconocido" u otra etiqueta apropiada

# 2. Rellenar columnas numéricas con la media
numeric_columns = data1.select_dtypes(include=['number']).columns
for column in numeric_columns:
    data1[column].fillna(data1[column].mean(), inplace=True)

# 3. Eliminar columnas con alto porcentaje de valores faltantes
nan_percentage = (data1.isnull().sum() / len(data1)) * 100
columns_to_drop = nan_percentage[nan_percentage > 10].index
data1.drop(columns_to_drop, axis=1, inplace=True)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data1.iloc[:,0:-1],
                                                    data1.iloc[:,-1].astype(int), train_size = 0.8, random_state = 0)
X_train.head(3)

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_001,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000
5778,neg,60660,0.713189,266.0,230.0,0.0,0.0,0.0,0.0,0.0,...,374628.0,273744.0,145088.0,311782.0,371428.0,371870.0,381522.0,700624.0,9062.0,0.0
5287,neg,3412,0.713189,86.0,76.0,0.0,0.0,0.0,0.0,0.0,...,18240.0,9260.0,4462.0,11122.0,9132.0,11944.0,15364.0,70760.0,1712.0,0.0
57167,neg,29664,0.713189,0.0,190620.639314,0.0,0.0,0.0,0.0,0.0,...,599514.0,409912.0,222004.0,393608.0,182968.0,97552.0,45586.0,30968.0,268.0,0.0


In [9]:
# Para equilibrar la variable respuesta
ros = RandomOverSampler(random_state=42)
X_train, y_train= ros.fit_resample(X_train, y_train)

In [None]:
scaler = StandardScaler()
sc = scaler.fit(X_train)

In [None]:
train_sc = sc.transform(X_train)
X_train_sc = pd.DataFrame(train_sc)
X_train_sc.columns = X_train.columns

test_sc = sc.transform(X_test)
X_test_sc = pd.DataFrame(test_sc)
X_test_sc.columns = X_test.columns

In [None]:
print(X_train_sc.shape)
print(X_test_sc.shape)

In [None]:
# Escalamos los datos de validación con los parámetros de los del conjunto de entrenamiento
val_sc = sc.transform(X_val)
X_val_sc = pd.DataFrame(val_sc)
X_val_sc.columns = X_val.columns

In [None]:
X_val = data_val.iloc[:,0:-1]
y_val = data_val.iloc[:,-1]

In [None]:
# código para representar la matriz de confusión a partir de la predicción de la red entrenada
# En la última línea tenemos cómo calcular el Total Cost
predictions = MLP_Clas.predict(X_test_sc, verbose = 0).round(0)
conf_mat = confusion_matrix(y_test, predictions)
print(tabulate(conf_mat,headers = ['pred breackdown No','pred breackdown Yes'], showindex = ['real breackdown No','real breackdown Yes'],
               tablefmt = 'fancy_grid'))

print(classification_report(y_test, predictions))

print("Total cost: {}".format((conf_mat[1][0] * 500 + conf_mat[0][1] * 10) / X_test_sc.shape[0]))

## Exercise Regression

 - Import medical_score_train.csv and medical_score_test.csv

 - Using training dataset, train a NN for medical score prediction

 - Your goal is to achieve a MAE lower than 8