In [18]:
import os
import re
import numpy as np
import pandas as pd


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [19]:
# --- Data locations ---------------------------------------------------------
# Directory (relative to the notebook) where every CSV produced here is stored.
DATASETS_DIR = './data/'

# Location of the raw dataset. The default is the original author's local path;
# set the FIRE_DATA_URL environment variable to point somewhere else without
# editing the notebook.
URL = os.environ.get(
    'FIRE_DATA_URL',
    'C:/Users/rbernal/Documents/GitHub/Proyecto/FAE/data/data_fire.csv',
)

# File name used for the local copy of the raw dataset inside DATASETS_DIR.
RETRIEVED_DATA = 'data_fire.csv'


# --- Train / test split -----------------------------------------------------
SEED_SPLIT = 404
TRAIN_DATA_FILE = DATASETS_DIR + 'train.csv'
TEST_DATA_FILE  = DATASETS_DIR + 'test.csv'

# --- Columns ----------------------------------------------------------------
TARGET  = 'STATUS'
FEATURES = ['SIZE','FUEL','DISTANCE','DESIBEL','AIRFLOW','FREQUENCY']
CATEGORICAL_VARS = ['FUEL']
# Every feature that is not categorical; derived so the lists cannot drift apart.
NUMERICAL_VARS = [feature for feature in FEATURES if feature not in CATEGORICAL_VARS]

# --- Model ------------------------------------------------------------------
SEED_MODEL = 404

In [20]:
def retrieve_data(self=None):
    """Read the raw dataset and store a CSV copy in the datasets directory.

    The settings ``URL`` (where to read from), ``DATASETS_DIR`` (where to
    write) and ``RETRIEVED_DATA`` (output file name) are taken from ``self``
    when it defines them, and otherwise from the notebook-level constants of
    the same name.

    Parameters
    ----------
    self : object, optional
        Any object exposing some or all of the three settings as attributes.
        May be omitted, in which case only the notebook constants are used.

    Returns
    -------
    str
        A message naming the path the data was written to.

    Raises
    ------
    KeyError
        If a setting is found neither on ``self`` nor among the globals.
    """

    def _setting(name):
        # Resolved lazily so an object that supplies every setting works even
        # when the notebook-level constants have not been defined.
        if self is not None and hasattr(self, name):
            return getattr(self, name)
        return globals()[name]

    datasets_dir = _setting('DATASETS_DIR')
    target_path = os.path.join(datasets_dir, _setting('RETRIEVED_DATA'))

    # Loading data from specific path
    data = pd.read_csv(_setting('URL'))

    # Create directory if it does not exist
    if not os.path.exists(datasets_dir):
        os.makedirs(datasets_dir)
        print(f"Directory '{datasets_dir}' created successfully.")
    else:
        print(f"Directory '{datasets_dir}' already exists.")

    # Save data to CSV file
    data.to_csv(target_path, index=False)

    return f'Data stored in {target_path}'

In [22]:
# Load the locally stored copy of the raw dataset.
df = pd.read_csv(f'{DATASETS_DIR}{RETRIEVED_DATA}')

In [23]:
# Hold out 20% of the rows for testing. SEED_SPLIT (from the configuration
# cell) keeps the partition reproducible instead of repeating the literal here.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(TARGET, axis=1),
    df[TARGET],
    test_size=0.2,
    random_state=SEED_SPLIT,
)

# Persist the feature splits and the test labels.
# NOTE(review): y_train is not saved and y_test goes to the working directory
# rather than DATASETS_DIR -- confirm this is intended.
X_train.to_csv(TRAIN_DATA_FILE, index=False)
X_test.to_csv(TEST_DATA_FILE, index=False)
y_test.to_csv('y_test.csv', index=False)

___
## Creating convenient classes

### Transformations without persisting information

**Before**

```python
X_train = pd.concat([X_train, pd.get_dummies(X_train[CATEGORICAL_VARS], drop_first=True)], 1)
X_test  = pd.concat([X_test, pd.get_dummies(X_test[CATEGORICAL_VARS], drop_first=True)], 1)

X_train.drop(CATEGORICAL_VARS, 1, inplace=True)
X_test.drop(CATEGORICAL_VARS, 1, inplace=True)

# Validation step
set(X_train.columns).difference(set(X_test.columns))

for col in list(set(X_train.columns).difference(set(X_test.columns))):
    X_test[col] = 0
```

**Now**

In [None]:
class OneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode selected columns, always emitting the columns seen in ``fit``.

    Parameters
    ----------
    variables : list or scalar, optional
        Column name(s) to encode. A non-list value is wrapped in a
        single-element list.

    Attributes
    ----------
    dummies : pandas.Index
        Dummy column names learned in ``fit`` (first level of each variable
        dropped). Set by ``fit``.
    """

    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X, y=None):
        """Learn the dummy column names from ``X``; ``y`` is ignored.

        Returns
        -------
        self
        """
        self.dummies = pd.get_dummies(X[self.variables], drop_first=True).columns
        return self

    def transform(self, X):
        """Return a new frame with ``variables`` replaced by the fitted dummy columns.

        The input frame is not modified. Dummy columns learned in ``fit`` that
        do not occur in ``X`` are added and filled with 0; dummy columns that
        were not learned in ``fit`` are left out.
        """
        # Encode every level present in X. Using drop_first here would drop
        # whichever level happens to sort first in *this* frame, which is not
        # necessarily the level dropped during fit.
        encoded = pd.get_dummies(X[self.variables])

        # Keep exactly the fitted dummy columns, adding missing ones as 0.
        encoded = encoded.reindex(columns=self.dummies, fill_value=0)

        # drop() returns a new frame, so the result must be used: the raw
        # categorical columns are removed before the dummies are attached.
        return pd.concat([X.drop(columns=self.variables), encoded], axis=1)


# Learn the dummy columns on the training split only, then encode both splits.
dummy_vars = OneHotEncoder(variables=CATEGORICAL_VARS)
X_train = dummy_vars.fit(X_train).transform(X_train)
X_test = dummy_vars.transform(X_test)

**Aligning columns of X_train and X_test**

In [None]:
class OrderingFeatures(BaseEstimator, TransformerMixin):
    """Select the columns seen in ``fit``, in the order they were seen.

    Attributes
    ----------
    ordered_features : pandas.Index
        Column labels of the frame passed to ``fit``. Set by ``fit``.
    """

    def __init__(self):
        # This transformer takes no parameters.
        pass

    def fit(self, X, y=None):
        """Remember the column labels of ``X``; ``y`` is ignored. Returns ``self``."""
        self.ordered_features = X.columns
        return self

    def transform(self, X):
        """Return ``X`` indexed by the remembered column labels."""
        reference_columns = self.ordered_features
        return X[reference_columns]


# Take the training split's column order as the reference for both splits.
sort_feats = OrderingFeatures()
X_train = sort_feats.fit(X_train).transform(X_train)
X_test = sort_feats.transform(X_test)