# Libraries

In [38]:
from pathlib import Path

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load Data

In [39]:
# Read the training and test sets, in that order, from the shared data folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

# Split features and target

In [41]:
# 'fraude' is the label column; every other column of `train` is kept as a feature.
y_train = train['fraude']               # target variable
X_train = train.drop(columns='fraude')  # features

In [42]:
# Same feature/target separation for the test set: pull out the 'fraude'
# label and keep the remaining columns as features.
y_test = test['fraude']               # target variable
X_test = test.drop(columns='fraude')  # features

# Feature Engineering

In [44]:
# Column groups consumed by the preprocessing step: the categorical group is
# one-hot encoded and the numeric group is standardized.
categorical_features = ['tipo']
numeric_features = [
    'tempo',
    'valor',
    'saldo_inicial_c1',
    'novo_saldo_c1',
    'saldo_inicial_c2',
    'novo_saldo_c2',
]

In [45]:
# Column-wise preprocessing: standardize the numeric columns and one-hot
# encode the categorical ones. Output columns follow the order in which the
# transformers are listed here (numeric block first, then the one-hot block).
preprocessing_pipeline = ColumnTransformer(
    [
        ('numeric', StandardScaler(), numeric_features),
        # NOTE(review): OneHotEncoder defaults to handle_unknown='error', so a
        # category that was not seen during fit will raise at transform time
        # - confirm that is the intended behavior for the test set.
        ('categorical', OneHotEncoder(), categorical_features),
    ],
    # OneHotEncoder emits a sparse matrix, and ColumnTransformer returns a
    # sparse result whenever the combined density falls below its threshold
    # (0.3 by default). A threshold of 0 guarantees a dense ndarray, which is
    # what wrapping the result in a DataFrame with explicit column names needs.
    sparse_threshold=0,
)

In [46]:
# Fit the preprocessing pipeline on the training data only, so the scaling
# statistics and the one-hot categories are learned from train, not from test.
X_train_array = preprocessing_pipeline.fit_transform(X_train)

# Column names of the transformed matrix, built once and reused for both
# frames: the numeric columns keep their names, followed by the names the
# fitted one-hot encoder generates for the categorical columns.
processed_columns = numeric_features + list(
    preprocessing_pipeline.named_transformers_['categorical']
    .get_feature_names_out(categorical_features)
)

# Keep the source index so the processed rows stay label-aligned with y_train.
X_train_processed = pd.DataFrame(
    X_train_array,
    columns=processed_columns,
    index=X_train.index,
)

# Apply the already-fitted preprocessing to the testing data (transform only).
X_test_array = preprocessing_pipeline.transform(X_test)
X_test_processed = pd.DataFrame(
    X_test_array,
    columns=processed_columns,
    index=X_test.index,
)

# Save processed data

In [49]:
# Write the processed features and the targets as CSV files (without the
# index column) into the preprocessed-data folder.
output_dir = Path('../data/preprocessed_data')
# to_csv does not create missing directories, so make sure the folder exists
# before writing; this is a no-op when it is already there.
output_dir.mkdir(parents=True, exist_ok=True)

X_train_processed.to_csv(output_dir / 'X_train.csv', index=False)
X_test_processed.to_csv(output_dir / 'X_test.csv', index=False)
y_train.to_csv(output_dir / 'y_train.csv', index=False)
y_test.to_csv(output_dir / 'y_test.csv', index=False)