In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Location of the cleaned dataset on disk.
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
file_path = r"/Users/ramonmora/Desktop/TKH/Supervised-Learning-Project/clean_data.csv"

# Load the CSV into a DataFrame. A comma is pandas' default delimiter, and the
# column names are taken from the file's header row.
df = pd.read_csv(file_path)

In [3]:
# Draw a reproducible working sample from the full dataset.
# - A fixed random_state makes "Restart & Run All" select the same rows every time,
#   so the CSV written later is stable between runs.
# - n is capped at len(df) because DataFrame.sample raises ValueError when asked
#   for more rows than exist (sampling is without replacement by default).
SAMPLE_SIZE = 500000
RANDOM_STATE = 42
random_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_STATE)

In [4]:
# Preprocess the sampled rows: standardize the numeric columns, one-hot encode the
# categorical columns, and write the result (plus the 'isFraud' target) to a CSV file.

# Defining numerical and categorical features
numeric_features = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
categorical_features = ['type']  # 'type' is a categorical feature in the dataset

# Split the sample into the feature frame and the target column.
X_sample = random_sample.drop('isFraud', axis=1)
y_sample = random_sample['isFraud']

# Cast the categorical columns of the *sample* to str so the encoder sees one dtype.
# The cast previously targeted `df` after `random_sample` had already been drawn, so it
# never reached the data being encoded; doing it here also leaves `df` unmodified.
X_sample = X_sample.astype({column: str for column in categorical_features})

# Defining the preprocessing pipeline.
# sparse_threshold=0 makes ColumnTransformer always return a dense array; with the
# default threshold the stacked output can be a SciPy sparse matrix, which cannot be
# wrapped in a labelled DataFrame below.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features),
    ],
    sparse_threshold=0,
)

# Fit on the sample and transform it in one pass.
processed_X_sample = preprocessor.fit_transform(X_sample)

# Build the output column names: "<name>_scaled" for each numeric feature, followed by
# the encoder-generated names for the one-hot columns (same order as the transformers).
numeric_features_processed = [f"{feature}_scaled" for feature in numeric_features]
categorical_features_encoded = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
all_features = numeric_features_processed + list(categorical_features_encoded)

# Guard against a silent mismatch between the transformed matrix and its labels.
assert processed_X_sample.shape[1] == len(all_features), "column names do not match transformed output"

# Converting the processed sample back to DataFrame and adding the target column.
# The new frame has a fresh 0..n-1 index, so the target is attached positionally
# (as a plain array) rather than aligned on the sampled rows' original index.
processed_sample_df = pd.DataFrame(processed_X_sample, columns=all_features)
processed_sample_df['isFraud'] = y_sample.to_numpy()

# Saving the data to a CSV file
processed_sample_df.to_csv('process_data_sample.csv', index=False)