# In [2]:
import os
import warnings
import pandas as pd 
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")

def load_data(file_name):
    """Read a comma-separated file from the data directory into a DataFrame.

    The directory is derived from the current working directory by replacing
    every occurrence of the substring 'processing' in that path with 'data'.
    If the path contains no such substring, the working directory itself is
    used.

    Args:
        file_name: Name of the CSV file inside the derived directory.

    Returns:
        The parsed contents as a pandas DataFrame.
    """
    # NOTE: str.replace rewrites *all* matches, including ones in parent
    # directory names, not just the final path component.
    data_dir = os.getcwd().replace('processing', 'data')
    csv_path = os.path.join(data_dir, file_name)
    return pd.read_csv(csv_path, sep=",")

# Load the comma-delimited dataset from the data directory
data = load_data('lock_data.csv')

# Show the first few rows. A bare `data.head()` in the middle of a cell or
# script displays nothing, so print it explicitly.
print(data.head())

# Split the dataset into features (X) and target (y)
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Standardize the features.
# NOTE(review): the scaler is fit on the full dataset. If a train/test split
# is done downstream, fitting here leaks test statistics into training --
# confirm this is intended.
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Rebuild a labelled frame: feature columns keep their own names and the
# original row index, and 'Outcome' is appended last. Labelling by position
# with data.columns would mislabel every column whenever 'Outcome' is not the
# last column of the input file.
data_standardized = pd.concat(
    [pd.DataFrame(X_standardized, columns=X.columns, index=X.index), y],
    axis=1,
)

# Save the standardized dataset to a CSV file
data_standardized.to_csv('lock_data_standardized.csv', index=False)

# Apply SMOTE to oversample the under-represented class(es). With the default
# sampling strategy every class except the majority is resampled.
# NOTE(review): resampling is applied to the whole dataset; if a held-out test
# set is carved out of this file later, it will contain synthetic rows --
# confirm this is intended.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_standardized, y)

# Show the class distribution after SMOTE. A bare `value_counts()` in the
# middle of a cell or script displays nothing, so print it explicitly.
print(y_resampled.value_counts())

# Assemble features followed by the target, then label the columns in that
# same order. Reusing data.columns positionally would mislabel every column
# whenever 'Outcome' is not the last column of the input file.
resampled_df = pd.concat([pd.DataFrame(X_resampled), y_resampled], axis=1)
resampled_df.columns = [*X.columns, 'Outcome']

# Save the balanced dataset to a CSV file
resampled_df.to_csv('lock_data_standardized_smote.csv', index=False)