# In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Define file paths
# NOTE(review): these are absolute, machine-specific locations; adjust them
# before running this script anywhere else.
file_path = r"C:\Users\samia\OneDrive\Documents\GitHub\springboard\Project Proposal\bank+marketing\bank-additional\bank-additional\bank-additional-full.csv"
file_path1 = r"C:\Users\samia\OneDrive\Documents\GitHub\springboard\Project Proposal\bank+marketing\bank-additional\bank-additional\bank-additional.csv"

# Check if files exist (each check is evaluated once and the result reused)
full_exists = os.path.exists(file_path)
additional_exists = os.path.exists(file_path1)
print("Does the full dataset file exist?", full_exists)
print("Does the additional dataset file exist?", additional_exists)

# Load the primary dataset. The preprocessing steps that follow read
# `bank_additional_full` unconditionally, so a missing file is reported here
# with a clear error instead of surfacing later as a NameError.
if not full_exists:
    raise FileNotFoundError(f"File not found: {file_path}")

# sep=None asks pandas to infer the delimiter (this requires the python
# engine). The UCI bank-additional CSVs are presumably ';'-delimited, which
# the default sep=',' would read as one single text column; inferring the
# delimiter keeps comma-separated copies working as well.
bank_additional_full = pd.read_csv(file_path, sep=None, engine="python")

# The smaller dataset is optional: it is not used by the steps visible in
# this script, so its absence is only reported.
if additional_exists:
    bank_full = pd.read_csv(file_path1, sep=None, engine="python")
else:
    print(f"File not found: {file_path1}")

# Check the data from the first dataset
print(bank_additional_full.head())

# The label column must pass through preprocessing untouched: the split step
# further down selects it by the exact name 'y'. Dummy-encoding it would
# replace it with indicator columns named 'y_<level>' (so 'y' no longer
# exists), and standardizing it would distort the class labels.
target_column = 'y'

# Step 1: Identify categorical variables (features only, target excluded)
categorical_columns = bank_additional_full.select_dtypes(include=['object']).columns
categorical_columns = categorical_columns[categorical_columns != target_column]

# Step 2: Create dummy/indicator features for categorical variables
# drop_first=True omits the first level of each variable, since it is implied
# by the remaining indicator columns.
bank_additional_full = pd.get_dummies(bank_additional_full, columns=categorical_columns, drop_first=True)

# Step 3: Identify numeric columns (features only, target excluded)
numeric_columns = bank_additional_full.select_dtypes(include=['int64', 'float64']).columns
numeric_columns = numeric_columns[numeric_columns != target_column]

# Check if there are numeric columns before applying scaling
print(f"Numeric columns: {numeric_columns}")

scaler = StandardScaler()

# NOTE(review): the imputation means and the scaler statistics below are
# computed over every row, including rows that the later train/test split
# holds out for testing. For a leak-free evaluation, fit both on the training
# rows only and apply them to the test rows.
if len(numeric_columns) > 0:
    # Step 4: Handle missing values (if any) before scaling by filling each
    # numeric column with its own mean
    numeric_means = bank_additional_full[numeric_columns].mean()
    bank_additional_full[numeric_columns] = bank_additional_full[numeric_columns].fillna(numeric_means)

    # Step 5: Standardize numeric features (zero mean, unit variance)
    bank_additional_full[numeric_columns] = scaler.fit_transform(bank_additional_full[numeric_columns])
else:
    print("No numeric columns found to scale.")

# Step 6: Separate the target from the features and hold out a test set.
# 'y' is assumed to be the target column; every other column is a feature.
y = bank_additional_full['y']
X = bank_additional_full.drop(columns='y')

# 80% of the rows go to training and 20% to testing; the fixed random_state
# makes the shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each resulting piece, one per line
print(
    f"Training features shape: {X_train.shape}",
    f"Testing features shape: {X_test.shape}",
    f"Training labels shape: {y_train.shape}",
    f"Testing labels shape: {y_test.shape}",
    sep="\n",
)
