In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import joblib
import sklearn

# Train a class-weighted logistic regression on the bank-marketing CSV and
# persist every fitted artifact needed to score new rows later.

# Choose the OneHotEncoder dense-output keyword by asking the estimator which
# constructor parameters it accepts. `sparse_output` only exists from
# scikit-learn 1.2 onward (earlier releases call it `sparse`), and comparing
# version strings lexicographically is unreliable (e.g. '1.10' < '1.2').
sparse_param = 'sparse_output' if 'sparse_output' in OneHotEncoder().get_params() else 'sparse'

# Load the dataset (semicolon-delimited CSV)
file_path = 'bank-additional-full.csv'
df = pd.read_csv(file_path, delimiter=';')

# Drop 'duration' column
df = df.drop(columns=['duration'])

# One-hot encode every string-typed column except the target 'y'.
# drop='first' removes one dummy per feature; the encoder returns a dense array.
categorical_cols = df.select_dtypes(include=['object']).columns.difference(['y'])
encoder = OneHotEncoder(drop='first', **{sparse_param: False}, handle_unknown='ignore')
encoded_cats = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(
    encoded_cats,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,  # keep row labels aligned with df for the concat below
)

# Combine numeric and encoded data.
# NOTE: numeric_cols must be computed while 'y' is still a string column;
# once 'y' is converted to integers it would otherwise be selected as a feature.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
combined_df = pd.concat([df[numeric_cols], encoded_df], axis=1)

# Convert target variable to binary (1 for 'yes', 0 for anything else)
df['y'] = df['y'].eq('yes').astype('int64')

# Split data into features and target (70 % train / 30 % test, fixed seed)
X = combined_df
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Work on independent copies so the in-place column assignment below cannot
# trigger SettingWithCopyWarning or write to an ambiguous target.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numeric features: statistics are fitted on the training split only
# and then re-used unchanged on the test split.
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Train logistic regression model with class weight adjustment
log_reg = LogisticRegression(max_iter=10000, random_state=42, solver='lbfgs', class_weight='balanced')
log_reg.fit(X_train, y_train)

# Save the necessary files.
# The fitted encoder is persisted as well: without it, raw categorical values
# cannot be transformed into the dummy columns the model was trained on.
joblib.dump(log_reg, 'logistic_regression_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(encoder, 'encoder.pkl')
joblib.dump(X_train.columns.tolist(), 'feature_order.pkl')  # Save feature order
print("Model, scaler, and feature order saved successfully!")


Model, scaler, and feature order saved successfully!
