In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset (read_csv is left on its default comma delimiter)
df = pd.read_csv('banking.csv')

# Display initial dataset information.
# DataFrame.info() writes its report itself and returns None, so it is not
# wrapped in print() (doing so only appended a stray "None" line).
df.info()
print(df.describe())

# Feature columns, grouped by how they are preprocessed
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
num_cols = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

# Target: rows without a label are dropped instead of being imputed, because
# filling in the most frequent class would fabricate training labels.
labelled = df.dropna(subset=['y'])
y = labelled['y'].map({'no': 0, 'yes': 1})  # Convert target to binary

# Series.map leaves NaN for values missing from the mapping; fail here with a
# clear message rather than later inside the estimator.
unmapped = y.isna()
if unmapped.any():
    bad_values = sorted(labelled.loc[unmapped, 'y'].astype(str).unique())
    raise ValueError(f"Unexpected values in target column 'y': {bad_values}")
y = y.astype(int)

# Split BEFORE fitting any preprocessing so that imputation values, encoder
# categories and scaling statistics are learned from the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    labelled[num_cols + categorical_cols], y, test_size=0.2, random_state=42
)

# Handling missing values + encoding categorical variables.
# Numeric columns are imputed on their own so they keep a numeric dtype;
# categorical columns are imputed and then one-hot encoded.
# handle_unknown='ignore' encodes a category that appears only in the test
# split as all zeros instead of raising at transform time.
column_prep = ColumnTransformer([
    ('num', SimpleImputer(strategy='most_frequent'), num_cols),
    ('cat', Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('encode', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
    ]), categorical_cols),
])

# Standardizing features: the scaler runs on the full processed matrix
# (numeric columns followed by the one-hot columns).
preprocessor = Pipeline([
    ('columns', column_prep),
    ('scale', StandardScaler()),
])
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# Train Support Vector Machine (SVM) model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_scaled, y_train)
svm_pred = svm_model.predict(X_test_scaled)

print("SVM Model Performance:")
print(classification_report(y_test, svm_pred))

# Train Logistic Regression (LR) model
lr_model = LogisticRegression()
lr_model.fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)

print("Logistic Regression Model Performance:")
print(classification_report(y_test, lr_pred))

# Discussion
print("Findings:")
print(f"SVM accuracy: {accuracy_score(y_test, svm_pred):.4f}")
print(f"Logistic Regression accuracy: {accuracy_score(y_test, lr_pred):.4f}")
print("Compare accuracy and classification reports of both models to discuss effectiveness.")

FileNotFoundError: [Errno 2] No such file or directory: 'banking.csv'