In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Hold out a test partition, then standardise both partitions
def split_scalar(indep_X, dep_Y):
    """Split features/target into train and test sets and standardise the features.

    Parameters
    ----------
    indep_X : feature matrix passed straight to ``train_test_split``.
    dep_Y : target values aligned with ``indep_X``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where both feature partitions
        have been transformed by a ``StandardScaler``; the targets are
        returned as produced by the split.
    """
    # Fixed 25% hold-out with a fixed seed so the partition is repeatable.
    parts = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    features_train, features_test, target_train, target_test = parts

    # The scaler learns its statistics from the training rows only and the
    # same fitted scaler is then applied to the test rows.
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(features_train)
    scaled_test = scaler.transform(features_test)

    return scaled_train, scaled_test, target_train, target_test

In [3]:
# Logistic Regression model and performance evaluation function
def logistic(X_train, y_train, X_test, y_test=None):
    """Fit a logistic-regression classifier and evaluate it on the test split.

    Parameters
    ----------
    X_train, y_train : training features and labels used to fit the model.
    X_test : features the fitted model predicts on.
    y_test : optional
        True labels for ``X_test``. When omitted, the function falls back to a
        module-level variable named ``y_test`` so that existing three-argument
        calls keep working. Prefer passing it explicitly: relying on the
        global makes the result depend on hidden notebook state.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` - the fitted
        model, the accuracy score, the text classification report, the test
        features and labels that were scored, and the confusion matrix.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no module-level ``y_test`` exists.
    """
    if y_test is None:
        # Legacy path: earlier versions read ``y_test`` from the notebook's
        # global namespace instead of taking it as an argument.
        try:
            y_test = globals()["y_test"]
        except KeyError:
            raise NameError(
                "y_test was not passed to logistic() and no global 'y_test' is defined"
            ) from None

    classifier = LogisticRegression(random_state=0)
    classifier.fit(X_train, y_train)

    # Predictions and metrics
    y_pred = classifier.predict(X_test)
    Accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    return classifier, Accuracy, report, X_test, y_test, cm

In [4]:
# Forward selection function using SequentialFeatureSelector
def forward_selection(indep_X, dep_Y, n, random_state=0):
    """Pick ``n`` features by greedy forward selection with logistic regression.

    Parameters
    ----------
    indep_X : pandas.DataFrame
        Candidate features; must expose ``.columns`` because the selected
        column labels are looked up on it.
    dep_Y : target values aligned with ``indep_X``.
    n : number of features to keep (``n_features_to_select``).
    random_state : int or None, default 0
        Seed for the ``saga`` solver, which is stochastic. Fixing it makes the
        selected feature set repeatable between runs; pass ``None`` to restore
        the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_new, selected_features)`` - the input reduced to the selected
        columns, and the labels of those columns.

    Notes
    -----
    Selection is cross-validated (5 folds, accuracy) on whatever rows are
    passed in. Pass only training rows if test rows must not influence which
    features are chosen.
    """
    # NOTE(review): scikit-learn documents that saga converges fast only on
    # similarly scaled features - consider scaling the input; confirm.
    logistic_model = LogisticRegression(
        solver='saga', max_iter=1000, random_state=random_state
    )

    fs = SequentialFeatureSelector(
        estimator=logistic_model,
        n_features_to_select=n,
        direction='forward',
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )

    fs.fit(indep_X, dep_Y)
    X_new = fs.transform(indep_X)

    # Boolean support mask -> labels of the retained columns.
    selected_features = indep_X.columns[fs.get_support()]

    return X_new, selected_features

In [5]:
# Load the prepared CSV, one-hot encode it, then separate the target column
# from the remaining feature columns.
dataset1 = pd.read_csv("prep.csv")
df2 = pd.get_dummies(dataset1, drop_first=True)
dep_Y = df2['classification_yes']
indep_X = df2.drop(columns='classification_yes')

In [6]:
# Run forward selection and show which columns were kept.
# NOTE(review): selection runs on the full dataset, before the train/test
# split performed in a later cell, so test rows influence the chosen features.
n_features = 5  # how many features to retain (example value)
X_new, selected_features = forward_selection(indep_X=indep_X, dep_Y=dep_Y, n=n_features)
selected_features

Index(['al', 'hrmo', 'rc', 'sg_c', 'sg_d'], dtype='object')

In [7]:
# Partition the selected features into train/test sets and standardise them
X_train, X_test, y_train, y_test = split_scalar(indep_X=X_new, dep_Y=dep_Y)

In [8]:
# Fit the logistic-regression model and collect its evaluation outputs
# (fitted model, accuracy, text report, scored test data, confusion matrix).
classifier, Accuracy, report, X_test, y_test, cm = logistic(
    X_train=X_train, y_train=y_train, X_test=X_test
)

In [9]:
# Wrap the accuracy in a one-element list
# (presumably so scores from further models can be appended - confirm).
acclog = [Accuracy]

In [13]:
# Display the collected accuracy values as the cell output
acclog

[0.97]