In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load the dataset
file_path = "Preliminary college year.xlsx"
data = pd.read_excel(file_path)

# Name of the target column, kept in one place so the three uses below
# cannot drift apart.
target_column = 'Retained F17-F18? (1=yes, 0=no)'

# Remove rows with missing target variable (they cannot be used for
# supervised training or for scoring).
data.dropna(subset=[target_column], inplace=True)

# Drop columns that are excluded from the feature set
columns_to_drop = ['Federal Ethnic Group', 'Gender', 'Reason for not Completing Connect', 'Reason not Retained']
data.drop(columns=columns_to_drop, inplace=True)

# Define features (X) and target variable (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Perform one-hot encoding for categorical (object-dtype) variables.
# This is done before the split so that the training and testing sets
# share exactly the same dummy columns.
categorical_columns = X.select_dtypes(include=['object']).columns
X_encoded = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Split the data into training and testing sets BEFORE fitting any
# preprocessing step. Fitting the imputer/scaler on the full dataset would
# leak statistics (column means and variances) of the test rows into
# training and make the reported accuracy optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Impute missing values: learn the column means from the training rows only,
# then reuse those means for the testing rows.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_raw)
X_test_imputed = imputer.transform(X_test_raw)

# Scale the features: learn mean/std from the training rows only, then apply
# the same transformation to the testing rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_imputed)
X_test = scaler.transform(X_test_imputed)

# Define hyperparameters for logistic regression
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],  # Use a solver compatible with 'l1' and 'l2'
}

# Train logistic regression model with hyperparameter tuning
# (5-fold cross-validation on the training set, selecting by accuracy).
# NOTE(review): the imputer/scaler are fitted on the whole training set, so
# the inner CV folds still share preprocessing statistics; wrapping the
# preprocessing and the model in a sklearn Pipeline would remove that too.
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
grid_search = GridSearchCV(logistic_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model from the grid search (refitted on the full training set)
best_logistic_model = grid_search.best_estimator_

# Predict target variable for the testing set
y_pred = best_logistic_model.predict(X_test)

# Calculate accuracy on the held-out testing set
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9545454545454546
