In [3]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the penguins dataset and discard every row that contains a missing value.
# The result is reassigned instead of mutating the frame with inplace=True.
df = sns.load_dataset("penguins").dropna()

# Filter rows for the two species used in this binary classification task
selected_classes = ['Adelie', 'Chinstrap']
df_filtered = df[df['species'].isin(selected_classes)].copy()

# Initialize the LabelEncoder
le = LabelEncoder()

# Encode the species column as integers ('Adelie' -> 0, 'Chinstrap' -> 1)
y_encoded = le.fit_transform(df_filtered['species'])
df_filtered['class_encoded'] = y_encoded

# Display the filtered and encoded DataFrame
print(df_filtered[['species', 'class_encoded']])

# Split the data into features (X) and target variable (y)
y = df_filtered['class_encoded']   # Target variable

# Drop BOTH representations of the label from the features. 'species' is the
# same information as 'class_encoded' in string form, so leaving it in X would
# leak the target into the feature matrix as soon as categorical columns are
# encoded. The remaining string columns are still present in X at this point.
X = df_filtered.drop(columns=['species', 'class_encoded'])


       species  class_encoded
0       Adelie              0
1       Adelie              0
2       Adelie              0
4       Adelie              0
5       Adelie              0
..         ...            ...
215  Chinstrap              1
216  Chinstrap              1
217  Chinstrap              1
218  Chinstrap              1
219  Chinstrap              1

[214 rows x 2 columns]


In [52]:
# ---- Listing 2 (fixed with a single change) ----
# Keep ONLY numeric columns in X to avoid "could not convert string to float"
X = X.select_dtypes(include=['number'])

# Split the data into training and testing sets (80% train / 20% test, fixed split seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the logistic regression model with the saga solver.
# saga shuffles the training data using a random number generator, so the
# solver is seeded here (random_state=0, the value the later saga runs in this
# notebook also use) to make the fitted weights repeatable from run to run.
logreg = LogisticRegression(solver='saga', max_iter=5000, random_state=0)
logreg.fit(X_train, y_train)

# Predict on the testing data
y_pred = logreg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Learned weights (one per numeric feature) and the intercept
print(logreg.coef_, logreg.intercept_)


Accuracy: 0.7209302325581395
[[ 0.11055886 -0.00670331 -0.01178907 -0.00084805]] [-0.00057637]


In [55]:
# ---- Listing 2 (fixed with a single change), liblinear solver ----
# Keep ONLY numeric columns in X to avoid "could not convert string to float"
X = X.select_dtypes(include=['number'])

# Split the data into training and testing sets (80% train / 20% test, fixed split seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the logistic regression model
# Here we are using the liblinear solver to learn weights.
# random_state=0 matches the seed passed to the liblinear runs later in this
# notebook, so every liblinear fit is configured the same way.
logreg = LogisticRegression(solver='liblinear', max_iter=5000, random_state=0)
logreg.fit(X_train, y_train)

# Predict on the testing data
y_pred = logreg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Learned weights (one per numeric feature) and the intercept
print(logreg.coef_, logreg.intercept_)


Accuracy: 1.0
[[ 1.5997544  -1.43222015 -0.15123497 -0.00401141]] [-0.07601213]


In [56]:
# ---- Listing 2 (Task 6 edition) ----
# Restrict X to its numeric columns so the solver never receives strings
# ("could not convert string to float").
X = X.select_dtypes(include=['number'])

import pandas as pd  # in case it's not already imported

# (A) Only the TRAIN/TEST SPLIT seed changes; the solver seed stays pinned at 0.
split_seeds = [0, 7, 21, 42, 77, 99, 123]
rows = []

for split_seed in split_seeds:
    # Fresh stratified 80/20 split for this seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=split_seed, stratify=y
    )

    # saga with a constant solver seed, so the split is the only moving part
    logreg = LogisticRegression(
        solver='saga', max_iter=5000, random_state=0
    ).fit(X_train, y_train)

    y_pred = logreg.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    rows.append(dict(kind="split_seed", seed=split_seed, accuracy=acc))

print("=== Variation from TRAIN/TEST SPLIT (solver RNG fixed) ===")
df_split = pd.DataFrame(rows)
print(df_split)
print("Mean ± Std:", df_split["accuracy"].mean(), "±", df_split["accuracy"].std())

# (B) Only the SOLVER seed changes; a single split is reused for every fit.
solver_seeds = [0, 7, 21, 42, 77, 99, 123]
rows = []

# The one stratified split (seed 42) shared by all solver seeds
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

for solver_seed in solver_seeds:
    logreg = LogisticRegression(
        solver='saga', max_iter=5000, random_state=solver_seed
    ).fit(X_train, y_train)

    y_pred = logreg.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    rows.append(dict(kind="solver_seed", seed=solver_seed, accuracy=acc))

print("\n=== Variation from SOLVER RNG (split fixed) ===")
df_solver = pd.DataFrame(rows)
print(df_solver)
print("Mean ± Std:", df_solver["accuracy"].mean(), "±", df_solver["accuracy"].std())


=== Variation from TRAIN/TEST SPLIT (solver RNG fixed) ===
         kind  seed  accuracy
0  split_seed     0  0.790698
1  split_seed     7  0.790698
2  split_seed    21  0.813953
3  split_seed    42  0.813953
4  split_seed    77  0.767442
5  split_seed    99  0.790698
6  split_seed   123  0.767442
Mean ± Std: 0.7906976744186045 ± 0.018988292579714548

=== Variation from SOLVER RNG (split fixed) ===
          kind  seed  accuracy
0  solver_seed     0  0.813953
1  solver_seed     7  0.813953
2  solver_seed    21  0.813953
3  solver_seed    42  0.813953
4  solver_seed    77  0.813953
5  solver_seed    99  0.813953
6  solver_seed   123  0.813953
Mean ± Std: 0.8139534883720929 ± 1.199177923332954e-16


In [57]:
# ---- Task 7: Compare 'saga' vs 'liblinear' with and without StandardScaler ----
# Assumes X, y already exist and X is numeric-only (as in your fixed Listing 2)

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd

# One stratified 80/20 split with seed 42, so every solver/scaling combination
# is trained and scored on exactly the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

def run_solver(solver: str, scaled: bool):
    """Fit an L2-penalised logistic regression and return its test accuracy.

    The model is trained on the module-level ``X_train`` / ``y_train`` and
    scored on the module-level ``X_test`` / ``y_test``.

    Parameters
    ----------
    solver : str
        Value forwarded to ``LogisticRegression(solver=...)``.
    scaled : bool
        When true, the classifier is placed behind a ``StandardScaler`` inside
        a ``Pipeline``; otherwise it is fitted on the features as they are.

    Returns
    -------
    The value of ``accuracy_score`` for the predictions on ``X_test``.
    """
    # The classifier configuration is the same in both branches.
    estimator = LogisticRegression(solver=solver, penalty="l2", max_iter=5000, random_state=0)

    if scaled:
        # Prepend the scaling step; the pipeline fits it on the training data.
        estimator = Pipeline([
            ("scaler", StandardScaler()),
            ("lr", estimator),
        ])

    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    return accuracy_score(y_test, predictions)

# Evaluate each solver both without and with feature scaling, one record per run.
rows = []
for solver in ("saga", "liblinear"):
    for scaled in (False, True):
        acc = run_solver(solver, scaled)
        scaled_label = "Yes" if scaled else "No"
        rows.append({"Solver": solver, "Scaled": scaled_label, "Accuracy": acc})

# Sort by solver, then by scaling flag, and print the table without the index column.
results = pd.DataFrame(rows).sort_values(["Solver", "Scaled"])
print(results.to_string(index=False))


   Solver Scaled  Accuracy
liblinear     No  0.976744
liblinear    Yes  1.000000
     saga     No  0.813953
     saga    Yes  1.000000


In [59]:
# Q8 demo: Label encoding + scaling (incorrect for nominal) vs One-Hot (recommended)

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# ----- Synthetic data: 'color' is a NOMINAL feature whose link to y is NON-monotonic -----
rng = np.random.RandomState(0)
n = 600
colors = np.array(['red', 'blue', 'green'])
X_color = rng.choice(colors, size=n, p=[0.34, 0.33, 0.33])

# Target: 'red' and 'blue' belong to class 1, 'green' to class 0
y = np.isin(X_color, ['red', 'blue']).astype(int)

# Standard-normal numeric column, drawn independently of the color column
x_num = rng.normal(loc=0.0, scale=1.0, size=n)

df = pd.DataFrame({'color': X_color, 'x_num': x_num, 'y': y})

# Pipeline A (INCORRECT for nominal data): integer-code the color, then
# standardise both columns together.
le = LabelEncoder()
df['color_le'] = le.fit_transform(df['color'])
X_le = df[['color_le', 'x_num']]
y = df['y'].values

pipe_label = Pipeline(steps=[
    # the scaler also rescales the integer color codes (bad for nominal data)
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(solver='liblinear', max_iter=2000, random_state=0)),
])

# Pipeline B (CORRECT approach): one-hot encode the color and standardise
# only the numeric column.
X_ohe = df[['color', 'x_num']]
pre = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), ['color']),
        ('num', StandardScaler(), ['x_num']),
    ],
    remainder='drop',
)

pipe_ohe = Pipeline(steps=[
    ('pre', pre),
    ('lr', LogisticRegression(solver='liblinear', max_iter=2000, random_state=0)),
])

# Score both pipelines with the same shuffled 5-fold stratified splitter
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
acc_label = cross_val_score(pipe_label, X_le, y, cv=cv, scoring='accuracy')
acc_ohe = cross_val_score(pipe_ohe, X_ohe, y, cv=cv, scoring='accuracy')

print("Label-encode + scale  (mean ± std):", f"{acc_label.mean():.4f} ± {acc_label.std():.4f}")
print("One-Hot (num scaled)  (mean ± std):", f"{acc_ohe.mean():.4f} ± {acc_ohe.std():.4f}")

# Show which integer each color received from the LabelEncoder
print("LabelEncoder mapping:", dict(zip(le.classes_, le.transform(le.classes_))))


Label-encode + scale  (mean ± std): 0.6733 ± 0.0033
One-Hot (num scaled)  (mean ± std): 1.0000 ± 0.0000
LabelEncoder mapping: {'blue': 0, 'green': 1, 'red': 2}
