## Logistic Regression

In [2]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the penguins dataset and discard every row that contains a missing value.
df = sns.load_dataset("penguins")
df = df.dropna()

# Keep only the two species used for this binary classification task.
# .copy() gives an independent frame, so adding a column below does not
# raise pandas' SettingWithCopyWarning.
selected_classes = ['Adelie', 'Chinstrap']
df_filtered = df.loc[df['species'].isin(selected_classes)].copy()

# Turn the species names into integer class labels
# (the printed table shows Adelie -> 0 and Chinstrap -> 1).
le = LabelEncoder()
y_encoded = le.fit_transform(df_filtered['species'])
df_filtered['class_encoded'] = y_encoded

# Show the species next to its encoded label as a sanity check.
print(df_filtered[['species', 'class_encoded']])

# Target: the encoded species. Features: everything except the label columns
# and the categorical 'island' / 'sex' columns.
y = df_filtered['class_encoded']
X = df_filtered.drop(columns=['species', 'island', 'sex', 'class_encoded'])

       species  class_encoded
0       Adelie              0
1       Adelie              0
2       Adelie              0
4       Adelie              0
5       Adelie              0
..         ...            ...
215  Chinstrap              1
216  Chinstrap              1
217  Chinstrap              1
218  Chinstrap              1
219  Chinstrap              1

[214 rows x 2 columns]


In [7]:
# Split the data into training and testing sets:
#   - test_size=0.2: 20% of the rows are held out for testing, 80% are used for training.
#   - random_state=42: makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the logistic regression model using the 'saga' solver to learn the weights.
# saga shuffles the data while optimising, so without a fixed seed the fitted
# coefficients change from run to run (the same code run twice in this notebook
# reports slightly different values). random_state pins the result.
# NOTE: the features are deliberately left unscaled in this cell; the scaled
# variant is trained in a later cell for comparison.
logreg = LogisticRegression(solver='saga', random_state=42)
logreg.fit(X_train, y_train)

# Predict on the testing data
y_pred = logreg.predict(X_test)

# Evaluate the model and show the learned weights and bias
accuracy = accuracy_score(y_test, y_pred)
print(" Accuracy :", accuracy)
print(logreg.coef_, logreg.intercept_)

 Accuracy : 0.5813953488372093
[[ 2.75422091e-03 -8.42325734e-05  4.37962916e-04 -2.85287112e-04]] [-8.62223605e-06]




In [8]:
# Recreate the same 80/20 split (same seed, hence the same rows as before).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit logistic regression with the 'liblinear' solver on the raw features.
# fit() returns the estimator itself, so the call can be chained.
logreg = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Score the held-out rows.
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report the accuracy together with the learned weights and bias.
print(" Accuracy :", accuracy)
print(logreg.coef_, logreg.intercept_)

 Accuracy : 1.0
[[ 1.59665154 -1.42501103 -0.15238046 -0.003951  ]] [-0.0755452]


### Apply feature scaling

In [9]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler's statistics are learned from the
# training rows only, so no information from the test set leaks into them.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the same fitted transformation to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Train With Feature Scaling

In [11]:
# --- 'liblinear' solver on the standardized features ---
liblinear_model_scaled = LogisticRegression(solver='liblinear').fit(X_train_scaled, y_train)
y_pred_liblinear_scaled = liblinear_model_scaled.predict(X_test_scaled)
accuracy_liblinear_scaled = accuracy_score(y_true=y_test, y_pred=y_pred_liblinear_scaled)
print("Accuracy with scaling (liblinear):", accuracy_liblinear_scaled)

# --- 'saga' solver on the standardized features ---
# max_iter is raised well above the default to give the solver room to converge.
saga_model_scaled = LogisticRegression(solver='saga', max_iter=5000).fit(X_train_scaled, y_train)
y_pred_saga_scaled = saga_model_scaled.predict(X_test_scaled)
accuracy_saga_scaled = accuracy_score(y_true=y_test, y_pred=y_pred_saga_scaled)
print("Accuracy with scaling (saga):", accuracy_saga_scaled)

Accuracy with scaling (liblinear): 0.9767441860465116
Accuracy with scaling (saga): 0.9767441860465116


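### Run the code given in Listing 3. What is the problem with this code, and how can it be solved?

**Observation.** With the `saga` solver on the raw (unscaled) features, the test accuracy is only about 0.58 and the learned coefficients stay close to zero, whereas `liblinear` on the same split reaches 1.0 and `saga` on standardized features reaches about 0.98 (see the cells above). This suggests that `saga` does not converge on features with very different scales within its default iteration budget. Standardizing the features (e.g. with `StandardScaler`) and, if necessary, increasing `max_iter` resolves the problem.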

In [13]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the penguins dataset and drop all rows with missing values.
df = sns.load_dataset("penguins")
df = df.dropna()

# Keep only the 'Adelie' and 'Chinstrap' rows; .copy() avoids the
# SettingWithCopyWarning when the encoded column is added below.
selected_classes = ['Adelie', 'Chinstrap']
df_filtered = df.loc[df['species'].isin(selected_classes)].copy()

# Encode the species names as integer class labels.
le = LabelEncoder()
y_encoded = le.fit_transform(df_filtered['species'])
df_filtered['class_encoded'] = y_encoded

# Features: every column except the labels and the categorical 'sex' / 'island'.
# Target: the encoded species.
X = df_filtered.drop(columns=['sex', 'island', 'species', 'class_encoded'])
y = df_filtered['class_encoded']

# Hold out 20% of the rows for testing (seeded, so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit logistic regression with the 'saga' solver, exactly as in the listing:
# raw (unscaled) features and default solver settings.
logreg = LogisticRegression(solver='saga').fit(X_train, y_train)

# Predict on the held-out rows and evaluate.
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report the accuracy together with the learned weights and bias.
print("Accuracy:", accuracy)
print(logreg.coef_, logreg.intercept_)

Accuracy: 0.5813953488372093
[[ 2.75299765e-03 -8.25858555e-05  4.56505593e-04 -2.86199493e-04]] [-8.50529609e-06]


