<a href="https://colab.research.google.com/github/Selani-Indrapala/Logistic-Regression/blob/main/Part02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Fetch seaborn's bundled penguins dataset and discard every row that has a
# missing value, so the later steps only see complete records.
df = sns.load_dataset("penguins").dropna()

In [None]:
# Restrict the data to the two species used for the binary classification task.
selected_classes = ['Adelie', 'Chinstrap']
# .copy() yields an independent frame, so adding columns later does not trigger
# pandas' SettingWithCopyWarning.
df_filtered = df.loc[df['species'].isin(selected_classes)].copy()

In [None]:
# Turn the species names into integer class labels and store them next to the
# original column.
le = LabelEncoder()
species_column = df_filtered['species']
y_encoded = le.fit(species_column).transform(species_column)
df_filtered['class_encoded'] = y_encoded

df_filtered.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,class_encoded
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male,0


In [None]:
# Features: every column except the species label and its encoded form.
# At this point the frame still holds the raw string columns `island` and `sex`.
X = df_filtered.drop(columns=['species', 'class_encoded'])
# Target: the integer-encoded species.
y = df_filtered['class_encoded']

X.head()

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Torgersen,39.3,20.6,190.0,3650.0,Male


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Deliberate failure demonstration: X_train still contains the raw string
# columns (`island`, `sex`), which the estimator cannot convert to floats.
# The ValueError is caught and displayed so that "Restart & Run All" can
# continue past this cell instead of stopping here.
logreg = LogisticRegression(solver='saga')
try:
    logreg.fit(X_train, y_train)
except ValueError as err:
    print("ValueError:", err)
else:
    # Only reached if the fit succeeds (i.e. all features are numeric).
    # Predict on the testing data
    y_pred = logreg.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    print(logreg.coef_, logreg.intercept_)

ValueError: ignored

**The `ValueError` is raised because the `island` column contains strings such as 'Dream', which cannot be converted to numbers. Only numerical data can be used to fit the classifier.**

**Therefore, before the classifier can be trained, such a column must either be removed or encoded into numerical values.**

In [None]:
# One-hot encode the categorical columns. drop_first=True drops the first level
# of each column, leaving island_Dream, island_Torgersen and sex_Male.
# dtype=int keeps the indicator columns as 0/1 integers: pandas >= 2.0 returns
# booleans by default, which would display as True/False instead of 0/1.
# NOTE: this rebinds df_filtered, so `island` and `sex` no longer exist after
# the cell runs; re-run the filtering/encoding cells before re-running this one.
df_filtered = pd.get_dummies(
    df_filtered,
    columns=['island', 'sex'],
    drop_first=True,
    dtype=int,
)
df_filtered.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,class_encoded,island_Dream,island_Torgersen,sex_Male
0,Adelie,39.1,18.7,181.0,3750.0,0,0,1,1
1,Adelie,39.5,17.4,186.0,3800.0,0,0,1,0
2,Adelie,40.3,18.0,195.0,3250.0,0,0,1,0
4,Adelie,36.7,19.3,193.0,3450.0,0,0,1,0
5,Adelie,39.3,20.6,190.0,3650.0,0,0,1,1


In [None]:
# For each indicator column, show the first row of every group (one row per
# distinct value) to illustrate how the categories were encoded.
for position, column in enumerate(['sex_Male', 'island_Torgersen', 'island_Dream']):
    if position > 0:
        print()  # blank line between consecutive tables
    samples = df_filtered.groupby(column).head(1)
    print(samples)

  species  bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  \
0  Adelie            39.1           18.7              181.0       3750.0   
1  Adelie            39.5           17.4              186.0       3800.0   

   class_encoded  island_Dream  island_Torgersen  sex_Male  
0              0             0                 1         1  
1              0             0                 1         0  

   species  bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  \
0   Adelie            39.1           18.7              181.0       3750.0   
20  Adelie            37.8           18.3              174.0       3400.0   

    class_encoded  island_Dream  island_Torgersen  sex_Male  
0               0             0                 1         1  
20              0             0                 0         0  

   species  bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  \
0   Adelie            39.1           18.7              181.0       3750.0   
30  Adelie    

In [None]:
# Rebuild the feature matrix and target from the one-hot encoded frame.
X = df_filtered.drop(columns=['species', 'class_encoded'])
y = df_filtered['class_encoded']  # Target variable

print(X.shape, y.shape)
X.head()

(214, 7) (214,)


Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Dream,island_Torgersen,sex_Male
0,39.1,18.7,181.0,3750.0,0,1,1
1,39.5,17.4,186.0,3800.0,0,1,0
2,40.3,18.0,195.0,3250.0,0,1,0
4,36.7,19.3,193.0,3450.0,0,1,0
5,39.3,20.6,190.0,3650.0,0,1,1


In [None]:
from sklearn.preprocessing import MaxAbsScaler

# Hold out 20% of the encoded data for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply the same scaling to the
# test split so no information from the test rows leaks into the preprocessing.
scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The 'saga' solver shuffles the data, so without a seed the learned
# coefficients differ from run to run; random_state makes them reproducible.
logreg = LogisticRegression(solver='saga', max_iter=150, random_state=42)

In [None]:
# Train on the scaled training split, then predict the scaled test split
# (fit returns the estimator itself, so the calls can be chained).
y_pred = logreg.fit(X_train_scaled, y_train).predict(X_test_scaled)

# Fraction of test rows classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Learned feature weights and intercept.
print(logreg.coef_, logreg.intercept_)

Accuracy: 1.0
[[ 3.63408049  0.16294682  0.62610668  0.10206241  2.59919256 -0.87722219
  -0.35902914]] [-5.99575023]


**Some features, such as `island_Dream`, `island_Torgersen` and `sex_Male`, are binary coded as either 0 or 1. Using `MaxAbsScaler` retains these values, whereas `StandardScaler` would change them according to the data distribution.**

In [None]:
from sklearn.preprocessing import MaxAbsScaler

# Scale with MaxAbsScaler (fitted on the training split only) and print the
# transformed test rows for inspection.
scaler = MaxAbsScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(X_test_scaled)

[[0.59655172 0.98139535 0.93396226 0.91666667 0.         1.
  1.        ]
 [0.8862069  0.88372093 0.94811321 0.82291667 1.         0.
  1.        ]
 [0.68275862 0.8        0.9245283  0.73958333 0.         1.
  0.        ]
 [0.87586207 0.86046512 0.94811321 0.92708333 1.         0.
  1.        ]
 [0.7137931  0.86046512 0.95283019 0.80729167 0.         1.
  1.        ]
 [0.64310345 0.95348837 0.93867925 0.78645833 0.         1.
  1.        ]
 [0.65172414 0.85116279 0.82075472 0.70833333 0.         0.
  0.        ]
 [0.5862069  0.79534884 0.87264151 0.70833333 1.         0.
  0.        ]
 [0.73965517 0.81860465 0.9245283  0.97916667 0.         1.
  1.        ]
 [0.62068966 0.79534884 0.88207547 0.77083333 1.         0.
  0.        ]
 [0.82068966 0.85116279 0.91981132 0.80208333 1.         0.
  0.        ]
 [0.80517241 0.83255814 0.91981132 0.6875     1.         0.
  0.        ]
 [0.63103448 0.85581395 0.86792453 0.72395833 1.         0.
  0.        ]
 [0.72586207 0.88837209 0.91981132 0.8

In [None]:
from sklearn.preprocessing import StandardScaler

# Same inspection with StandardScaler (fitted on the training split only).
# NOTE: this rebinds `scaler`, `X_train_scaled` and `X_test_scaled`, replacing
# the MaxAbs-scaled arrays created by the earlier cells.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(X_test_scaled)

[[-1.34082659  2.35505035  0.88068888  1.62029553 -1.14490646  1.80969611
   1.01770049]
 [ 1.80078227  0.5480021   1.30057122  0.57562238  0.8734338  -0.55257896
   1.01770049]
 [-0.40582395 -1.0008964   0.60076732 -0.35297599 -1.14490646  1.80969611
  -0.98260737]
 [ 1.68858195  0.11775252  1.30057122  1.73637033  0.8734338  -0.55257896
   1.01770049]
 [-0.069223    0.11775252  1.440532    0.40151018 -1.14490646  1.80969611
   1.01770049]
 [-0.83592516  1.83875085  1.02064966  0.16936059 -1.14490646  1.80969611
   1.01770049]
 [-0.7424249  -0.05434732 -2.47836983 -0.70120037 -1.14490646 -0.55257896
  -0.98260737]
 [-1.4530269  -1.08694632 -0.93880125 -0.70120037  0.8734338  -0.55257896
  -0.98260737]
 [ 0.21127779 -0.65669673  0.60076732  2.3167443  -1.14490646  1.80969611
   1.01770049]
 [-1.07902585 -1.08694632 -0.65887969 -0.0047516   0.8734338  -0.55257896
  -0.98260737]
 [ 1.09018027 -0.05434732  0.46080654  0.34347279  0.8734338  -0.55257896
  -0.98260737]
 [ 0.92187979 -0.3985

**`MaxAbsScaler` keeps the binary values 0 and 1 unchanged. `StandardScaler`, however, scales those columns as well, so the data is no longer 0 or 1.**