**Mount Drive**

# **Mount Drive**

In [2]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime; the notebook's working
# directory (set in the next cell) lives under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
cd /content/drive/MyDrive/Workshop698/LabTest

/content/drive/MyDrive/Workshop698/LabTest


# **Create Model**

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Load data
data = pd.read_csv('penguins_size.csv')

# Data Cleansing
# Check for missing values
print(data.isnull().sum())

# The target is never imputed: a row without a species label carries no
# supervision signal, so it is dropped rather than given a made-up label.
data = data.dropna(subset=['species']).reset_index(drop=True)

# NOTE(review): placeholder strings in `sex` (anything that is not NaN) are not
# treated as missing and end up encoded as their own category -- inspect
# data['sex'].unique() to confirm the column only holds real categories.

# Decide train/test membership BEFORE computing any imputation statistics so
# that test rows cannot influence the fill values (avoids data leakage).
# Splitting row positions with the same test_size/random_state selects the same
# rows, in the same order, as splitting X and y directly.
row_positions = np.arange(len(data))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)

# Handle missing data
# Impute missing categorical features (island, sex) with the most frequent
# value observed in the training rows
categorical_columns = ['island', 'sex']
imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_cat.fit(data.iloc[train_rows][categorical_columns])
data[categorical_columns] = imputer_cat.transform(data[categorical_columns])

# Impute missing numeric features with the mean of the training rows
numeric_columns = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']
imputer_num = SimpleImputer(strategy='mean')
imputer_num.fit(data.iloc[train_rows][numeric_columns])
data[numeric_columns] = imputer_num.transform(data[numeric_columns])

# Check if there are any remaining missing values
print(data.isnull().sum())

# Convert categorical variables to numeric (species, sex, island).
# The encoders only learn the label vocabulary (codes follow sorted label
# order), so fitting them on every row guarantees no unseen label at test time.
le_species = LabelEncoder()
data['species'] = le_species.fit_transform(data['species'])

le_sex = LabelEncoder()
data['sex'] = le_sex.fit_transform(data['sex'])

le_island = LabelEncoder()
data['island'] = le_island.fit_transform(data['island'])

# Feature selection (X) and target (y). The column order of X is the order the
# models are fitted with; any consumer of the saved model must reproduce it.
X = data[['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex', 'island']]
y = data['species']  # Target variable

# Materialise the training and testing sets from the membership chosen above
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"

def _run_grid_search(estimator, param_grid, X, y):
    """Exhaustively search `param_grid` for `estimator` with 5-fold CV.

    Parameters:
        estimator: unfitted scikit-learn compatible classifier.
        param_grid: dict mapping parameter names to the candidate values.
        X, y: training features and labels used for the cross-validation.

    Returns:
        The fitted GridSearchCV object (best model in `.best_estimator_`).
    """
    search = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5, n_jobs=-1)
    search.fit(X, y)
    return search


# 1. Logistic Regression
# random_state is fixed like for the other two models: the 'saga' and
# 'liblinear' solvers shuffle the data, so an unseeded search is not
# reproducible across runs.
# NOTE(review): recent scikit-learn releases warn that 'liblinear' is deprecated
# for multiclass targets -- confirm against the installed version before upgrading.
model_lr = LogisticRegression(max_iter=1000, random_state=42)
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga']
}
grid_search_lr = _run_grid_search(model_lr, param_grid_lr, X_train, y_train)
best_lr = grid_search_lr.best_estimator_

# 2. Random Forest Classifier
model_rf = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search_rf = _run_grid_search(model_rf, param_grid_rf, X_train, y_train)
best_rf = grid_search_rf.best_estimator_

# 3. XGBoost Classifier
model_xgb = XGBClassifier(random_state=42)
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 6, 10]
}
grid_search_xgb = _run_grid_search(model_xgb, param_grid_xgb, X_train, y_train)
best_xgb = grid_search_xgb.best_estimator_

# Evaluate models
def evaluate_model(model, X_test, y_test):
    """Print the classification report of `model` on a held-out set.

    Parameters:
        model: fitted estimator exposing `predict`.
        X_test: features of the evaluation rows.
        y_test: true labels of the evaluation rows.

    Returns:
        None; the report text is written to stdout.
    """
    predictions = model.predict(X_test)
    report = classification_report(y_test, predictions)
    print(report)

# Evaluate each tuned model on the same held-out split, printing a heading
# followed by its classification report.
tuned_models = [
    ("Logistic Regression Evaluation:", best_lr),
    ("Random Forest Classifier Evaluation:", best_rf),
    ("XGBoost Classifier Evaluation:", best_xgb),
]
for heading, fitted_model in tuned_models:
    print(heading)
    evaluate_model(fitted_model, X_test, y_test)


species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64
species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64
Logistic Regression Evaluation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        16
           2       1.00      1.00      1.00        21

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69

Random Forest Classifier Evaluation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        16
           2       1.00      1.0

**จากผลที่ได้มี 2 Model ที่ได้ผลดีเหมือนกัน ดังนั้นจึงเลือกโมเดล Logistic Regression เพื่อนำไป deploy ต่อ**

# **Save Model & Label Encoders**

In [7]:
import pickle
from sklearn.preprocessing import LabelEncoder

# Persist the tuned Logistic Regression model (best_lr) and the LabelEncoders
# fitted on 'sex' and 'island', one pickle file per object.
artifacts = [
    ('best_logistic_regression_model.pkl', best_lr),
    ('le_sex.pkl', le_sex),
    ('le_island.pkl', le_island),
]
for filename, artifact in artifacts:
    with open(filename, 'wb') as out_file:
        pickle.dump(artifact, out_file)

# **Write the Streamlit app file for deployment**

In [8]:
%%writefile app_penguins_rf_model.py

# Import necessary libraries
import streamlit as st
import pickle
import numpy as np
import os

# Set page configuration (browser tab title, tab icon, centered content column)
st.set_page_config(page_title="Penguin Species Predictor", page_icon="🐧", layout="centered")

# Set background image URL (interpolated into the CSS below)
background_image_url = "https://images.pexels.com/photos/300857/pexels-photo-300857.jpeg"

# Set desired colors; both are reused by the CSS here and by the result markup
# rendered further down in the script
text_color = "#FFA500"  # Text color
result_bg_color = "#FFFAFA"  # Result background color

# Apply CSS for background and text colors.
# The stylesheet is an f-string, so CSS braces are doubled ({{ }}) to produce
# literal braces while single braces interpolate the Python values above.
# It defines:
#   .stApp             - full-page background image
#   h1, h2, h3, p, div - forced text color (note: `div` makes this very broad)
#   .result-container  - card styling for the prediction/result boxes
#   .image-container   - flex centering helper
# NOTE(review): no element in the visible script uses .image-container -- confirm
# whether it is still needed.
# unsafe_allow_html=True is required so Streamlit renders the raw <style> tag
# instead of escaping it.
st.markdown(
    f"""
    <style>
    .stApp {{
        background-image: url('{background_image_url}');
        background-size: cover;
        background-position: center;
        height: 100vh;
    }}
    h1, h2, h3, p, div {{
        color: {text_color} !important;
    }}
    .result-container {{
        background-color: {result_bg_color};
        border-radius: 10px;
        padding: 20px;
        box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2);
        opacity: 0.9;
        border: 2px solid {text_color};
    }}
    .image-container {{
        display: flex;
        justify-content: center;
        align-items: center;
        margin: 20px 0; /* Add margin for spacing */
    }}
    </style>
    """,
    unsafe_allow_html=True
)

# pandas builds the single-row, named-column frame handed to the model; it is
# imported next to its only use so this section stays self-contained.
import pandas as pd

# Artifacts written by the training notebook: the tuned Logistic Regression
# model and the LabelEncoders fitted on the 'sex' and 'island' columns.
MODEL_PATH = 'best_logistic_regression_model.pkl'
SEX_ENCODER_PATH = 'le_sex.pkl'
ISLAND_ENCODER_PATH = 'le_island.pkl'

# Column names and order the model was fitted with. The prediction input must
# reproduce this layout exactly, otherwise values land in the wrong feature.
FEATURE_COLUMNS = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm',
                   'body_mass_g', 'sex', 'island']


def load_artifact(path):
    """Unpickle one training artifact, halting the app with an error if it is missing.

    Parameters:
        path: location of the pickle file, relative to the app's working directory.

    Returns:
        The unpickled object.
    """
    if not os.path.exists(path):
        st.error(f"Error: {path} not found. Please make sure the model file is available.")
        st.stop()
    # NOTE: pickle.load can execute code embedded in the file -- only deploy
    # artifacts produced by the trusted training notebook.
    with open(path, 'rb') as file:
        return pickle.load(file)


def encode_category(encoder, choice):
    """Translate a UI choice into the integer code used during training.

    The match against the encoder's known labels is case-insensitive so the
    friendly UI labels do not have to replicate the dataset's exact casing.

    Parameters:
        encoder: fitted LabelEncoder loaded from disk.
        choice: label selected by the user.

    Returns:
        The integer code, or None when the encoder has no matching label.
    """
    known_labels = {str(label).lower(): label for label in encoder.classes_}
    label = known_labels.get(str(choice).lower())
    if label is None:
        return None
    return int(encoder.transform([label])[0])


# Load the trained model and the encoders it depends on (fails fast if any is missing)
model = load_artifact(MODEL_PATH)
le_sex = load_artifact(SEX_ENCODER_PATH)
le_island = load_artifact(ISLAND_ENCODER_PATH)

# Streamlit App
st.title("🐧 Penguin Species Prediction ")
st.write("This app predicts the **species** of a penguin based on its physical characteristics. "
         "Please enter the details on the left and click 'Predict Species' to see the result.")

# Sidebar inputs for user to input penguin features
st.sidebar.header("Penguin Features")
st.sidebar.write("Provide the following features to predict the penguin's species:")

# Input options in sidebar with descriptions
island = st.sidebar.selectbox("Island", ['Biscoe', 'Dream', 'Torgersen'])
bill_length_mm = st.sidebar.slider("Bill Length (mm)", 32.1, 59.6, 45.0)
bill_depth_mm = st.sidebar.slider("Bill Depth (mm)", 13.1, 21.5, 17.2)
flipper_length_mm = st.sidebar.slider("Flipper Length (mm)", 172.0, 231.0, 200.0)
body_mass_g = st.sidebar.slider("Body Mass (g)", 2700, 6300, 4200)
sex = st.sidebar.selectbox("Sex", ['Male', 'Female'])

# Move the Predict button to the sidebar
if st.sidebar.button("Predict Species"):
    # Encode the categorical inputs with the encoders saved at training time
    # instead of hand-written maps, so the codes always match the model.
    sex_code = encode_category(le_sex, sex)
    island_code = encode_category(le_island, island)
    if sex_code is None or island_code is None:
        st.error("The selected island or sex is not a category the model was trained on.")
        st.stop()

    # Convert inputs to model-compatible format: one row, training column order.
    # The "bill" measurements in the UI correspond to the "culmen" columns.
    input_data = pd.DataFrame(
        [[bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g, sex_code, island_code]],
        columns=FEATURE_COLUMNS,
    )

    # Predict species
    prediction = model.predict(input_data)
    # Mapping encoded values to species names. LabelEncoder assigns codes in
    # sorted label order; this assumes the training labels were exactly these
    # three names -- TODO confirm against le_species.classes_ in the notebook.
    species_dict = {0: "Adelie", 1: "Chinstrap", 2: "Gentoo"}
    species = species_dict[int(prediction[0])]

    # Display results with style
    st.subheader("Prediction Result")

    # Using markdown for better styling
    st.markdown(
        f"<div class='result-container'>"
        f"<h3 style='color: {text_color};'>The predicted species of the penguin is: <strong>{species}</strong></h3>"
        "</div>",
        unsafe_allow_html=True
    )

    # Show species image (images are expected in the 'images' directory).
    # Only `width` is passed: `use_column_width` is deprecated in st.image and,
    # when set to 'auto', overrode the explicit width anyway.
    species_image_path = f"images/{species.lower()}.jpg"
    if os.path.exists(species_image_path):
        st.image(species_image_path, width=300, caption=f"{species} Penguin")
    else:
        st.warning(f"Image for {species} not found. Please ensure the image is in the 'images' folder.")

    # Display the submitted feature values inside a result container
    st.markdown(
        f"<div class='result-container'>"
        f"<h4 style='color: {text_color};'>Penguin Species Characteristics</h4>"
        f"<p style='color: {text_color};'>- Island: {island}</p>"
        f"<p style='color: {text_color};'>- Bill Length: {bill_length_mm} mm</p>"
        f"<p style='color: {text_color};'>- Bill Depth: {bill_depth_mm} mm</p>"
        f"<p style='color: {text_color};'>- Flipper Length: {flipper_length_mm} mm</p>"
        f"<p style='color: {text_color};'>- Body Mass: {body_mass_g} g</p>"
        f"<p style='color: {text_color};'>- Sex: {sex}</p>"
        "</div>",
        unsafe_allow_html=True
    )
else:
    st.info("Please enter the penguin features on the left and click 'Predict Species'.")

Writing app_penguins_rf_model.py


In [9]:
%%writefile requirements.txt

# Runtime dependencies installed for the Streamlit app.
# NOTE(review): all versions are unpinned. A pickled scikit-learn estimator is
# only reliably loadable under the scikit-learn version that created it --
# pin scikit-learn (and ideally numpy) to the versions of the training runtime.
# NOTE(review): cloudpickle is not imported by the app written in this
# notebook -- confirm it is actually required.
scikit-learn
pandas
cloudpickle
streamlit
numpy
Pillow

Writing requirements.txt
