In [1]:
pip install pycaret

Note: you may need to restart the kernel to use updated packages.


In [2]:
import streamlit as st
import pandas as pd
from pycaret.classification import setup as setup_clf, compare_models as compare_models_clf, pull as pull_clf
from pycaret.regression import setup as setup_reg, compare_models as compare_models_reg, pull as pull_reg
import io

def _impute_missing(column, method):
    """Return ``column`` with its missing values filled according to ``method``.

    ``method`` is one of 'Mean', 'Median', 'Mode' or 'Additional Class'.
    When no fill value can be computed (e.g. the mode of a column that is
    entirely NaN) the column is returned unchanged instead of raising.
    """
    if method == 'Additional Class':
        return column.fillna('Missing')
    if method == 'Mean':
        return column.fillna(column.mean())
    if method == 'Median':
        return column.fillna(column.median())
    # 'Mode': Series.mode() is empty when the column has no non-null values,
    # so indexing its first element blindly would raise IndexError.
    modes = column.mode()
    if modes.empty:
        return column
    return column.fillna(modes.iloc[0])


def _detect_task_type(y, max_classes=10):
    """Return 'Classification' for a low-cardinality discrete target, else 'Regression'.

    The dtype check goes through pandas' dtype helpers rather than comparing
    against the string 'int': that string names a single NumPy integer type
    whose width can differ between platforms, and it never matches the
    bool/uint8 columns that ``pd.get_dummies`` produces.
    """
    is_discrete = pd.api.types.is_integer_dtype(y) or pd.api.types.is_bool_dtype(y)
    if is_discrete and y.nunique() <= max_classes:
        return 'Classification'
    return 'Regression'


def _run_setup(setup_fn, frame, target):
    """Call a PyCaret ``setup`` function without the interactive prompt.

    ``silent=True`` is only passed when ``setup_fn`` declares that parameter;
    releases that dropped it would otherwise raise TypeError.
    """
    import inspect

    kwargs = {'target': target, 'html': False}
    if 'silent' in inspect.signature(setup_fn).parameters:
        kwargs['silent'] = True
    return setup_fn(frame, **kwargs)


def main():
    """Render the Streamlit app: upload a CSV, clean it, and compare models.

    Steps, in order: optional column dropping, optional ``describe()`` summary,
    per-column missing-value imputation, categorical encoding, feature/target
    selection, task-type detection, and a PyCaret ``compare_models`` run on
    the selected columns when the user presses the button.
    """
    st.title('Streamlit EDA & Model Training App')

    st.write("""
    ### Instructions:
    1. Upload a CSV file.
    2. Optionally drop columns.
    3. Choose whether to perform EDA and specify columns.
    4. Specify how to handle missing values.
    5. Choose encoding method for categorical data.
    6. Select features (X) and target (Y).
    7. The app will automatically detect the task type and train models using PyCaret.
    """)

    # File uploader
    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if uploaded_file is None:
        return

    data = pd.read_csv(uploaded_file)
    st.write("Data preview:")
    st.dataframe(data.head())

    # Drop columns
    columns_to_drop = st.multiselect("Select columns to drop", data.columns)
    if columns_to_drop:
        data = data.drop(columns=columns_to_drop)
        st.write("Updated Data preview:")
        st.dataframe(data.head())

    # Perform EDA
    if st.checkbox("Perform EDA?"):
        eda_columns = st.multiselect("Select columns for EDA", data.columns)
        if eda_columns:
            st.write("EDA Summary:")
            st.write(data[eda_columns].describe())

    # Handle missing values: one selectbox per column that contains NaNs.
    st.write("Handling Missing Values:")
    for col in data.columns:
        if data[col].isnull().sum() == 0:
            continue
        if data[col].dtype == 'object':
            methods = ['Mode', 'Additional Class']
        else:
            methods = ['Mean', 'Median', 'Mode']
        impute_method = st.selectbox(
            f"Select method to impute missing values in '{col}'",
            methods,
            key=f"impute_{col}",  # prefixed so it cannot clash with other widget keys
        )
        data[col] = _impute_missing(data[col], impute_method)
        if data[col].isnull().any():
            # Happens when the column has no usable values to derive a fill from.
            st.warning(f"Column '{col}' still contains missing values after imputation.")

    # Encode categorical features.
    # NOTE(review): encoding runs before the target is chosen, so with one-hot
    # encoding a categorical target column is itself expanded into dummy columns.
    encoding_method = st.selectbox("Select method to encode categorical features", ['One Hot Encoding', 'Label Encoding'])
    if encoding_method == 'One Hot Encoding':
        data = pd.get_dummies(data, drop_first=True)
    else:
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()
        for col in data.columns:
            if data[col].dtype == 'object':
                data[col] = le.fit_transform(data[col])

    st.write("Updated Data preview after encoding and imputing missing values:")
    st.dataframe(data.head())

    # Select features and target
    target = st.selectbox("Select Target Variable", data.columns)
    features = st.multiselect("Select Feature Variables", [col for col in data.columns if col != target])

    if not (features and target):
        return

    # Detect task type
    task_type = _detect_task_type(data[target])
    st.write(f"Detected task type: {task_type}")

    # Train models using PyCaret
    if st.button("Run PyCaret"):
        # Train only on what the user picked: the chosen features plus the target.
        model_data = data[list(features) + [target]]
        if task_type == 'Classification':
            _run_setup(setup_clf, model_data, target)
            compare_models_clf()
            results = pull_clf()
        else:
            _run_setup(setup_reg, model_data, target)
            compare_models_reg()
            results = pull_reg()

        st.write("Model Comparison Results:")
        st.dataframe(results)

# Script entry point: build the Streamlit UI only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


2024-07-26 15:26:48.052 
  command:

    streamlit run C:\Users\User\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
