<a href="https://colab.research.google.com/github/SanjayBukka/ML-Smart-Bot/blob/main/ML_smart_bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

ML Smart Bot

In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
import seaborn as sns
import matplotlib.pyplot as plt

# Configure the page: browser-tab title and full-width layout
st.set_page_config(
    page_title="Advanced ML Analysis App",
    layout="wide",
)

def load_data(file):
    """Load an uploaded dataset into a DataFrame.

    Args:
        file: A file-like object with a ``name`` attribute (such as the object
            returned by ``st.file_uploader``). Names ending in ``.csv`` are
            parsed as comma-separated, names ending in ``.txt`` as
            tab-separated. The extension check is case-insensitive.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the extension is not
        supported or parsing fails. In both failure cases an error message is
        shown through ``st.error``.
    """
    file_name = file.name.lower()
    try:
        if file_name.endswith('.csv'):
            return pd.read_csv(file)
        if file_name.endswith('.txt'):
            return pd.read_csv(file, delimiter='\t')
    except Exception as e:
        # UI boundary: report any parsing problem to the user instead of crashing
        st.error(f"Error loading file: {str(e)}")
        return None

    # Neither branch matched: say so explicitly instead of failing on an
    # unbound local variable.
    st.error("Unsupported file type: please upload a .csv or .txt file")
    return None

def preprocess_data(df, numeric_features, categorical_features):
    """Return a copy of ``df`` with numeric columns standardized and
    categorical columns label-encoded.

    The input frame is never modified. Each categorical column gets its own
    freshly fitted ``LabelEncoder``; all numeric columns share one
    ``StandardScaler`` fit.
    """
    result = df.copy()

    # Standardize the numeric columns in one pass (skipped when none are given)
    if numeric_features:
        scaled_values = StandardScaler().fit_transform(result[numeric_features])
        result[numeric_features] = scaled_values

    # Replace every categorical column with its integer codes
    for name in categorical_features:
        result[name] = LabelEncoder().fit_transform(result[name])

    return result


# Correlation helper: label-encodes categorical columns so that .corr() can be computed
def prepare_correlation_data(df, features, target_column):
    """Return a copy of ``df`` in which every object-dtype column among
    ``features`` and ``target_column`` has been label-encoded.

    Columns of any other dtype are left as they are, and the input frame is
    not modified.
    """
    encoded = df.copy()

    for name in features + [target_column]:
        # Only object-dtype (string-like) columns need integer codes
        if encoded[name].dtype != 'object':
            continue
        encoded[name] = LabelEncoder().fit_transform(encoded[name])

    return encoded

def show_correlation_analysis(df, features, target_column):
    """Render the correlation matrix and the per-feature correlations with the target.

    This logic previously sat at module level, where it executed on every
    script run before ``df``, ``features`` and ``target_column`` existed; the
    resulting NameError was swallowed and a misleading warning was displayed.
    As a function it only runs when called with real data.

    Args:
        df: DataFrame containing the feature columns and the target column.
        features: List of feature column names.
        target_column: Name of the target column.

    Returns:
        None. Output is written to the Streamlit page.
    """
    columns = features + [target_column]

    try:
        # Encode categorical columns first so that .corr() sees numbers only
        corr = prepare_correlation_data(df, features, target_column)[columns].corr()
    except (ValueError, TypeError):
        st.warning("Could not calculate correlations for some categorical variables.")
        st.write("This usually happens with categorical variables that cannot be meaningfully correlated.")
        return

    # Calculate and display correlation matrix
    st.write("#### Feature Correlation Matrix")
    fig = px.imshow(corr,
                    title="Feature Correlation Matrix",
                    color_continuous_scale="RdBu")
    st.plotly_chart(fig)

    # Feature correlations with target. The target's self-correlation is
    # removed by label rather than by position, so the result does not depend
    # on how ties or NaN values happen to be ordered by the sort.
    correlations = corr[target_column].drop(target_column).sort_values(ascending=False)
    st.write("#### Top Feature Correlations with Target Variable:")
    st.write(correlations)

def get_model(algorithm, is_regression=True):
    """Look up a default-configured estimator by display name.

    ``is_regression`` chooses between the regressor and classifier catalogs.
    A name that is missing from the chosen catalog raises ``KeyError``.
    """
    if is_regression:
        catalog = {
            "Random Forest": RandomForestRegressor(),
            "Linear Regression": LinearRegression(),
            "Support Vector Machine": SVR(),
            "Decision Tree": DecisionTreeRegressor(),
            "K-Nearest Neighbors": KNeighborsRegressor(),
        }
    else:
        catalog = {
            "Random Forest": RandomForestClassifier(),
            "Logistic Regression": LogisticRegression(),
            "Support Vector Machine": SVC(probability=True),
            "Decision Tree": DecisionTreeClassifier(),
            "K-Nearest Neighbors": KNeighborsClassifier(),
        }

    return catalog[algorithm]

def plot_regression_results(y_test, predictions):
    """Build a scatter chart of actual vs. predicted values.

    A dashed red diagonal running from the smallest to the largest actual
    value is overlaid as the perfect-prediction reference line.
    """
    axis_labels = {'x': 'Actual Values', 'y': 'Predicted Values'}
    chart = px.scatter(
        x=y_test,
        y=predictions,
        labels=axis_labels,
        title='Actual vs Predicted Values',
    )

    # End points of the y = x reference line
    lowest, highest = min(y_test), max(y_test)
    chart.add_shape(
        type='line',
        x0=lowest, y0=lowest,
        x1=highest, y1=highest,
        line={'color': 'red', 'dash': 'dash'},
    )
    return chart

def plot_feature_importance(model, feature_names):
    """Plot feature importance for models that expose it.

    Args:
        model: A fitted estimator. ``feature_importances_`` is used when
            present; otherwise the absolute values of ``coef_`` are used.
        feature_names: Feature names, in the column order the model was
            fitted on.

    Returns:
        A Plotly bar chart sorted by descending importance, or ``None`` when
        the model exposes neither attribute.
    """
    if hasattr(model, 'feature_importances_'):
        importance = np.asarray(model.feature_importances_)
    elif hasattr(model, 'coef_'):
        # A 2-D coef_ holds one row per class/target. Average the absolute
        # values over rows so every class contributes, instead of reporting
        # the first row only. 1-D and single-row inputs give the same result
        # as taking the absolute values directly.
        coefficients = np.atleast_2d(np.asarray(model.coef_))
        importance = np.abs(coefficients).mean(axis=0)
    else:
        return None

    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importance
    }).sort_values('Importance', ascending=False)

    fig = px.bar(importance_df,
                 x='Feature',
                 y='Importance',
                 title='Feature Importance')
    return fig

# Main app title
st.title("🤖 Advanced Machine Learning Analysis Application")

# File upload section
st.header("1. Data Upload")
uploaded_file = st.file_uploader("Upload your dataset (CSV or TXT)", type=['csv', 'txt'])

if uploaded_file is not None:
    # Load data (load_data reports its own errors and returns None on failure)
    df = load_data(uploaded_file)

    if df is not None:
        # Data Preview
        st.header("2. Data Preview")
        st.write("#### First few rows of the dataset")
        st.dataframe(df.head())

        # Basic data info
        st.write("#### Dataset Information")
        col1, col2, col3 = st.columns(3)
        with col1:
            st.write(f"Rows: {df.shape[0]}")
        with col2:
            st.write(f"Columns: {df.shape[1]}")
        with col3:
            st.write(f"Missing values: {df.isnull().sum().sum()}")

        # Feature Selection
        st.header("3. Feature Selection")

        # Separate numeric and categorical columns
        numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
        categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

        # Target variable selection
        target_column = st.selectbox(
            "Select target variable",
            df.columns,
            help="This is the variable you want to predict"
        )

        # Heuristic: a numeric target with more than 10 distinct values is
        # treated as regression; everything else is treated as classification.
        is_regression = df[target_column].dtype in ['int64', 'float64'] and len(df[target_column].unique()) > 10

        # Feature selection
        st.write("#### Select Features")
        numeric_features = st.multiselect(
            "Select numeric features",
            [col for col in numeric_columns if col != target_column],
            default=[col for col in numeric_columns if col != target_column][:3]
        )

        categorical_features = st.multiselect(
            "Select categorical features",
            [col for col in categorical_columns if col != target_column],
            default=[col for col in categorical_columns if col != target_column][:2]
        )

        # Model Selection and Parameters
        st.header("4. Model Configuration")

        col1, col2 = st.columns(2)
        with col1:
            if is_regression:
                algorithm = st.selectbox(
                    "Select ML Algorithm",
                    ["Random Forest", "Linear Regression", "Support Vector Machine",
                     "Decision Tree", "K-Nearest Neighbors"]
                )
            else:
                algorithm = st.selectbox(
                    "Select ML Algorithm",
                    ["Random Forest", "Logistic Regression", "Support Vector Machine",
                     "Decision Tree", "K-Nearest Neighbors"]
                )

        with col2:
            test_size = st.slider("Test Set Size (%)", 10, 50, 20)

        # Run Analysis button
        if st.button("🚀 Run Analysis"):
            st.header("5. Analysis Results")

            # Prepare features and target
            features = numeric_features + categorical_features
            if not features:
                # An empty feature matrix cannot be fitted; stop with a clear message
                st.warning("Please select at least one feature before running the analysis.")
                st.stop()

            # Work only on rows that are complete in the selected columns,
            # since missing values are not handled by the preprocessing step.
            selected_columns = features + [target_column]
            model_df = df[selected_columns].dropna()
            dropped_rows = len(df) - len(model_df)
            if dropped_rows:
                st.warning(f"Dropped {dropped_rows} rows with missing values in the selected columns.")
            if len(model_df) < 2:
                st.error("Not enough complete rows to train and evaluate a model.")
                st.stop()

            X = model_df[features]
            y = model_df[target_column]

            # Preprocess data
            X_processed = preprocess_data(X, numeric_features, categorical_features)
            class_names = None
            if not is_regression:
                # Always encode classification targets: this keeps the original
                # class names for display and also lets numeric targets with
                # non-integer values be used as class labels.
                le = LabelEncoder()
                y = le.fit_transform(y)
                class_names = [str(c) for c in le.classes_]

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X_processed, y, test_size=test_size/100, random_state=42
            )

            # Train model
            model = get_model(algorithm, is_regression)
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)

            # Results visualization
            if is_regression:
                # Regression metrics
                mse = mean_squared_error(y_test, predictions)
                rmse = np.sqrt(mse)
                r2 = r2_score(y_test, predictions)

                st.write("#### Model Performance Metrics")
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric("R² Score", f"{r2:.3f}")
                with col2:
                    st.metric("RMSE", f"{rmse:.3f}")
                with col3:
                    st.metric("MSE", f"{mse:.3f}")

                # Actual vs Predicted Plot
                st.write("#### Actual vs Predicted Values")
                fig = plot_regression_results(y_test, predictions)
                st.plotly_chart(fig)
            else:
                # Classification metrics
                st.write("#### Confusion Matrix")
                # Pass the full label set explicitly so the matrix always has
                # one row/column per class, even when a class is absent from
                # the test split; otherwise the axis labels would not match.
                class_ids = list(range(len(class_names)))
                conf_matrix = confusion_matrix(y_test, predictions, labels=class_ids)
                fig = px.imshow(conf_matrix,
                               labels=dict(x="Predicted", y="Actual"),
                               x=class_names,
                               y=class_names,
                               title="Confusion Matrix",
                               color_continuous_scale="Viridis")
                st.plotly_chart(fig)

                st.write("#### Classification Report")
                report = classification_report(y_test, predictions,
                                               labels=class_ids,
                                               target_names=class_names,
                                               output_dict=True)
                report_df = pd.DataFrame(report).transpose()
                st.dataframe(report_df)

                # Computed directly rather than read from the report, so it
                # does not depend on which summary rows the report contains.
                accuracy = float(np.mean(np.asarray(y_test) == np.asarray(predictions)))

            # Feature Importance
            st.write("#### Feature Importance")
            importance_fig = plot_feature_importance(model, features)
            if importance_fig is not None:
                st.plotly_chart(importance_fig)

            # Correlation Matrix
            st.write("#### Feature Correlation Matrix")
            try:
                # Encode categorical columns first; .corr() cannot work on raw strings
                corr = prepare_correlation_data(model_df, features, target_column)[selected_columns].corr()
            except (ValueError, TypeError):
                st.warning("Could not calculate correlations for some categorical variables.")
            else:
                fig = px.imshow(corr,
                               title="Feature Correlation Matrix",
                               color_continuous_scale="RdBu")
                st.plotly_chart(fig)

            # Automated Insights
            st.header("6. Automated Insights")
            if is_regression:
                st.info(f"""
                📊 Model Performance Summary:
                - R² Score: {r2:.3f} (higher is better, 1 is perfect)
                - Root Mean Squared Error: {rmse:.3f}
                - Number of features used: {len(features)}
                - Training set size: {len(X_train)} samples
                - Test set size: {len(X_test)} samples
                """)
            else:
                st.info(f"""
                📊 Model Performance Summary:
                - Overall Accuracy: {accuracy:.3f}
                - Number of features used: {len(features)}
                - Training set size: {len(X_train)} samples
                - Test set size: {len(X_test)} samples
                """)

else:
    st.info("👆 Please upload a CSV or TXT file to begin the analysis")
    st.write("""
    #### This application supports:
    - Both regression and classification problems
    - Multiple ML algorithms
    - Automated data preprocessing
    - Feature importance analysis
    - Correlation analysis
    - Detailed performance metrics
    """)

Overwriting app.py


In [None]:
!pip install streamlit pandas numpy scikit-learn plotly seaborn matplotlib pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.2-py3-none-any.whl.metadata (8.4 kB)
Downloading pyngrok-7.2.2-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.2


In [None]:
# Notebook launch cell: the lines starting with "!" are IPython shell escapes,
# so this cell only runs inside a notebook, not as a plain Python script.
# Stop any ngrok / streamlit processes left over from a previous run.
!killall ngrok
!killall streamlit

# Start Streamlit as a background shell job with its output discarded
!streamlit run app.py &>/dev/null &

# Fixed 3-second pause to give Streamlit time to start before tunnelling
# NOTE(review): this is a fixed delay, not a readiness check - confirm it is long enough
import time
time.sleep(3)

# Open a fresh ngrok tunnel to the app
from pyngrok import ngrok
ngrok.kill()  # Stop any ngrok process that pyngrok is still managing
# Port 8501 is where the tunnel points; presumably Streamlit's default port - confirm
public_url = ngrok.connect(8501)
# Prints the object returned by ngrok.connect, not just the bare URL string
print(f"Public URL: {public_url}")



Public URL: NgrokTunnel: "https://a8dd-34-30-136-86.ngrok-free.app" -> "http://localhost:8501"
