<a href="https://colab.research.google.com/github/Upanshi-2285/Color_Detector/blob/main/EMPLOYEE_SALARY_PREDICTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install streamlit
!pip install pyngrok

Collecting streamlit
  Downloading streamlit-1.47.0-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m405.7 kB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.47.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hIns

In [None]:
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np # Used for potential fallback data if CSV not found

# --- 1. Data Loading and Preparation ---
def _generate_synthetic_data(num_samples=500):
    """Build a reproducible synthetic salary table used when the CSV is unavailable.

    Args:
        num_samples: Number of rows to generate.

    Returns:
        DataFrame with the columns ``experience_level``, ``job_title``,
        ``company_location`` and ``salary_in_usd``.
    """
    # A private RandomState seeded with 42 draws the same values as the legacy
    # np.random.seed(42) global stream, but leaves NumPy's global RNG untouched.
    rng = np.random.RandomState(42)
    experience_levels = rng.choice(['EN', 'MI', 'SE', 'EX'], num_samples, p=[0.2, 0.3, 0.4, 0.1])
    job_titles = rng.choice([
        'Data Scientist', 'Machine Learning Engineer', 'Data Analyst',
        'AI Engineer', 'Data Engineer', 'Research Scientist'
    ], num_samples, p=[0.25, 0.20, 0.20, 0.15, 0.10, 0.10])
    company_locations = rng.choice(['US', 'GB', 'CA', 'IN', 'DE', 'FR'], num_samples, p=[0.4, 0.2, 0.1, 0.1, 0.1, 0.1])
    # Base salary plus additive bonuses per category, plus Gaussian noise.
    salary_in_usd = 60000 + \
                    np.where(experience_levels == 'MI', 30000, 0) + \
                    np.where(experience_levels == 'SE', 70000, 0) + \
                    np.where(experience_levels == 'EX', 120000, 0) + \
                    np.where(job_titles == 'Machine Learning Engineer', 20000, 0) + \
                    np.where(job_titles == 'Data Scientist', 15000, 0) + \
                    np.where(job_titles == 'Data Engineer', 10000, 0) + \
                    np.where(company_locations == 'US', 40000, 0) + \
                    np.where(company_locations == 'GB', 20000, 0) + \
                    rng.normal(0, 15000, num_samples)
    # Clamp to a 30,000 floor and round to whole dollars.
    salary_in_usd = np.maximum(30000, salary_in_usd).round(0)
    return pd.DataFrame({
        'experience_level': experience_levels,
        'job_title': job_titles,
        'company_location': company_locations,
        'salary_in_usd': salary_in_usd
    })


@st.cache_data # Cache the data loading to avoid re-running on every interaction
def load_data():
    """Load the salary dataset, falling back to synthetic data if the CSV is missing.

    Returns:
        The DataFrame read from ``/content/ds_salaries.csv``, or a 500-row
        synthetic DataFrame when that file does not exist.
    """
    try:
        df = pd.read_csv('/content/ds_salaries.csv')
    except FileNotFoundError:
        # The message names the path that was actually tried.
        st.error("Error: '/content/ds_salaries.csv' not found. Please upload the dataset to that path.")
        st.info("Falling back to synthetic data for demonstration purposes.")
        return _generate_synthetic_data()
    st.success("Dataset 'ds_salaries.csv' loaded successfully.")
    return df

df = load_data()

# --- 2. Model Training ---
@st.cache_resource # Cache the model training to avoid re-training on every interaction
def train_model(dataframe):
    """Fit a one-hot-encoding + Random Forest pipeline that predicts ``salary_in_usd``.

    Args:
        dataframe: DataFrame containing the columns ``experience_level``,
            ``job_title``, ``company_location`` and ``salary_in_usd``.

    Returns:
        A ``(pipeline, feature_columns)`` tuple: the fitted ``Pipeline`` and the
        column index of the training features, so callers can build prediction
        inputs with the same columns.
    """
    # Single source of truth for the feature list; every selected feature is categorical.
    categorical_features = ['experience_level', 'job_title', 'company_location']
    X = dataframe[categorical_features]
    y = dataframe['salary_in_usd']

    # One-hot encode the categorical columns.
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'
    )

    # Preprocess, then regress; random_state is fixed so retraining is repeatable.
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
    ])

    model_pipeline.fit(X, y)
    return model_pipeline, X.columns # Return model and feature names for consistent input

model, feature_columns = train_model(df)

# --- 3. Streamlit UI ---
st.set_page_config(layout="centered", page_title="Data Science Salary Predictor", page_icon="💰")

# Custom CSS for styling
st.markdown(
    """
    <style>
    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');

    html, body, [class*="st-"] {
        font-family: 'Inter', sans-serif;
    }

    .main {
        background-color: #f8f9fa;
        padding: 30px;
        border-radius: 12px;
        box-shadow: 0 8px 16px rgba(0, 0, 0, 0.1);
        max-width: 800px;
        margin: auto;
    }
    .stButton>button {
        background-color: #007bff; /* Blue */
        color: white;
        font-weight: bold;
        border-radius: 10px;
        border: none;
        padding: 12px 25px;
        cursor: pointer;
        transition: background-color 0.3s ease, transform 0.2s ease;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
        width: 100%;
        font-size: 1.1em;
    }
    .stButton>button:hover {
        background-color: #0056b3; /* Darker blue */
        transform: translateY(-2px);
    }
    .stTextInput>div>div>input, .stSelectbox>div>div>select {
        border-radius: 8px;
        border: 1px solid #ced4da;
        padding: 10px;
        box-shadow: inset 0 1px 3px rgba(0, 0, 0, 0.05);
    }
    .stMarkdown h1 {
        color: #343a40;
        text-align: center;
        font-family: 'Inter', sans-serif;
        margin-bottom: 25px;
        font-size: 2.5em;
        font-weight: 700;
    }
    .stMarkdown h2 {
        color: #495057;
        font-family: 'Inter', sans-serif;
        margin-top: 35px;
        font-size: 1.8em;
        font-weight: 600;
    }
    .prediction-box {
        background-color: #e9f7ef; /* Light green */
        border: 2px solid #28a745; /* Green */
        border-radius: 12px;
        padding: 25px;
        text-align: center;
        margin-top: 40px;
        box-shadow: 0 6px 12px rgba(0, 0, 0, 0.15);
    }
    .prediction-text {
        font-size: 2.5em;
        font-weight: bold;
        color: #28a745; /* Green */
        margin-top: 10px;
        margin-bottom: 10px;
    }
    .stInfo, .stSuccess, .stError {
        border-radius: 8px;
        padding: 10px;
        margin-bottom: 15px;
    }
    </style>
    """,
    unsafe_allow_html=True
)

st.title("💰 Data Science Salary Predictor")
st.write("Enter the details below to get an estimated salary based on the `ds_salaries.csv` dataset.")

# Input fields for user
# Each default index is looked up in the very list handed to `options`: unique()
# keeps first-appearance order while value_counts() sorts by frequency, so an
# index taken from value_counts() can pre-select a different entry than intended.
experience_options = df['experience_level'].unique().tolist()
location_options = df['company_location'].unique().tolist()
job_title_options = df['job_title'].unique().tolist()

col1, col2 = st.columns(2)

with col1:
    experience_level = st.selectbox(
        "Experience Level",
        options=experience_options,
        index=experience_options.index('SE') if 'SE' in experience_options else 0, # Default to 'SE' (Senior) if available
        help="Select your professional experience level (EN: Entry, MI: Mid, SE: Senior, EX: Executive)."
    )

with col2:
    company_location = st.selectbox(
        "Company Location (Country Code)",
        options=location_options,
        index=location_options.index('US') if 'US' in location_options else 0, # Default to 'US' if available
        help="Select the country where the company is located (e.g., US, GB, IN)."
    )

job_title = st.selectbox(
    "Job Title",
    options=job_title_options,
    index=job_title_options.index('Data Scientist') if 'Data Scientist' in job_title_options else 0, # Default to 'Data Scientist' if available
    help="Choose your specific job title in the data science field."
)


# Prediction button
predict_clicked = st.button("Predict Salary")
if predict_clicked:
    # One-row frame whose columns are the feature columns returned by training.
    selected_values = [experience_level, job_title, company_location]
    input_data = pd.DataFrame([selected_values], columns=feature_columns)

    # The pipeline returns one prediction per input row; take the only one.
    predicted_salary = model.predict(input_data)[0]

    # Render the estimate inside the styled "prediction-box" container.
    prediction_html = f"""
        <div class="prediction-box">
            <h2>Estimated Salary:</h2>
            <p class="prediction-text">${predicted_salary:,.2f} USD</p>
            <p><i>(This is an estimated salary based on the trained model.)</i></p>
        </div>
        """
    st.markdown(prediction_html, unsafe_allow_html=True)

# Footer: a short description of the data and the model.
st.markdown("---")
st.write("### About this Model:")
record_count = len(df)
about_lines = [
    f"**Dataset used:** `ds_salaries.csv` (contains {record_count} records)",
    "**Features for prediction:** Experience Level, Job Title, Company Location",
    "**Machine Learning Algorithm:** Random Forest Regressor",
    "*(Disclaimer: This model is for demonstration and estimation purposes only. Real-world salaries can vary significantly based on many factors not included in this simplified model.)*",
]
for about_line in about_lines:
    st.write(about_line)


2025-07-18 14:50:33.508 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-07-18 14:50:34.008 Session state does not function when running a script without `streamlit run`


In [None]:
%%writefile salary_app.py
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np # Used for potential fallback data if CSV not found

# --- 1. Data Loading and Preparation ---
def _generate_synthetic_data(num_samples=500):
    """Build a reproducible synthetic salary table used when the CSV is unavailable.

    Args:
        num_samples: Number of rows to generate.

    Returns:
        DataFrame with the columns ``experience_level``, ``job_title``,
        ``company_location`` and ``salary_in_usd``.
    """
    # A private RandomState seeded with 42 draws the same values as the legacy
    # np.random.seed(42) global stream, but leaves NumPy's global RNG untouched.
    rng = np.random.RandomState(42)
    experience_levels = rng.choice(['EN', 'MI', 'SE', 'EX'], num_samples, p=[0.2, 0.3, 0.4, 0.1])
    job_titles = rng.choice([
        'Data Scientist', 'Machine Learning Engineer', 'Data Analyst',
        'AI Engineer', 'Data Engineer', 'Research Scientist'
    ], num_samples, p=[0.25, 0.20, 0.20, 0.15, 0.10, 0.10])
    company_locations = rng.choice(['US', 'GB', 'CA', 'IN', 'DE', 'FR'], num_samples, p=[0.4, 0.2, 0.1, 0.1, 0.1, 0.1])
    # Base salary plus additive bonuses per category, plus Gaussian noise.
    salary_in_usd = 60000 + \
                    np.where(experience_levels == 'MI', 30000, 0) + \
                    np.where(experience_levels == 'SE', 70000, 0) + \
                    np.where(experience_levels == 'EX', 120000, 0) + \
                    np.where(job_titles == 'Machine Learning Engineer', 20000, 0) + \
                    np.where(job_titles == 'Data Scientist', 15000, 0) + \
                    np.where(job_titles == 'Data Engineer', 10000, 0) + \
                    np.where(company_locations == 'US', 40000, 0) + \
                    np.where(company_locations == 'GB', 20000, 0) + \
                    rng.normal(0, 15000, num_samples)
    # Clamp to a 30,000 floor and round to whole dollars.
    salary_in_usd = np.maximum(30000, salary_in_usd).round(0)
    return pd.DataFrame({
        'experience_level': experience_levels,
        'job_title': job_titles,
        'company_location': company_locations,
        'salary_in_usd': salary_in_usd
    })


@st.cache_data # Cache the data loading to avoid re-running on every interaction
def load_data():
    """Load the salary dataset, falling back to synthetic data if the CSV is missing.

    Returns:
        The DataFrame read from ``ds_salaries.csv`` (resolved against the current
        working directory), or a 500-row synthetic DataFrame when that file does
        not exist.
    """
    try:
        df = pd.read_csv('ds_salaries.csv')
    except FileNotFoundError:
        st.error("Error: 'ds_salaries.csv' not found. Please ensure the file is in the same directory as this script.")
        st.info("Falling back to synthetic data for demonstration purposes.")
        return _generate_synthetic_data()
    st.success("Dataset 'ds_salaries.csv' loaded successfully.")
    return df

df = load_data()

# --- 2. Model Training ---
@st.cache_resource # Cache the model training to avoid re-training on every interaction
def train_model(dataframe):
    """Fit a one-hot-encoding + Random Forest pipeline that predicts ``salary_in_usd``.

    Args:
        dataframe: DataFrame containing the columns ``experience_level``,
            ``job_title``, ``company_location`` and ``salary_in_usd``.

    Returns:
        A ``(pipeline, feature_columns)`` tuple: the fitted ``Pipeline`` and the
        column index of the training features, so callers can build prediction
        inputs with the same columns.
    """
    # Single source of truth for the feature list; every selected feature is categorical.
    categorical_features = ['experience_level', 'job_title', 'company_location']
    X = dataframe[categorical_features]
    y = dataframe['salary_in_usd']

    # One-hot encode the categorical columns.
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'
    )

    # Preprocess, then regress; random_state is fixed so retraining is repeatable.
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
    ])

    model_pipeline.fit(X, y)
    return model_pipeline, X.columns # Return model and feature names for consistent input

model, feature_columns = train_model(df)

# --- 3. Streamlit UI ---
st.set_page_config(layout="centered", page_title="Data Science Salary Predictor", page_icon="💰")

# Custom CSS for styling
st.markdown(
    """
    <style>
    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');

    html, body, [class*="st-"] {
        font-family: 'Inter', sans-serif;
    }

    .main {
        background-color: #f8f9fa;
        padding: 30px;
        border-radius: 12px;
        box-shadow: 0 8px 16px rgba(0, 0, 0, 0.1);
        max-width: 800px;
        margin: auto;
    }
    .stButton>button {
        background-color: #007bff; /* Blue */
        color: white;
        font-weight: bold;
        border-radius: 10px;
        border: none;
        padding: 12px 25px;
        cursor: pointer;
        transition: background-color 0.3s ease, transform 0.2s ease;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
        width: 100%;
        font-size: 1.1em;
    }
    .stButton>button:hover {
        background-color: #0056b3; /* Darker blue */
        transform: translateY(-2px);
    }
    .stTextInput>div>div>input, .stSelectbox>div>div>select {
        border-radius: 8px;
        border: 1px solid #ced4da;
        padding: 10px;
        box-shadow: inset 0 1px 3px rgba(0, 0, 0, 0.05);
    }
    .stMarkdown h1 {
        color: #343a40;
        text-align: center;
        font-family: 'Inter', sans-serif;
        margin-bottom: 25px;
        font-size: 2.5em;
        font-weight: 700;
    }
    .stMarkdown h2 {
        color: #495057;
        font-family: 'Inter', sans-serif;
        margin-top: 35px;
        font-size: 1.8em;
        font-weight: 600;
    }
    .prediction-box {
        background-color: #e9f7ef; /* Light green */
        border: 2px solid #28a745; /* Green */
        border-radius: 12px;
        padding: 25px;
        text-align: center;
        margin-top: 40px;
        box-shadow: 0 6px 12px rgba(0, 0, 0, 0.15);
    }
    .prediction-text {
        font-size: 2.5em;
        font-weight: bold;
        color: #28a745; /* Green */
        margin-top: 10px;
        margin-bottom: 10px;
    }
    .stInfo, .stSuccess, .stError {
        border-radius: 8px;
        padding: 10px;
        margin-bottom: 15px;
    }
    </style>
    """,
    unsafe_allow_html=True
)

st.title("💰 Data Science Salary Predictor")
st.write("Enter the details below to get an estimated salary based on the `ds_salaries.csv` dataset.")

# Input fields for user
# Each default index is looked up in the very list handed to `options`: unique()
# keeps first-appearance order while value_counts() sorts by frequency, so an
# index taken from value_counts() can pre-select a different entry than intended.
experience_options = df['experience_level'].unique().tolist()
location_options = df['company_location'].unique().tolist()
job_title_options = df['job_title'].unique().tolist()

col1, col2 = st.columns(2)

with col1:
    experience_level = st.selectbox(
        "Experience Level",
        options=experience_options,
        index=experience_options.index('SE') if 'SE' in experience_options else 0, # Default to 'SE' (Senior) if available
        help="Select your professional experience level (EN: Entry, MI: Mid, SE: Senior, EX: Executive)."
    )

with col2:
    company_location = st.selectbox(
        "Company Location (Country Code)",
        options=location_options,
        index=location_options.index('US') if 'US' in location_options else 0, # Default to 'US' if available
        help="Select the country where the company is located (e.g., US, GB, IN)."
    )

job_title = st.selectbox(
    "Job Title",
    options=job_title_options,
    index=job_title_options.index('Data Scientist') if 'Data Scientist' in job_title_options else 0, # Default to 'Data Scientist' if available
    help="Choose your specific job title in the data science field."
)


# Prediction button
predict_clicked = st.button("Predict Salary")
if predict_clicked:
    # One-row frame whose columns are the feature columns returned by training.
    selected_values = [experience_level, job_title, company_location]
    input_data = pd.DataFrame([selected_values], columns=feature_columns)

    # The pipeline returns one prediction per input row; take the only one.
    predicted_salary = model.predict(input_data)[0]

    # Render the estimate inside the styled "prediction-box" container.
    prediction_html = f"""
        <div class="prediction-box">
            <h2>Estimated Salary:</h2>
            <p class="prediction-text">${predicted_salary:,.2f} USD</p>
            <p><i>(This is an estimated salary based on the trained model.)</i></p>
        </div>
        """
    st.markdown(prediction_html, unsafe_allow_html=True)

# Footer: a short description of the data and the model.
st.markdown("---")
st.write("### About this Model:")
record_count = len(df)
about_lines = [
    f"**Dataset used:** `ds_salaries.csv` (contains {record_count} records)",
    "**Features for prediction:** Experience Level, Job Title, Company Location",
    "**Machine Learning Algorithm:** Random Forest Regressor",
    "*(Disclaimer: This model is for demonstration and estimation purposes only. Real-world salaries can vary significantly based on many factors not included in this simplified model.)*",
]
for about_line in about_lines:
    st.write(about_line)

Writing salary_app.py


In [None]:
from pyngrok import ngrok
import os

# Optional: Authenticate ngrok for more stable tunnels.
# Get your authtoken from https://dashboard.ngrok.com/auth/your-authtoken
# os.environ["NGROK_AUTH_TOKEN"] = "YOUR_NGROK_AUTH_TOKEN" # Uncomment and replace with your token

# Kill any existing ngrok tunnels to avoid conflicts
ngrok.kill()

# Start ngrok tunnel on port 8501 (Streamlit's default port)
public_url = ngrok.connect(addr="8501", proto="http")
print(f"Streamlit App URL: {public_url}")

# Run the Streamlit app in the background
# The `nohup` and `&` keep it running even if the cell finishes execution
# `--server.enableCORS false --server.enableXsrfProtection false` are often needed for Colab
!nohup streamlit run salary_app.py --server.port 8501 --server.enableCORS false --server.enableXsrfProtection false > /dev/null 2>&1 &

ERROR:pyngrok.process.ngrok:t=2025-07-18T15:06:50+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"


PyngrokNgrokError: The ngrok process errored on start: authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n.

In [None]:
# Authenticate ngrok: tunnels fail with ERR_NGROK_4018 when no authtoken is set.
# Get your authtoken from https://dashboard.ngrok.com/get-started/your-authtoken
# The environment variable NAME must be NGROK_AUTHTOKEN; the token is only its
# value. It is prompted for without echo so the secret never lands in the notebook.
import os
from getpass import getpass

os.environ["NGROK_AUTHTOKEN"] = getpass("ngrok authtoken: ")

In [None]:
er.enableCORS false --server.enableXsrfProtection false > /dev/null 2>&1 &

ERROR:pyngrok.process.ngrok:t=2025-07-18T15:24:51+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2025-07-18T15:24:51+0000 lvl=eror msg="session closing" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"


PyngrokNgrokError: The ngrok process errored on start: authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n.

In [None]:
from pyngrok import ngrok
import os

# Ensure the ngrok authtoken is set.
# This line MUST NOT have a '#' at the very beginning.
# Replace "YOUR_NGROK_AUTH_TOKEN_HERE" with your actual token.
os.environ["NGROK_AUTH_TOKEN"] = "2zgcKtvuKihnzmipZi9hXDauRiQ_72okmhVB95A44Ue8KNxTG"

# --- Verification Step (Optional, but helpful for debugging) ---
# You can add this line to confirm the environment variable is set.
# It should print your token if set correctly, or None if not.
print(f"NGROK_AUTH_TOKEN environment variable: {os.getenv('NGROK_AUTH_TOKEN')}")
# --- End Verification Step ---

# Kill any existing ngrok tunnels to avoid conflicts
print("Killing any existing ngrok tunnels...")
ngrok.kill()
print("Existing tunnels killed.")

# Start ngrok tunnel on port 8501 (Streamlit's default port)
print("Attempting to connect ngrok tunnel...")
try:
    public_url = ngrok.connect(addr="8501", proto="http")
    print(f"Streamlit App URL: {public_url}")

    # Run the Streamlit app in the background
    print("Starting Streamlit app in background...")
    !nohup streamlit run salary_app.py --server.port 8501 --server.enableCORS false --server.enableXsrfProtection false > /dev/null 2>&1 &
    print("Streamlit app command sent to background.")

except Exception as e:
    print(f"An error occurred during ngrok connection: {e}")
    print("Please double-check your ngrok authtoken and ensure it's correctly set and uncommented.")

NGROK_AUTH_TOKEN environment variable: <redacted>
Killing any existing ngrok tunnels...
Existing tunnels killed.
Attempting to connect ngrok tunnel...


ERROR:pyngrok.process.ngrok:t=2025-07-18T16:14:34+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2025-07-18T16:14:34+0000 lvl=eror msg="session closing" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"


An error occurred during ngrok connection: The ngrok process errored on start: authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n.
Please double-check your ngrok authtoken and ensure it's correctly set and uncommented.


In [None]:
!pip install streamlit
!npm install -g localtunnel # Install localtunnel globally as a Node.js package

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K
added 22 packages in 3s
[1G[0K⠙[1G[0K
[1G[0K⠙[1G[0K3 packages are looking for funding
[1G[0K⠙[1G[0K  run `npm fund` for details
[1G[0K⠙[1G[0K

In [None]:
%%writefile salary_app.py
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np # Used for potential fallback data if CSV not found

# --- 1. Data Loading and Preparation ---
def _generate_synthetic_data(num_samples=500):
    """Build a reproducible synthetic salary table used when the CSV is unavailable.

    Args:
        num_samples: Number of rows to generate.

    Returns:
        DataFrame with the columns ``experience_level``, ``job_title``,
        ``company_location`` and ``salary_in_usd``.
    """
    # A private RandomState seeded with 42 draws the same values as the legacy
    # np.random.seed(42) global stream, but leaves NumPy's global RNG untouched.
    rng = np.random.RandomState(42)
    experience_levels = rng.choice(['EN', 'MI', 'SE', 'EX'], num_samples, p=[0.2, 0.3, 0.4, 0.1])
    job_titles = rng.choice([
        'Data Scientist', 'Machine Learning Engineer', 'Data Analyst',
        'AI Engineer', 'Data Engineer', 'Research Scientist'
    ], num_samples, p=[0.25, 0.20, 0.20, 0.15, 0.10, 0.10])
    company_locations = rng.choice(['US', 'GB', 'CA', 'IN', 'DE', 'FR'], num_samples, p=[0.4, 0.2, 0.1, 0.1, 0.1, 0.1])
    # Base salary plus additive bonuses per category, plus Gaussian noise.
    salary_in_usd = 60000 + \
                    np.where(experience_levels == 'MI', 30000, 0) + \
                    np.where(experience_levels == 'SE', 70000, 0) + \
                    np.where(experience_levels == 'EX', 120000, 0) + \
                    np.where(job_titles == 'Machine Learning Engineer', 20000, 0) + \
                    np.where(job_titles == 'Data Scientist', 15000, 0) + \
                    np.where(job_titles == 'Data Engineer', 10000, 0) + \
                    np.where(company_locations == 'US', 40000, 0) + \
                    np.where(company_locations == 'GB', 20000, 0) + \
                    rng.normal(0, 15000, num_samples)
    # Clamp to a 30,000 floor and round to whole dollars.
    salary_in_usd = np.maximum(30000, salary_in_usd).round(0)
    return pd.DataFrame({
        'experience_level': experience_levels,
        'job_title': job_titles,
        'company_location': company_locations,
        'salary_in_usd': salary_in_usd
    })


@st.cache_data # Cache the data loading to avoid re-running on every interaction
def load_data():
    """Load the salary dataset, falling back to synthetic data if the CSV is missing.

    Returns:
        The DataFrame read from ``/content/ds_salaries.csv``, or a 500-row
        synthetic DataFrame when that file does not exist.
    """
    try:
        df = pd.read_csv('/content/ds_salaries.csv')
    except FileNotFoundError:
        # The message names the path that was actually tried.
        st.error("Error: '/content/ds_salaries.csv' not found. Please upload the dataset to that path.")
        st.info("Falling back to synthetic data for demonstration purposes.")
        return _generate_synthetic_data()
    st.success("Dataset '/content/ds_salaries.csv' loaded successfully.")
    return df

df = load_data()

# --- 2. Model Training ---
@st.cache_resource # Cache the model training to avoid re-training on every interaction
def train_model(dataframe):
    """Fit a one-hot-encoding + Random Forest pipeline that predicts ``salary_in_usd``.

    Args:
        dataframe: DataFrame containing the columns ``experience_level``,
            ``job_title``, ``company_location`` and ``salary_in_usd``.

    Returns:
        A ``(pipeline, feature_columns)`` tuple: the fitted ``Pipeline`` and the
        column index of the training features, so callers can build prediction
        inputs with the same columns.
    """
    # Single source of truth for the feature list; every selected feature is categorical.
    categorical_features = ['experience_level', 'job_title', 'company_location']
    X = dataframe[categorical_features]
    y = dataframe['salary_in_usd']

    # One-hot encode the categorical columns.
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'
    )

    # Preprocess, then regress; random_state is fixed so retraining is repeatable.
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
    ])

    model_pipeline.fit(X, y)
    return model_pipeline, X.columns # Return model and feature names for consistent input

model, feature_columns = train_model(df)

# --- 3. Streamlit UI ---
st.set_page_config(layout="centered", page_title="Data Science Salary Predictor", page_icon="💰")

# Custom CSS for styling
st.markdown(
    """
    <style>
    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');

    html, body, [class*="st-"] {
        font-family: 'Inter', sans-serif;
    }

    .main {
        background-color: #f8f9fa;
        padding: 30px;
        border-radius: 12px;
        box-shadow: 0 8px 16px rgba(0, 0, 0, 0.1);
        max-width: 800px;
        margin: auto;
    }
    .stButton>button {
        background-color: #007bff; /* Blue */
        color: white;
        font-weight: bold;
        border-radius: 10px;
        border: none;
        padding: 12px 25px;
        cursor: pointer;
        transition: background-color 0.3s ease, transform 0.2s ease;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
        width: 100%;
        font-size: 1.1em;
    }
    .stButton>button:hover {
        background-color: #0056b3; /* Darker blue */
        transform: translateY(-2px);
    }
    .stTextInput>div>div>input, .stSelectbox>div>div>select {
        border-radius: 8px;
        border: 1px solid #ced4da;
        padding: 10px;
        box-shadow: inset 0 1px 3px rgba(0, 0, 0, 0.05);
    }
    .stMarkdown h1 {
        color: #343a40;
        text-align: center;
        font-family: 'Inter', sans-serif;
        margin-bottom: 25px;
        font-size: 2.5em;
        font-weight: 700;
    }
    .stMarkdown h2 {
        color: #495057;
        font-family: 'Inter', sans-serif;
        margin-top: 35px;
        font-size: 1.8em;
        font-weight: 600;
    }
    .prediction-box {
        background-color: #e9f7ef; /* Light green */
        border: 2px solid #28a745; /* Green */
        border-radius: 12px;
        padding: 25px;
        text-align: center;
        margin-top: 40px;
        box-shadow: 0 6px 12px rgba(0, 0, 0, 0.15);
    }
    .prediction-text {
        font-size: 2.5em;
        font-weight: bold;
        color: #28a745; /* Green */
        margin-top: 10px;
        margin-bottom: 10px;
    }
    .stInfo, .stSuccess, .stError {
        border-radius: 8px;
        padding: 10px;
        margin-bottom: 15px;
    }
    </style>
    """,
    unsafe_allow_html=True
)

st.title("💰 Data Science Salary Predictor")
st.write("Enter the details below to get an estimated salary based on the `ds_salaries.csv` dataset.")

# Input fields for user
# Each default index is looked up in the very list handed to `options`: unique()
# keeps first-appearance order while value_counts() sorts by frequency, so an
# index taken from value_counts() can pre-select a different entry than intended.
experience_options = df['experience_level'].unique().tolist()
location_options = df['company_location'].unique().tolist()
job_title_options = df['job_title'].unique().tolist()

col1, col2 = st.columns(2)

with col1:
    experience_level = st.selectbox(
        "Experience Level",
        options=experience_options,
        index=experience_options.index('SE') if 'SE' in experience_options else 0, # Default to 'SE' (Senior) if available
        help="Select your professional experience level (EN: Entry, MI: Mid, SE: Senior, EX: Executive)."
    )

with col2:
    company_location = st.selectbox(
        "Company Location (Country Code)",
        options=location_options,
        index=location_options.index('US') if 'US' in location_options else 0, # Default to 'US' if available
        help="Select the country where the company is located (e.g., US, GB, IN)."
    )

job_title = st.selectbox(
    "Job Title",
    options=job_title_options,
    index=job_title_options.index('Data Scientist') if 'Data Scientist' in job_title_options else 0, # Default to 'Data Scientist' if available
    help="Choose your specific job title in the data science field."
)


# Prediction button
predict_clicked = st.button("Predict Salary")
if predict_clicked:
    # One-row frame whose columns are the feature columns returned by training.
    selected_values = [experience_level, job_title, company_location]
    input_data = pd.DataFrame([selected_values], columns=feature_columns)

    # The pipeline returns one prediction per input row; take the only one.
    predicted_salary = model.predict(input_data)[0]

    # Render the estimate inside the styled "prediction-box" container.
    prediction_html = f"""
        <div class="prediction-box">
            <h2>Estimated Salary:</h2>
            <p class="prediction-text">${predicted_salary:,.2f} USD</p>
            <p><i>(This is an estimated salary based on the trained model.)</i></p>
        </div>
        """
    st.markdown(prediction_html, unsafe_allow_html=True)

# Footer: a short description of the data and the model.
st.markdown("---")
st.write("### About this Model:")
record_count = len(df)
about_lines = [
    f"**Dataset used:** `ds_salaries.csv` (contains {record_count} records)",
    "**Features for prediction:** Experience Level, Job Title, Company Location",
    "**Machine Learning Algorithm:** Random Forest Regressor",
    "*(Disclaimer: This model is for demonstration and estimation purposes only. Real-world salaries can vary significantly based on many factors not included in this simplified model.)*",
]
for about_line in about_lines:
    st.write(about_line)

Overwriting salary_app.py


In [None]:
%%writefile salary_app.py
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np # Used for potential fallback data if CSV not found

# --- 1. Data Loading and Preparation ---
def _generate_synthetic_data(num_samples=500):
    """Build a reproducible synthetic salary table for demonstration use.

    Args:
        num_samples: Number of rows to generate.

    Returns:
        pandas.DataFrame with the columns ``experience_level``, ``job_title``,
        ``company_location`` and ``salary_in_usd``.
    """
    # A private RandomState(42) produces the same draws as np.random.seed(42)
    # followed by np.random.* calls, without reseeding NumPy's global generator.
    rng = np.random.RandomState(42)
    experience_levels = rng.choice(['EN', 'MI', 'SE', 'EX'], num_samples, p=[0.2, 0.3, 0.4, 0.1])
    job_titles = rng.choice([
        'Data Scientist', 'Machine Learning Engineer', 'Data Analyst',
        'AI Engineer', 'Data Engineer', 'Research Scientist'
    ], num_samples, p=[0.25, 0.20, 0.20, 0.15, 0.10, 0.10])
    company_locations = rng.choice(['US', 'GB', 'CA', 'IN', 'DE', 'FR'], num_samples, p=[0.4, 0.2, 0.1, 0.1, 0.1, 0.1])

    # Base pay plus additive premiums per category, plus Gaussian noise
    salary_in_usd = (
        60000
        + np.where(experience_levels == 'MI', 30000, 0)
        + np.where(experience_levels == 'SE', 70000, 0)
        + np.where(experience_levels == 'EX', 120000, 0)
        + np.where(job_titles == 'Machine Learning Engineer', 20000, 0)
        + np.where(job_titles == 'Data Scientist', 15000, 0)
        + np.where(job_titles == 'Data Engineer', 10000, 0)
        + np.where(company_locations == 'US', 40000, 0)
        + np.where(company_locations == 'GB', 20000, 0)
        + rng.normal(0, 15000, num_samples)
    )
    # Floor at 30,000 so the noise cannot produce implausibly low salaries
    salary_in_usd = np.maximum(30000, salary_in_usd).round(0)

    return pd.DataFrame({
        'experience_level': experience_levels,
        'job_title': job_titles,
        'company_location': company_locations,
        'salary_in_usd': salary_in_usd
    })


@st.cache_data # Cache the data loading to avoid re-running on every interaction
def load_data():
    """Load ``ds_salaries.csv``, falling back to synthetic data when it is missing.

    Returns:
        pandas.DataFrame: the CSV contents, or the synthetic demonstration
        table when the file cannot be found. Only FileNotFoundError triggers
        the fallback; other read errors propagate to the caller.
    """
    try:
        df = pd.read_csv('ds_salaries.csv')
    except FileNotFoundError:
        st.error("Error: 'ds_salaries.csv' not found. Please ensure the file is in the same directory as this script.")
        st.info("Falling back to synthetic data for demonstration purposes.")
        return _generate_synthetic_data()

    st.success("Dataset 'ds_salaries.csv' loaded successfully.")
    return df

# Module-level dataset shared by model training and the UI widgets below;
# load_data() is wrapped in st.cache_data, so the CSV is not re-read on every rerun.
df = load_data()

# --- 2. Model Training ---
@st.cache_resource # Cache the model training to avoid re-training on every interaction
def train_model(dataframe):
    """Fit a one-hot-encoding + random-forest pipeline that predicts ``salary_in_usd``.

    Args:
        dataframe: Table containing the columns ``experience_level``,
            ``job_title``, ``company_location`` and ``salary_in_usd``.

    Returns:
        tuple: ``(model_pipeline, feature_columns)`` where ``model_pipeline`` is
        the fitted sklearn Pipeline and ``feature_columns`` is the pandas Index
        of feature names in training order.
    """
    # Single source of truth for the feature columns; all of them are categorical
    categorical_features = ['experience_level', 'job_title', 'company_location']
    X = dataframe[categorical_features]
    y = dataframe['salary_in_usd']

    # One-hot encode every feature; handle_unknown='ignore' lets prediction
    # proceed on categories that were not present at fit time
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'
    )

    # Preprocess, then regress; random_state is fixed for repeatable fits
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))])

    model_pipeline.fit(X, y)
    return model_pipeline, X.columns # Return model and feature names for consistent input

# Fit the pipeline (train_model is wrapped in st.cache_resource) and keep the
# training column order for building the prediction input later on.
model, feature_columns = train_model(df)

# --- 3. Streamlit UI ---
# NOTE(review): older Streamlit releases documented set_page_config as having to
# be the first Streamlit command of a run, yet load_data() above can already
# have emitted st.success/st.error by this point - confirm against the
# installed Streamlit version.
st.set_page_config(layout="centered", page_title="Data Science Salary Predictor", page_icon="💰")

# Custom CSS for styling: injected as a raw <style> element, which is why
# unsafe_allow_html=True is passed. The .prediction-box / .prediction-text rules
# target the result markup rendered after the "Predict Salary" button is clicked.
st.markdown(
    """
    <style>
    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');

    html, body, [class*="st-"] {
        font-family: 'Inter', sans-serif;
    }

    .main {
        background-color: #f8f9fa;
        padding: 30px;
        border-radius: 12px;
        box-shadow: 0 8px 16px rgba(0, 0, 0, 0.1);
        max-width: 800px;
        margin: auto;
    }
    .stButton>button {
        background-color: #007bff; /* Blue */
        color: white;
        font-weight: bold;
        border-radius: 10px;
        border: none;
        padding: 12px 25px;
        cursor: pointer;
        transition: background-color 0.3s ease, transform 0.2s ease;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
        width: 100%;
        font-size: 1.1em;
    }
    .stButton>button:hover {
        background-color: #0056b3; /* Darker blue */
        transform: translateY(-2px);
    }
    .stTextInput>div>div>input, .stSelectbox>div>div>select {
        border-radius: 8px;
        border: 1px solid #ced4da;
        padding: 10px;
        box-shadow: inset 0 1px 3px rgba(0, 0, 0, 0.05);
    }
    .stMarkdown h1 {
        color: #343a40;
        text-align: center;
        font-family: 'Inter', sans-serif;
        margin-bottom: 25px;
        font-size: 2.5em;
        font-weight: 700;
    }
    .stMarkdown h2 {
        color: #495057;
        font-family: 'Inter', sans-serif;
        margin-top: 35px;
        font-size: 1.8em;
        font-weight: 600;
    }
    .prediction-box {
        background-color: #e9f7ef; /* Light green */
        border: 2px solid #28a745; /* Green */
        border-radius: 12px;
        padding: 25px;
        text-align: center;
        margin-top: 40px;
        box-shadow: 0 6px 12px rgba(0, 0, 0, 0.15);
    }
    .prediction-text {
        font-size: 2.5em;
        font-weight: bold;
        color: #28a745; /* Green */
        margin-top: 10px;
        margin-bottom: 10px;
    }
    .stInfo, .stSuccess, .stError {
        border-radius: 8px;
        padding: 10px;
        margin-bottom: 15px;
    }
    </style>
    """,
    unsafe_allow_html=True
)

st.title("💰 Data Science Salary Predictor")
st.write("Enter the details below to get an estimated salary based on the `ds_salaries.csv` dataset.")


def _default_index(options, preferred):
    """Return the position of ``preferred`` within ``options``, or 0 when it is absent."""
    return options.index(preferred) if preferred in options else 0


# `index` is a position inside `options`, so each default is resolved against
# the exact list handed to the widget. Series.unique() keeps first-appearance
# order while value_counts() is ordered by frequency; looking the default up in
# the latter selects the wrong entry.
experience_level_options = df['experience_level'].unique().tolist()
company_location_options = df['company_location'].unique().tolist()
job_title_options = df['job_title'].unique().tolist()

# Input fields for user
col1, col2 = st.columns(2)

with col1:
    experience_level = st.selectbox(
        "Experience Level",
        options=experience_level_options,
        index=_default_index(experience_level_options, 'SE'), # Default to 'SE' (Senior) if available
        help="Select your professional experience level (EN: Entry, MI: Mid, SE: Senior, EX: Executive)."
    )

with col2:
    company_location = st.selectbox(
        "Company Location (Country Code)",
        options=company_location_options,
        index=_default_index(company_location_options, 'US'), # Default to 'US' if available
        help="Select the country where the company is located (e.g., US, GB, IN)."
    )

job_title = st.selectbox(
    "Job Title",
    options=job_title_options,
    index=_default_index(job_title_options, 'Data Scientist'), # Default to 'Data Scientist' if available
    help="Choose your specific job title in the data science field."
)


# Prediction button: inference runs only on the rerun triggered by a click
if st.button("Predict Salary"):
    # One-row frame whose columns follow the order captured at training time
    selected_values = [experience_level, job_title, company_location]
    input_data = pd.DataFrame([selected_values], columns=feature_columns)

    # predict() yields one value per input row; there is exactly one row here
    predicted_salary = model.predict(input_data)[0]

    # Render the estimate inside the result card styled by the CSS above
    result_card = f"""
        <div class="prediction-box">
            <h2>Estimated Salary:</h2>
            <p class="prediction-text">${predicted_salary:,.2f} USD</p>
            <p><i>(This is an estimated salary based on the trained model.)</i></p>
        </div>
        """
    st.markdown(result_card, unsafe_allow_html=True)

st.markdown("---")

# Static footer describing the data and model behind the estimate
about_lines = (
    "### About this Model:",
    f"**Dataset used:** `ds_salaries.csv` (contains {len(df)} records)",
    "**Features for prediction:** Experience Level, Job Title, Company Location",
    "**Machine Learning Algorithm:** Random Forest Regressor",
    "*(Disclaimer: This model is for demonstration and estimation purposes only. Real-world salaries can vary significantly based on many factors not included in this simplified model.)*",
)
for about_line in about_lines:
    st.write(about_line)

Overwriting salary_app.py


In [None]:
# Run Streamlit in the background on port 8501.
# stdout and stderr are both redirected to /dev/null, so startup errors are not
# visible from this cell. The flags switch off Streamlit's CORS and XSRF protection.
!nohup streamlit run salary_app.py --server.port 8501 --server.enableCORS false --server.enableXsrfProtection false > /dev/null 2>&1 &

# Give Streamlit a moment to start (fixed 5-second wait; readiness is not checked)
import time
time.sleep(5)

# Start localtunnel against the same port and print the URL.
# NOTE(review): assumes the `lt` CLI was installed in an earlier cell - not visible here.
# The recorded output ends with ^C, so this command presumably keeps running until interrupted.
print("\n--- YOUR STREAMLIT APP URL ---") # Added a clear header
!lt --port 8501
print("------------------------------")


--- YOUR STREAMLIT APP URL ---
your url is: https://hip-memes-yawn.loca.lt
^C
------------------------------


In [1]:
# Run this cell to install everything
!pip install streamlit pandas scikit-learn numpy pyngrok -q


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m454.3 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
%%writefile salary_app.py
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np # Used for potential fallback data if CSV not found

# --- 1. Data Loading and Preparation ---
def _generate_synthetic_data(num_samples=500):
    """Build a reproducible synthetic salary table for demonstration use.

    Args:
        num_samples: Number of rows to generate.

    Returns:
        pandas.DataFrame with the columns ``experience_level``, ``job_title``,
        ``company_location`` and ``salary_in_usd``.
    """
    # A private RandomState(42) produces the same draws as np.random.seed(42)
    # followed by np.random.* calls, without reseeding NumPy's global generator.
    rng = np.random.RandomState(42)
    experience_levels = rng.choice(['EN', 'MI', 'SE', 'EX'], num_samples, p=[0.2, 0.3, 0.4, 0.1])
    job_titles = rng.choice([
        'Data Scientist', 'Machine Learning Engineer', 'Data Analyst',
        'AI Engineer', 'Data Engineer', 'Research Scientist'
    ], num_samples, p=[0.25, 0.20, 0.20, 0.15, 0.10, 0.10])
    company_locations = rng.choice(['US', 'GB', 'CA', 'IN', 'DE', 'FR'], num_samples, p=[0.4, 0.2, 0.1, 0.1, 0.1, 0.1])

    # Base pay plus additive premiums per category, plus Gaussian noise
    salary_in_usd = (
        60000
        + np.where(experience_levels == 'MI', 30000, 0)
        + np.where(experience_levels == 'SE', 70000, 0)
        + np.where(experience_levels == 'EX', 120000, 0)
        + np.where(job_titles == 'Machine Learning Engineer', 20000, 0)
        + np.where(job_titles == 'Data Scientist', 15000, 0)
        + np.where(job_titles == 'Data Engineer', 10000, 0)
        + np.where(company_locations == 'US', 40000, 0)
        + np.where(company_locations == 'GB', 20000, 0)
        + rng.normal(0, 15000, num_samples)
    )
    # Floor at 30,000 so the noise cannot produce implausibly low salaries
    salary_in_usd = np.maximum(30000, salary_in_usd).round(0)

    return pd.DataFrame({
        'experience_level': experience_levels,
        'job_title': job_titles,
        'company_location': company_locations,
        'salary_in_usd': salary_in_usd
    })


@st.cache_data # Cache the data loading to avoid re-running on every interaction
def load_data():
    """Load ``/content/ds_salaries.csv``, falling back to synthetic data when it is missing.

    Returns:
        pandas.DataFrame: the CSV contents, or the synthetic demonstration
        table when the file cannot be found. Only FileNotFoundError triggers
        the fallback; other read errors propagate to the caller.
    """
    try:
        # No success banner is shown on the happy path (kept quiet for Colab)
        return pd.read_csv('/content/ds_salaries.csv')
    except FileNotFoundError:
        st.error("Error: 'ds_salaries.csv' not found. Please ensure the file is uploaded to the Colab session.")
        st.info("Falling back to synthetic data for demonstration purposes.")
        return _generate_synthetic_data()

# Module-level dataset shared by model training and the UI widgets below;
# load_data() is wrapped in st.cache_data, so the CSV is not re-read on every rerun.
df = load_data()

# --- 2. Model Training ---
@st.cache_resource # Cache the model training to avoid re-training on every interaction
def train_model(dataframe):
    """Fit the salary pipeline (one-hot encoding followed by a random forest).

    Args:
        dataframe: Table containing ``experience_level``, ``job_title``,
            ``company_location`` and the target ``salary_in_usd``.

    Returns:
        tuple: the fitted Pipeline and the pandas Index of feature column
        names in training order.
    """
    feature_names = ['experience_level', 'job_title', 'company_location']
    features = dataframe[feature_names]
    target = dataframe['salary_in_usd']

    # Every feature is categorical; unseen categories are ignored at predict time
    encoder = OneHotEncoder(handle_unknown='ignore')
    preprocessor = ColumnTransformer(
        transformers=[('cat', encoder, feature_names)],
        remainder='passthrough',
    )

    regressor = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', regressor),
    ])
    model_pipeline.fit(features, target)

    return model_pipeline, features.columns

# Fit the pipeline (train_model is wrapped in st.cache_resource) and keep the
# training column order for building the prediction input later on.
model, feature_columns = train_model(df)

# --- 3. Streamlit UI ---
# NOTE(review): older Streamlit releases documented set_page_config as having to
# be the first Streamlit command of a run, yet load_data() above can already
# have emitted st.error/st.info on the fallback path - confirm against the
# installed Streamlit version.
st.set_page_config(layout="centered", page_title="Data Science Salary Predictor", page_icon="💰")
# Custom CSS injected as a raw <style> element, which is why unsafe_allow_html=True
# is passed. The .prediction-box / .prediction-text rules target the result markup
# rendered after the "Predict Salary" button is clicked.
st.markdown(
    """
    <style>
    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');
    html, body, [class*="st-"] { font-family: 'Inter', sans-serif; }
    .main { background-color: #f8f9fa; padding: 30px; border-radius: 12px; box-shadow: 0 8px 16px rgba(0, 0, 0, 0.1); max-width: 800px; margin: auto; }
    .stButton>button { background-color: #007bff; color: white; font-weight: bold; border-radius: 10px; border: none; padding: 12px 25px; cursor: pointer; transition: background-color 0.3s ease, transform 0.2s ease; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2); width: 100%; font-size: 1.1em; }
    .stButton>button:hover { background-color: #0056b3; transform: translateY(-2px); }
    .stTextInput>div>div>input, .stSelectbox>div>div>select { border-radius: 8px; border: 1px solid #ced4da; padding: 10px; box-shadow: inset 0 1px 3px rgba(0, 0, 0, 0.05); }
    .stMarkdown h1 { color: #343a40; text-align: center; font-family: 'Inter', sans-serif; margin-bottom: 25px; font-size: 2.5em; font-weight: 700; }
    .stMarkdown h2 { color: #495057; font-family: 'Inter', sans-serif; margin-top: 35px; font-size: 1.8em; font-weight: 600; }
    .prediction-box { background-color: #e9f7ef; border: 2px solid #28a745; border-radius: 12px; padding: 25px; text-align: center; margin-top: 40px; box-shadow: 0 6px 12px rgba(0, 0, 0, 0.15); }
    .prediction-text { font-size: 2.5em; font-weight: bold; color: #28a745; margin-top: 10px; margin-bottom: 10px; }
    .stInfo, .stSuccess, .stError { border-radius: 8px; padding: 10px; margin-bottom: 15px; }
    </style>
    """,
    unsafe_allow_html=True
)

st.title("💰 Data Science Salary Predictor")
st.write("Enter the details below to get an estimated salary based on the `ds_salaries.csv` dataset.")


def _default_index(options, preferred):
    """Return the position of ``preferred`` within ``options``, or 0 when it is absent."""
    return options.index(preferred) if preferred in options else 0


# `index` is a position inside `options`, so each default is resolved against
# the exact list handed to the widget. Series.unique() keeps first-appearance
# order while value_counts() is ordered by frequency; looking the default up in
# the latter selects the wrong entry.
experience_level_options = df['experience_level'].unique().tolist()
company_location_options = df['company_location'].unique().tolist()
job_title_options = df['job_title'].unique().tolist()

col1, col2 = st.columns(2)

with col1:
    experience_level = st.selectbox(
        "Experience Level",
        options=experience_level_options,
        index=_default_index(experience_level_options, 'SE'),  # Senior by default when present
        help="Select your professional experience level (EN: Entry, MI: Mid, SE: Senior, EX: Executive)."
    )

with col2:
    company_location = st.selectbox(
        "Company Location (Country Code)",
        options=company_location_options,
        index=_default_index(company_location_options, 'US'),  # US by default when present
        help="Select the country where the company is located (e.g., US, GB, IN)."
    )

job_title = st.selectbox(
    "Job Title",
    options=job_title_options,
    index=_default_index(job_title_options, 'Data Scientist'),  # Data Scientist by default when present
    help="Choose your specific job title in the data science field."
)

# Inference runs only on the rerun triggered by clicking the button
if st.button("Predict Salary"):
    # One-row frame whose columns follow the order captured at training time
    selected_values = [experience_level, job_title, company_location]
    input_data = pd.DataFrame([selected_values], columns=feature_columns)

    # predict() yields one value per input row; there is exactly one row here
    predicted_salary = model.predict(input_data)[0]

    # Render the estimate inside the result card styled by the CSS above
    result_card = f"""
        <div class="prediction-box">
            <h2>Estimated Salary:</h2>
            <p class="prediction-text">${predicted_salary:,.2f} USD</p>
            <p><i>(This is an estimated salary based on the trained model.)</i></p>
        </div>
        """
    st.markdown(result_card, unsafe_allow_html=True)

st.markdown("---")

# Static footer describing the data and model behind the estimate
about_lines = (
    "### About this Model:",
    f"**Dataset used:** `ds_salaries.csv` (contains {len(df)} records)",
    "**Features for prediction:** Experience Level, Job Title, Company Location",
    "**Machine Learning Algorithm:** Random Forest Regressor",
    "*(Disclaimer: This model is for demonstration and estimation purposes only.)*",
)
for about_line in about_lines:
    st.write(about_line)

Writing salary_app.py


In [6]:
import os
from getpass import getpass

from pyngrok import ngrok

# --- NGROK AUTHTOKEN ---
# The authtoken is a credential and must never be stored in the notebook.
# It is read from the NGROK_AUTHTOKEN environment variable; when that is unset
# the cell prompts for it (the input is not echoed).
# Get your token from https://dashboard.ngrok.com/get-started/your-authtoken
# NOTE: a token was previously hardcoded in this cell - revoke it in the ngrok
# dashboard and issue a new one.
authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
if not authtoken:
    raise ValueError("An ngrok authtoken is required to open the tunnel.")
ngrok.set_auth_token(authtoken)

# Terminate any existing ngrok tunnels (good practice)
ngrok.kill()

# Run the streamlit app in the background (the trailing & detaches it from this cell)
os.system("streamlit run salary_app.py &")

# Create a public URL to the streamlit app (Streamlit's default port)
public_url = ngrok.connect(8501)
print("✅ Your app is live!")
# Print the bare URL rather than the NgrokTunnel object's repr
print("Click this link to open:", public_url.public_url)

✅ Your app is live!
Click this link to open: NgrokTunnel: "https://85df130f9bc2.ngrok-free.app" -> "http://localhost:8501"
