## Running LR with Hyperparamter Tuning

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the preprocessed dataset
dataset = pd.read_csv('preprocessed_mental_health_text.csv')

# Check if 'cleaned_text' and 'mental_health_issue' columns exist
if 'cleaned_text' not in dataset.columns or 'mental_health_issue' not in dataset.columns:
    raise ValueError("The dataset must have 'cleaned_text' and 'mental_health_issue' columns.")

# Remove rows with missing values in 'cleaned_text' column
dataset.dropna(subset=['cleaned_text'], inplace=True)

# Initialize the CountVectorizer and fit/transform the cleaned text
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(dataset['cleaned_text'])

# Prepare the target variable
y = dataset['mental_health_issue']

# Split the dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=500)

# Define the hyperparameter grid for Randomized Search
param_distributions = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],       # Inverse of regularization strength
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Regularization types
    'solver': ['liblinear', 'saga']             # Solvers that support l1, elasticnet
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_distributions,
                                   n_iter=10, scoring='accuracy', cv=5, n_jobs=-1, random_state=42)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Best hyperparameters from Random Search
print("Best Hyperparameters:", random_search.best_params_)

# Best model from Random Search
best_model = random_search.best_estimator_

# Make predictions using the best model
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))


15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1204, in fit
    raise ValueError("l1_ratio must be specified when penalty is elasticnet.")
ValueError: l1_ratio must be specified when penalty is elasticnet.

---------------------------------

Best Hyperparameters: {'solver': 'saga', 'penalty': 'l2', 'C': 0.1}
Accuracy: 75.96%
Classification Report:
               precision    recall  f1-score   support

     anxiety       0.78      0.78      0.78       416
     bipolar       0.74      0.77      0.75       412
  depression       0.73      0.79      0.76       443
     neutral       0.12      0.18      0.14        17
      normal       0.89      0.25      0.39        32
        ptsd       0.84      0.76      0.80       427

    accuracy                           0.76      1747
   macro avg       0.68      0.59      0.60      1747
weighted avg       0.77      0.76      0.76      1747





## Saving the model and Vectorizer

In [None]:
import joblib

# Save the best model and vectorizer
joblib.dump(best_model, 'logistic_regression_model.pkl')
joblib.dump(vectorizer, 'count_vectorizer.pkl')


['count_vectorizer.pkl']

## Streamlit App

In [16]:
!pip install streamlit
!pip install pyngrok


Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.0


In [24]:
%%writefile app.py
import streamlit as st
import joblib

# Load the saved model and vectorizer
model = joblib.load('/content/logistic_regression_model.pkl')
vectorizer = joblib.load('/content/count_vectorizer.pkl')

# Define the Streamlit app
def run_app():
    st.title('Mental Health Classifier')

    # Text input
    input_text = st.text_area("Enter your text here:")

    # Button to make prediction
    if st.button("Classify"):
        if input_text.strip() == "":
            st.write("Please enter some text to classify.")
        else:
            # Preprocess and vectorize the input
            input_vectorized = vectorizer.transform([input_text])

            # Make prediction
            prediction = model.predict(input_vectorized)

            # Output the result
            st.write(f"The predicted mental health issue is: {prediction[0]}")

# Run the app
if __name__ == '__main__':
    run_app()


Writing app.py


In [26]:
# Import ngrok
from pyngrok import ngrok

# Start Streamlit with nohup
!nohup streamlit run app.py &

# Create a public URL with ngrok to access the app
# The issue is that you're passing `port` instead of `addr`
# `ngrok.connect` expects `addr` which should be in the format "protocol://address:port"
# In your case, you're running streamlit on port 8501, so it should be "http://localhost:8501"
public_url = ngrok.connect(addr='8501')
print(f"Public URL: {public_url}")

nohup: appending output to 'nohup.out'
Public URL: NgrokTunnel: "https://7a48-34-125-35-114.ngrok-free.app" -> "http://localhost:8501"
