In [54]:
## normal logistic regression
## preprocessed data import
import numpy as np
import pandas as pd
import math as m
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Load the dataset
url = "https://raw.githubusercontent.com/The-Girlies/Wellness-Watch/main/processed_data.csv"
data = pd.read_csv(url)

data = data.dropna(subset=['statement', 'status'])
data_used = data[['statement', 'status']]

In [55]:
## binary: testing Normal and everything else

data_used['status_value'] = (data_used['status'] == 'Normal').astype(int)

# unique_status = np.unique(data['status'])
# print(f"Unique status values: {unique_status}\n")

X = data_used['statement']
y = data_used['status_value']

vectorizer = TfidfVectorizer()
word_frequency = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(word_frequency, y, test_size=0.2, random_state=24, stratify=y)

LogisticModel = LogisticRegression()
LogisticModel.fit(X_train, y_train)

y_pred = LogisticModel.predict(X_test)

## Accuracy and classification report
print(f"Accuracy (Normal): {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

## Cost of logistic regression
y_probability = LogisticModel.predict_proba(X_test)[:, 1]
logCost = log_loss(y_pred, y_probability)
print(f"Log-loss Cost (Normal): {np.round(logCost, 10)}")

Accuracy (Normal): 0.9425832779728576
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      7268
           1       0.90      0.92      0.91      3269

    accuracy                           0.94     10537
   macro avg       0.93      0.94      0.93     10537
weighted avg       0.94      0.94      0.94     10537

Log-loss Cost (Normal): 0.1025131363


In [56]:
## binary: testing Anxiety and everything else

data_used['status_value'] = (data_used['status'] == 'Anxiety').astype(int)

X = data_used['statement']
y = data_used['status_value']

vectorizer = TfidfVectorizer()
word_frequency = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(word_frequency, y, test_size=0.2, random_state=24, stratify=y)

LogisticModel = LogisticRegression()
LogisticModel.fit(X_train, y_train)

y_pred = LogisticModel.predict(X_test)

## Accuracy and classification report
print(f"Accuracy (Anxiety): {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

## Cost of logistic regression
y_probability = LogisticModel.predict_proba(X_test)[:, 1]
logCost = log_loss(y_pred, y_probability)
print(f"Log-loss Cost (Anxiety): {np.round(logCost, 10)}")

Accuracy (Anxiety): 0.9615640125272849
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      9769
           1       0.87      0.55      0.68       768

    accuracy                           0.96     10537
   macro avg       0.92      0.77      0.83     10537
weighted avg       0.96      0.96      0.96     10537

Log-loss Cost (Anxiety): 0.0508027424


In [57]:
## binary: testing Bipolar and everything else

data_used['status_value'] = (data_used['status'] == 'Bipolar').astype(int)

X = data_used['statement']
y = data_used['status_value']

vectorizer = TfidfVectorizer()
word_frequency = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(word_frequency, y, test_size=0.2, random_state=24, stratify=y)

LogisticModel = LogisticRegression()
LogisticModel.fit(X_train, y_train)

y_pred = LogisticModel.predict(X_test)

## Accuracy and classification report
print(f"Accuracy (Bipolar): {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

## Cost of logistic regression
y_probability = LogisticModel.predict_proba(X_test)[:, 1]
logCost = log_loss(y_pred, y_probability)
print(f"Log-loss Cost (Bipolar): {np.round(logCost, 10)}")

Accuracy (Bipolar): 0.9676378475847015
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      9982
           1       0.90      0.43      0.59       555

    accuracy                           0.97     10537
   macro avg       0.93      0.72      0.78     10537
weighted avg       0.97      0.97      0.96     10537

Log-loss Cost (Bipolar): 0.0394306238


In [58]:
## binary: testing Depression and everything else

data_used['status_value'] = (data_used['status'] == 'Depression').astype(int)

X = data_used['statement']
y = data_used['status_value']

vectorizer = TfidfVectorizer()
word_frequency = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(word_frequency, y, test_size=0.2, random_state=24, stratify=y)

LogisticModel = LogisticRegression()
LogisticModel.fit(X_train, y_train)

y_pred = LogisticModel.predict(X_test)

## Accuracy and classification report
print(f"Accuracy (Depression): {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

## Cost of logistic regression
y_probability = LogisticModel.predict_proba(X_test)[:, 1]
logCost = log_loss(y_pred, y_probability)
print(f"Log-loss Cost (Depression): {np.round(logCost, 10)}")

Accuracy (Depression): 0.8177849482774984
              precision    recall  f1-score   support

           0       0.83      0.93      0.88      7456
           1       0.76      0.56      0.64      3081

    accuracy                           0.82     10537
   macro avg       0.80      0.74      0.76     10537
weighted avg       0.81      0.82      0.81     10537

Log-loss Cost (Depression): 0.2311239866


In [59]:
## binary: testing Personality disorder and everything else

data_used['status_value'] = (data_used['status'] == 'Personality disorder').astype(int)

X = data_used['statement']
y = data_used['status_value']

vectorizer = TfidfVectorizer()
word_frequency = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(word_frequency, y, test_size=0.2, random_state=24, stratify=y)

LogisticModel = LogisticRegression()
LogisticModel.fit(X_train, y_train)

y_pred = LogisticModel.predict(X_test)

## Accuracy and classification report
print(f"Accuracy (Personality disorder): {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

## Cost of logistic regression
y_probability = LogisticModel.predict_proba(X_test)[:, 1]
logCost = log_loss(y_pred, y_probability)
print(f"Log-loss Cost (Personality disorder): {np.round(logCost, 10)}")

Accuracy (Personality disorder): 0.983296953592104
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     10322
           1       1.00      0.18      0.31       215

    accuracy                           0.98     10537
   macro avg       0.99      0.59      0.65     10537
weighted avg       0.98      0.98      0.98     10537

Log-loss Cost (Personality disorder): 0.0181866751


In [60]:
## binary: testing Stress and everything else

data_used['status_value'] = (data_used['status'] == 'Stress').astype(int)

X = data_used['statement']
y = data_used['status_value']

vectorizer = TfidfVectorizer()
word_frequency = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(word_frequency, y, test_size=0.2, random_state=24, stratify=y)

LogisticModel = LogisticRegression()
LogisticModel.fit(X_train, y_train)

y_pred = LogisticModel.predict(X_test)

## Accuracy and classification report
print(f"Accuracy (Stress): {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

## Cost of logistic regression
y_probability = LogisticModel.predict_proba(X_test)[:, 1]
logCost = log_loss(y_pred, y_probability)
print(f"Log-loss Cost (Stress): {np.round(logCost, 10)}")

Accuracy (Stress): 0.9592863243807536
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     10020
           1       0.84      0.21      0.33       517

    accuracy                           0.96     10537
   macro avg       0.90      0.60      0.66     10537
weighted avg       0.95      0.96      0.95     10537

Log-loss Cost (Stress): 0.0456689442


In [61]:
## binary: testing Suicidal and everything else

data_used['status_value'] = (data_used['status'] == 'Suicidal').astype(int)

X = data_used['statement']
y = data_used['status_value']

vectorizer = TfidfVectorizer()
word_frequency = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(word_frequency, y, test_size=0.2, random_state=24, stratify=y)

LogisticModel = LogisticRegression()
LogisticModel.fit(X_train, y_train)

y_pred = LogisticModel.predict(X_test)

## Accuracy and classification report
print(f"Accuracy (Suicidal): {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

## Cost of logistic regression
y_probability = LogisticModel.predict_proba(X_test)[:, 1]
logCost = log_loss(y_pred, y_probability)
print(f"Log-loss Cost (Suicidal): {np.round(logCost, 10)}")

Accuracy (Suicidal): 0.8538483439309101
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      8406
           1       0.71      0.48      0.57      2131

    accuracy                           0.85     10537
   macro avg       0.79      0.71      0.74     10537
weighted avg       0.84      0.85      0.84     10537

Log-loss Cost (Suicidal): 0.1689629304


In [62]:
## normal logistic regression
## preprocessed data import
import numpy as np
import pandas as pd
import math as m
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Load the dataset
url = "https://raw.githubusercontent.com/The-Girlies/Wellness-Watch/main/processed_data.csv"
data = pd.read_csv(url)

data = data.dropna(subset=['statement', 'status'])
data_used = data[['statement', 'status']]

## Multiclass Regression (Softmax (Multinomial Logistic Regression)) 
## multi_class = multinomial, solver = lbfgs

data_soft = data[['statement', 'status']]

X_soft = data_soft['statement']
y_soft = data_soft['status']

soft_vectorizer = TfidfVectorizer()
X_soft_tfidf = soft_vectorizer.fit_transform(X)

X_soft_train, X_soft_test, y_soft_train, y_soft_test = train_test_split(X_soft_tfidf, y_soft, test_size=0.2, random_state=42, stratify=y_soft)

model_softmax = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, tol=1e-4)
# report that max_iter changed from 100 to max_iter=1000
model_softmax.fit(X_soft_train, y_soft_train)

y_pred_softmax = model_softmax.predict(X_soft_test)

# Accuracy for Softmax
print("Softmax (Multinomial) Accuracy:", accuracy_score(y_soft_test, y_pred_softmax))
print(classification_report(y_soft_test, y_pred_softmax))

# Cost for Softmax
y_prob_softmax = model_softmax.predict_proba(X_soft_test)
cost_softmax = log_loss(y_soft_test, y_prob_softmax)
print(f'Softmax (Multiclass) Log-Loss (Cost): {cost_softmax}')

Softmax (Multinomial) Accuracy: 0.7512574736642308
                      precision    recall  f1-score   support

             Anxiety       0.85      0.71      0.77       768
             Bipolar       0.88      0.62      0.73       556
          Depression       0.67      0.74      0.71      3081
              Normal       0.84      0.95      0.89      3269
Personality disorder       0.91      0.39      0.54       215
              Stress       0.63      0.39      0.48       517
            Suicidal       0.67      0.63      0.65      2131

            accuracy                           0.75     10537
           macro avg       0.78      0.63      0.68     10537
        weighted avg       0.75      0.75      0.74     10537

Softmax (Multiclass) Log-Loss (Cost): 0.6795263560032236


In [63]:
## normal logistic regression
## preprocessed data import
import numpy as np
import pandas as pd
import math as m
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Load the dataset
url = "https://raw.githubusercontent.com/The-Girlies/Wellness-Watch/main/processed_data.csv"
data = pd.read_csv(url)

data = data.dropna(subset=['statement', 'status'])
data_used = data[['statement', 'status']]

## Multiclass Regression (Softmax (Multinomial Logistic Regression)) 
## multi_class = multinomial, solver = sag

data_soft = data[['statement', 'status']]

X_soft = data_soft['statement']
y_soft = data_soft['status']

soft_vectorizer = TfidfVectorizer()
X_soft_tfidf = soft_vectorizer.fit_transform(X)

X_soft_train, X_soft_test, y_soft_train, y_soft_test = train_test_split(X_soft_tfidf, y_soft, test_size=0.2, random_state=42, stratify=y_soft)

model_softmax = LogisticRegression(multi_class='multinomial', solver='sag', max_iter=1000, tol=1e-4)
# report that max_iter changed from 100 to max_iter=1000
model_softmax.fit(X_soft_train, y_soft_train)

y_pred_softmax = model_softmax.predict(X_soft_test)

# Accuracy for Softmax
print("Softmax (Multinomial) Accuracy:", accuracy_score(y_soft_test, y_pred_softmax))
print(classification_report(y_soft_test, y_pred_softmax))

# Cost for Softmax
y_prob_softmax = model_softmax.predict_proba(X_soft_test)
cost_softmax = log_loss(y_soft_test, y_prob_softmax)
print(f'Softmax (Multiclass) Log-Loss (Cost): {cost_softmax}')

Softmax (Multinomial) Accuracy: 0.7504033406092816
                      precision    recall  f1-score   support

             Anxiety       0.84      0.71      0.77       768
             Bipolar       0.88      0.62      0.73       556
          Depression       0.67      0.74      0.71      3081
              Normal       0.84      0.95      0.89      3269
Personality disorder       0.91      0.38      0.54       215
              Stress       0.62      0.39      0.48       517
            Suicidal       0.67      0.63      0.65      2131

            accuracy                           0.75     10537
           macro avg       0.78      0.63      0.68     10537
        weighted avg       0.75      0.75      0.74     10537

Softmax (Multiclass) Log-Loss (Cost): 0.67962029677151


In [64]:
## normal logistic regression
## preprocessed data import
import numpy as np
import pandas as pd
import math as m
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Load the dataset
url = "https://raw.githubusercontent.com/The-Girlies/Wellness-Watch/main/processed_data.csv"
data = pd.read_csv(url)

data = data.dropna(subset=['statement', 'status'])
data_used = data[['statement', 'status']]

## Multiclass Regression (Softmax (Multinomial Logistic Regression)) 
## multi_class = multinomial, solver = saga

data_soft = data[['statement', 'status']]

X_soft = data_soft['statement']
y_soft = data_soft['status']

soft_vectorizer = TfidfVectorizer()
X_soft_tfidf = soft_vectorizer.fit_transform(X)

X_soft_train, X_soft_test, y_soft_train, y_soft_test = train_test_split(X_soft_tfidf, y_soft, test_size=0.2, random_state=42, stratify=y_soft)

model_softmax = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=1000, tol=1e-4)
# report that max_iter changed from 100 to max_iter=1000
model_softmax.fit(X_soft_train, y_soft_train)

y_pred_softmax = model_softmax.predict(X_soft_test)

# Accuracy for Softmax
print("Softmax (Multinomial) Accuracy:", accuracy_score(y_soft_test, y_pred_softmax))
print(classification_report(y_soft_test, y_pred_softmax))

# Cost for Softmax
y_prob_softmax = model_softmax.predict_proba(X_soft_test)
cost_softmax = log_loss(y_soft_test, y_prob_softmax)
print(f'Softmax (Multiclass) Log-Loss (Cost): {cost_softmax}')

Softmax (Multinomial) Accuracy: 0.7503084369365094
                      precision    recall  f1-score   support

             Anxiety       0.84      0.71      0.77       768
             Bipolar       0.88      0.62      0.73       556
          Depression       0.67      0.74      0.71      3081
              Normal       0.84      0.95      0.89      3269
Personality disorder       0.91      0.39      0.54       215
              Stress       0.62      0.39      0.48       517
            Suicidal       0.67      0.62      0.65      2131

            accuracy                           0.75     10537
           macro avg       0.78      0.63      0.68     10537
        weighted avg       0.75      0.75      0.74     10537

Softmax (Multiclass) Log-Loss (Cost): 0.679661004101361


In [65]:
## normal logistic regression
## preprocessed data import
import numpy as np
import pandas as pd
import math as m
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Load the dataset
url = "https://raw.githubusercontent.com/The-Girlies/Wellness-Watch/main/processed_data.csv"
data = pd.read_csv(url)

data = data.dropna(subset=['statement', 'status'])
data_used = data[['statement', 'status']]

## Multiclass Regression (Softmax (Multinomial Logistic Regression)) 
## multi_class = multinomial, solver = newton-cg

data_soft = data[['statement', 'status']]

X_soft = data_soft['statement']
y_soft = data_soft['status']

soft_vectorizer = TfidfVectorizer()
X_soft_tfidf = soft_vectorizer.fit_transform(X)

X_soft_train, X_soft_test, y_soft_train, y_soft_test = train_test_split(X_soft_tfidf, y_soft, test_size=0.2, random_state=42, stratify=y_soft)

model_softmax = LogisticRegression(multi_class='multinomial', solver='newton-cg', max_iter=1000, tol=1e-4)
# report that max_iter changed from 100 to max_iter=1000
model_softmax.fit(X_soft_train, y_soft_train)

y_pred_softmax = model_softmax.predict(X_soft_test)

# Accuracy for Softmax
print("Softmax (Multinomial) Accuracy:", accuracy_score(y_soft_test, y_pred_softmax))
print(classification_report(y_soft_test, y_pred_softmax))

# Cost for Softmax
y_prob_softmax = model_softmax.predict_proba(X_soft_test)
cost_softmax = log_loss(y_soft_test, y_prob_softmax)
print(f'Softmax (Multiclass) Log-Loss (Cost): {cost_softmax}')

Softmax (Multinomial) Accuracy: 0.7498339185726488
                      precision    recall  f1-score   support

             Anxiety       0.84      0.71      0.77       768
             Bipolar       0.88      0.62      0.73       556
          Depression       0.67      0.74      0.71      3081
              Normal       0.84      0.95      0.89      3269
Personality disorder       0.91      0.38      0.54       215
              Stress       0.63      0.39      0.48       517
            Suicidal       0.67      0.62      0.65      2131

            accuracy                           0.75     10537
           macro avg       0.78      0.63      0.68     10537
        weighted avg       0.75      0.75      0.74     10537

Softmax (Multiclass) Log-Loss (Cost): 0.6797596909604097
