In [1]:
print('hello, world!')

hello, world!


#### Problem Statement: With resources for mental health being so limited in the US, and culture suppressing those who need help from seeking it, it would be immensely valuable to provide a model which could predict whether an individual was suffering from various mental health issues using sentiment analysis as a backbone for classification. We'll use Recall/Sensitivity as our main metric since False Positives would suggest people get help when they may not necessarily need (but everyone needs it) vs a false negative that could minimize serious health issues. We'll set our target at least 95% recall before production-ready.

In [73]:
import pandas as pd
import numpy as np
import random

import re

import spacy

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix


### Setting the random seed

In [40]:
random.seed(42)

In [4]:
df = pd.read_csv('../data/sentiment.csv').drop(columns=['Unnamed: 0'])

In [5]:
df.head(50)

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety
5,"every break, you must be nervous, like somethi...",Anxiety
6,"I feel scared, anxious, what can I do? And may...",Anxiety
7,Have you ever felt nervous but didn't know why?,Anxiety
8,"I haven't slept well for 2 days, it's like I'm...",Anxiety
9,"I'm really worried, I want to cry.",Anxiety


In [41]:
df2 = pd.read_csv('../data/sentiment.csv').drop(columns=['Unnamed: 0'])

In [42]:
df2.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [6]:
df.shape


(53043, 2)

In [7]:
df['status'].value_counts()

status
Normal                  16351
Depression              15404
Suicidal                10653
Anxiety                  3888
Bipolar                  2877
Stress                   2669
Personality disorder     1201
Name: count, dtype: int64

#### I am following along with 'Sentiment Analysis using NLP (spacey/ ntlk)' on Spencer Pao github.
#### https://github.com/SpencerPao/Natural-Language-Processing/blob/main/Sentiment_Analysis/Recommendation.ipynb

### Tokenization
#### Removing special characters with regex.

In [14]:
pattern = r'[^A-Za-z ]'
regex = re.compile(pattern)
df['tokens'] = df['statement'].head(50).apply(lambda x: regex.sub('', x))

#### Using spacy en_core_web_sm to create tokens.

In [17]:
nlp = spacy.load("en_core_web_sm")
df['tokens'] = df['statement'].apply(lambda x: [token.text for token in nlp(str(x))])

In [18]:
df['tokens'].head()

0                                       [oh, my, gosh]
1    [trouble, sleeping, ,, confused, mind, ,, rest...
2    [All, wrong, ,, back, off, dear, ,, forward, d...
3    [I, 've, shifted, my, focus, to, something, el...
4    [I, 'm, restless, and, restless, ,, it, 's, be...
Name: tokens, dtype: object

In [19]:
df.head()

Unnamed: 0,statement,status,Tokens,tokens
0,oh my gosh,Anxiety,"[oh, my, gosh]","[oh, my, gosh]"
1,"trouble sleeping, confused mind, restless hear...",Anxiety,"[trouble, sleeping, ,, confused, mind, ,, rest...","[trouble, sleeping, ,, confused, mind, ,, rest..."
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,"[All, wrong, ,, back, off, dear, ,, forward, d...","[All, wrong, ,, back, off, dear, ,, forward, d..."
3,I've shifted my focus to something else but I'...,Anxiety,"[I, 've, shifted, my, focus, to, something, el...","[I, 've, shifted, my, focus, to, something, el..."
4,"I'm restless and restless, it's been a month n...",Anxiety,"[I, 'm, restless, and, restless, ,, it, 's, be...","[I, 'm, restless, and, restless, ,, it, 's, be..."


In [20]:
df=df.drop(columns = 'Tokens')

In [21]:
df.head()

Unnamed: 0,statement,status,tokens
0,oh my gosh,Anxiety,"[oh, my, gosh]"
1,"trouble sleeping, confused mind, restless hear...",Anxiety,"[trouble, sleeping, ,, confused, mind, ,, rest..."
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,"[All, wrong, ,, back, off, dear, ,, forward, d..."
3,I've shifted my focus to something else but I'...,Anxiety,"[I, 've, shifted, my, focus, to, something, el..."
4,"I'm restless and restless, it's been a month n...",Anxiety,"[I, 'm, restless, and, restless, ,, it, 's, be..."


In [29]:
df['tokens'].shape

(53043,)

### Lemmatization (normalizing words)
#### Converting text to the 'base' format.

#### Will first use spacy function .is_stop to check if the tokens are a stop word in the spacy list. There are 326 stop words in the default spacy list. See the words below. Then if the token is not a stop word, will use the spacy lemma_ function to limmatize the token and save it.

In [27]:
stopwords = nlp.Defaults.stop_words

print(len(stopwords))
print(stopwords)

326
{'unless', 'ours', 'two', 'most', 'full', 'all', 'whole', 'whether', 'across', 'one', 'they', 'again', 'was', 'us', 'move', 'cannot', 'twelve', 'more', 'anyhow', 'someone', 'too', 'nevertheless', 'these', 'hers', 'themselves', 'up', 'had', 'only', 'beside', 'below', 'whose', 'when', 'although', 'with', 'its', 'for', 'third', 'together', 'indeed', 'becoming', 'than', 'over', 'until', '‘ll', 'were', 'ca', 'yourself', 'just', 'am', 'other', 'nowhere', 'ten', 'per', 'nor', 'your', 'whoever', "'re", 'afterwards', 'will', 'myself', 'should', 're', 'everything', 'such', 'if', 'herein', '’re', 'down', 'where', 'above', 'say', 'she', 'sometimes', 'own', 'next', 'rather', 'must', 'keep', 'seemed', 'their', 'yourselves', 'put', 'does', 'enough', 'further', 'anything', 'towards', 'each', 'every', 'empty', 'everywhere', 'which', 'four', '‘d', 'made', 'could', 'on', 'beyond', 'else', '’d', 'nobody', 'formerly', 'get', 'thus', 'because', 'what', 'the', 'at', 'quite', 'onto', 'therein', 'several',

In [30]:
df['lemmas'] = df['tokens'].apply(lambda x: [token.lemma_ for token in nlp(str(x)) if not token.is_stop])

In [31]:
df.head()

Unnamed: 0,statement,status,tokens,lemmas
0,oh my gosh,Anxiety,"[oh, my, gosh]","[[, ', oh, ', ,, ', ', ,, ', gosh, ', ]]"
1,"trouble sleeping, confused mind, restless hear...",Anxiety,"[trouble, sleeping, ,, confused, mind, ,, rest...","[[, ', trouble, ', ,, ', sleep, ', ,, ', ,, ',..."
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,"[All, wrong, ,, back, off, dear, ,, forward, d...","[[, ', ', ,, ', wrong, ', ,, ', ,, ', ,, ', ',..."
3,I've shifted my focus to something else but I'...,Anxiety,"[I, 've, shifted, my, focus, to, something, el...","[[, ', ', ,, "", ', ve, "", ,, ', shift, ', ,, '..."
4,"I'm restless and restless, it's been a month n...",Anxiety,"[I, 'm, restless, and, restless, ,, it, 's, be...","[[, ', ', ,, "", ', m, "", ,, ', restless, ', ,,..."


In [32]:
df.shape

(53043, 4)

#### The above work looks like I may be going down the wrong path. After discussion with Hank, we are going to try using CountVectorizer. See the notes below from our discussion:

#### A few notes from what we discussed:
#### * CountVectorizer and TfidfVectorizer in SKlearn can be used in a Pipeline and GridSearched over.
#### * You can do this all locally
#### * Multinomial Naive Bayes also does well with the CVEC and Tfidf Vectorizers
#### How to get data to Colab

#### To mount your drive
from google.colab import drive

drive.mount('/content/drive')

#### or upload

from google.colab import files

uploaded = files.upload()

import pandas as pd
import io

df = pd.read_csv(io.BytesIO(uploaded['whatever_you_uploaded.csv']))

df.head()


#### Sidenote, depending on how large the data file is, don't push it to GitHub so you can avoid dealing with a large file issue.

#### https://www.nltk.org/_modules/nltk/sentiment/vader.html

---------

### Reloaded data as df2 to start with  CountVectorizer

In [38]:
df2.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [39]:
df2.shape

(53043, 2)

In [48]:
df2.dtypes

statement    object
status       object
dtype: object

In [47]:
df2.isnull().sum()

statement    362
status         0
dtype: int64

#### Only 362 rows out of 53,043 so will drop the rows with null values

In [49]:
df2_cleaned = df2.dropna()

In [50]:
df2_cleaned.shape

(52681, 2)

In [51]:
df2_cleaned.isnull().sum()

statement    0
status       0
dtype: int64

In [52]:
vectorizer = CountVectorizer(stop_words='english')
df2_cleaned_statement_vectorized = vectorizer.fit_transform(df2_cleaned['statement'])

#### df2_cleaned_statement_vectorized is a sparse matrix

In [53]:
df2_cleaned_statement_vectorized.shape

(52681, 58930)

In [54]:
vectorizer.vocabulary_

{'oh': 37004,
 'gosh': 22810,
 'trouble': 53560,
 'sleeping': 47541,
 'confused': 11893,
 'mind': 33619,
 'restless': 43850,
 'heart': 24243,
 'tune': 53760,
 'wrong': 57886,
 'dear': 13888,
 'forward': 21049,
 'doubt': 16327,
 'stay': 49212,
 'place': 39770,
 've': 55559,
 'shifted': 46637,
 'focus': 20758,
 'worried': 57697,
 'month': 34310,
 'boy': 8183,
 'mean': 32772,
 'break': 8332,
 'nervous': 35692,
 'like': 30827,
 'heck': 24311,
 'feel': 19912,
 'scared': 45425,
 'anxious': 4492,
 'family': 19571,
 'protected': 41279,
 'felt': 19981,
 'didn': 15104,
 'know': 29646,
 'haven': 24065,
 'slept': 47562,
 'days': 13764,
 'huh': 25455,
 'really': 42593,
 'want': 56448,
 'night': 35967,
 'don': 16181,
 'strange': 49586,
 'feeling': 19917,
 'good': 22723,
 'lately': 30186,
 'time': 52427,
 'sleep': 47533,
 'needed': 35548,
 'problem': 40980,
 'laugh': 30220,
 'forget': 20956,
 'remember': 43389,
 'turns': 53798,
 'sad': 44956,
 'burden': 8845,
 'blah': 7537,
 'worry': 57707,
 'thought

In [61]:
# df2_cleaned_statement_vectorized.toarray()

#### Need to store the vectorized data to use for modeling

In [57]:
# df2_cleaned['statement_vectorized'] = [row for row in df2_cleaned_statement_vectorized.toarray()]

#### I cannot get the sparse matrix to save to an array or convert to a DataFrame because it is too big. I am going to try to tie everything in a pipeline.

In [62]:
df2_cleaned.shape

(52681, 2)

In [65]:
df2_cleaned['status'].value_counts()

status
Normal                  16343
Depression              15404
Suicidal                10652
Anxiety                  3841
Bipolar                  2777
Stress                   2587
Personality disorder     1077
Name: count, dtype: int64

 #### Setting 'Normal' to have a value of 1 and all other status as a value of 0 to use as the positive/ negative target variable/

In [112]:
y = df2_cleaned['status'].map({'Normal':1}).fillna(0).astype(int)

In [68]:
df2_cleaned['status'].value_counts()

status
0    36338
1    16343
Name: count, dtype: int64

In [69]:
df2_cleaned['status'].dtypes

dtype('int64')

In [98]:
df2_cleaned.shape

(52681, 2)

#### Setting X and y variables

In [111]:
X = df2_cleaned.drop(columns='status').astype(str)



In [118]:
y.shape

(52681,)

In [119]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: status, dtype: int64

In [100]:
X.shape

(52681, 1)

#### Splitting into train and test

In [113]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

#### Baseline

In [76]:
y_test.value_counts(normalize = True)

status
0    0.689773
1    0.310227
Name: proportion, dtype: float64

#### Building pipeline with vectorizer and logistic regression

In [96]:
X_train.shape

(39510, 1)

In [97]:
y_train.shape

(39510,)

In [114]:
pipeline_logr = Pipeline([
    ('vector', CountVectorizer(stop_words='english')),
    ('logr', LogisticRegression(random_state=42))
])

In [106]:
#pipeline_logr.fit(X_train, y_train)

In [115]:
pipeline_logr_params = {'logr__penalty': ['l1', 'l2'],
                        'logr__C': [1.0, 0.5, 0.1]}

In [116]:
gs_logr = GridSearchCV(pipeline_logr, pipeline_logr_params, cv=5, verbose=1, n_jobs=-1)

In [117]:
gs_logr.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


ValueError: 
All the 30 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Beau_\virtual\dsb_602\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Beau_\virtual\dsb_602\Lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Beau_\virtual\dsb_602\Lib\site-packages\sklearn\pipeline.py", line 661, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\Beau_\virtual\dsb_602\Lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Beau_\virtual\dsb_602\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1210, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Beau_\virtual\dsb_602\Lib\site-packages\sklearn\linear_model\_logistic.py", line 63, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or None penalties, got l1 penalty.

--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Beau_\virtual\dsb_602\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Beau_\virtual\dsb_602\Lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Beau_\virtual\dsb_602\Lib\site-packages\sklearn\pipeline.py", line 661, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\Beau_\virtual\dsb_602\Lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Beau_\virtual\dsb_602\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1239, in fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "C:\Users\Beau_\virtual\dsb_602\Lib\site-packages\sklearn\utils\validation.py", line 2971, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Beau_\virtual\dsb_602\Lib\site-packages\sklearn\utils\validation.py", line 1387, in check_X_y
    check_consistent_length(X, y)
  File "C:\Users\Beau_\virtual\dsb_602\Lib\site-packages\sklearn\utils\validation.py", line 473, in check_consistent_length
    raise ValueError(
ValueError: Found input variables with inconsistent numbers of samples: [1, 31608]
