# Imports and data exploration

In [1]:
import pandas as pd
import numpy as np
import spacy
import os

In [2]:
# read the data into a pandas dataframe
def data2df (path, label):
    file, text = [], []
    for f in os.listdir(path):
        file.append(f)
        fhr = open(path+f, 'r', encoding='utf-8', errors='ignore') 
        t = fhr.read()
        text.append(t)
        fhr.close()
    return(pd.DataFrame({'file': file, 'text': text, 'class':label}))

dfpro = data2df('HealthProNonPro/HealthProNonPro/Pro/', 0) # Pro
dfnonpro = data2df('HealthProNonPro/HealthProNonPro/NonPro/', 1) # NonPro

Pro - Professional answers by verified doctors

NonPro - Answers by other members of the community

In [5]:
df = pd.concat([dfpro, dfnonpro], axis=0)
df.sample(frac=0.005)

Unnamed: 0,file,text,class
825,ans1741.txt,White patches on the skin could be caused by a...,0
1430,ans6.txt,A cat scratch could lead to local infection by...,0
1496,a69631.txt,Its called procrastination. You have to break...,1
1741,ans88.txt,Although eating raw rice may not necessarily c...,0
1364,a69497.txt,is your name mary if not then no hehehehe derr...,1
947,ans1851.txt,Your symptoms of abdominal cramps could be cau...,0
1747,ans885.txt,Most cases of breast tenderness is usually rel...,0
1598,a7386.txt,I do,1
899,a54305.txt,"Plenty of fluids and rest. OJ, Water, Soups,...",1
1296,a69428.txt,I know people who have athritis and it is supp...,1


In [8]:
df['class'].value_counts()

0    1874
1    1787
Name: class, dtype: int64

What we have here is a pretty balanced dataset so hopefully we should be getting good results without complicating our model too much

Let's divide our data into 2 parts - Train and Validation sets

# Splitting Data

In [10]:
X = df['text']
y = df['class']

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)
X_train.head()

439     The most common cause of itchy palms is contac...
720     1. It takes two to make the money to support t...
307     there isnt always signs\nso do not do it until...
87      Speed is an amphetamine, a psychostimulant, wh...
1066                                         tantric sex 
Name: text, dtype: object

# Custom Tokenizer Function

A custom tokenizer function - This is where we set in the rules for which we consider words into the corpus.

**Note** : Documentation  is included for the other arguments you can give for this function. This combination seems to work best and it makes intuitive sense

In [11]:
def custom_tokenizer(doc):

    # clean up text
    tokens = [token.lemma_.lower() # lemmatize and lower-case 
                        for token in doc 
                               if (
                                    len(token) >= 2 and # only preserve tokens that are 2 or more characters long
                                    #token.pos_ in ['PROPN', 'NOUN', 'ADJ', 'VERB', 'ADV'] and # only preserve specific pos
                                    #token.text in nlp.vocab and # check if token in vocab
                                    token.is_alpha and # only preserve tokens that are fully alpha (not numeric or alpha-numeric)
                                    #not token.is_digit and # get rid of tokens that are fully numeric
                                    not token.is_punct and # get rid of tokens that are punctuations
                                    not token.is_space and # get rid of tokens that are spaces
                                    not token.is_stop # get rid of tokens that are stop words
                                )
                   ]

    # return cleaned-up text
    return ' '.join(tokens)

# Preprocessing using SpacY

Using SpacY's medium size dictionary and with the help of our custom tokenizer, we extract the words after stemming

In [13]:
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])
nlpcorpus = nlp.pipe(X_train)
clean_corpus = [custom_tokenizer(doc) for doc in nlpcorpus]

In [17]:
import random

random.sample(clean_corpus,5)

['depend variable time month correlation girl period cycle condition male female etc healthy adult depend duration frequency time thing odd low',
 'answer list thread complete answer provide source section link website deal bipolar disorder diease formally know manic depressive disorder bipolar people exhibit sign manic depressiviness disorder eventually seperate distinct bipolar simply form depressive state person dramatic mood swing long short period time people suffer bipolar disorder fine week month sudden turn major depressive state self destructive suicidal nature week month road fine bipolar disorder chemeical imbalance brain seritonin chemical inihibit natural ability control impulse deprssive states condition respond medication therapy not worried fine job supportive friend matter website information bipolar disorder',
 'think depressed need nature way tell need not kinda odd not think depress choice say have world will not bad mistake away take throw people include little thi

Above we can see a random sample of 5 answers after our processing on it. It seems to be good but let's see how it performs on our model

# Building the Model

In [18]:
X_train = pd.Series(clean_corpus,index=X_train.index)
X_train

439     common cause itchy palm contact dermatitis exp...
720     take money support famlys need like food shelt...
307                      not sign married doctor approval
87      speed amphetamine psychostimulant commonly abu...
1066                                          tantric sex
                              ...                        
889                  week body makeover need info mail ok
905     father factor produce symptom generally patien...
1096    fiberoptic endoscopy upper gi tract investigat...
235     atopic dermatitis eczema chronic inflammatory ...
1061    endoscopic sinus surgery procedure choice remo...
Length: 2928, dtype: object

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

We use a pipeline here. Steps
1. First step is to create a TF_IDF matrix 
2. Second, we feed this data to a Multinomial Naive Bayes Classifier

In [20]:
clf = Pipeline(steps=[('tf',TfidfVectorizer()),\
                      ('nbc',MultinomialNB())])

Grid Search to perform hyper-parameter optimization

In [21]:
from sklearn.model_selection import GridSearchCV

param_grid = {'tf__sublinear_tf' : [True,False], 'tf__smooth_idf' : [True,False], 'tf__norm' : ['l1','l2'],\
              'nbc__alpha' : [1.0,1.2,1.4,1.6]}
gscv = GridSearchCV(clf,param_grid,cv=4,return_train_score=False)

In [22]:
gscv.fit(X_train,y_train)

gscv.best_estimator_

Pipeline(memory=None,
         steps=[('tf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('nbc',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

Our best training accuracy

In [23]:
gscv.best_score_

0.9262295081967213

Best set of training parameters

In [25]:
gscv.best_params_

{'nbc__alpha': 1.0,
 'tf__norm': 'l2',
 'tf__smooth_idf': True,
 'tf__sublinear_tf': False}

Now let's use the best estimator model of the test data

In [26]:
preds = gscv.best_estimator_.predict(X_test)

In [27]:
from sklearn import metrics
metrics.accuracy_score(y_test,preds)

0.9167803547066848

We get an accuracy of 92% , which is really good. But then again our dataset was well balanced too

In [28]:
metrics.confusion_matrix(y_test,preds)

array([[341,  34],
       [ 27, 331]], dtype=int64)

In [29]:
print(metrics.classification_report(y_test,preds))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92       375
           1       0.91      0.92      0.92       358

    accuracy                           0.92       733
   macro avg       0.92      0.92      0.92       733
weighted avg       0.92      0.92      0.92       733



Our classification report and confusion matrix also indicate that we have good precision, recall and f-1 scores as well.