In [1]:
import pandas as pd

### Step 1 : Reading data into a DataFrame

In [2]:
import os
def data2df (path, label):
    file, text = [], []
    for f in os.listdir(path):
        file.append(f)
        fhr = open(path+f, 'r', encoding='utf-8', errors='ignore') 
        t = fhr.read()
        text.append(t)
        fhr.close()
    return(pd.DataFrame({'file': file, 'text': text, 'class':label}))

dfnonpro = data2df('HealthProNonPro/NonPro/', 0) # Pro
dfpro = data2df('HealthProNonPro/Pro/', 1) # Non-Pro

df = pd.concat([dfnonpro, dfpro], axis=0)
df.sample(frac=0.005)

Unnamed: 0,file,text,class
1749,ans887.txt,Most cases of fracture of clavicle are managed...,1
396,ans1355.txt,The heavy bleeding that you are experiencing c...,1
867,a54273.txt,Eat wheat bread instead of white. Eat carrots ...,0
1041,a61341.txt,i find surfing for naughty pictures and videos...,0
712,a31636.txt,I ALWAYS GET RID OF MINE BY HOLDING MY BREATH ...,0
558,a24790.txt,"Cuz they are hungry. Only females bite people,...",0
1281,a69413.txt,Good luck m8 and get well soon but watch out f...,0
1844,ans972.txt,Palpitations are the sensation that your heart...,1
490,a24722.txt,"Good question! As for myself, Cant sleep due t...",0
54,a24286.txt,"Just lie about it-- but wait, youre doing that...",0


### Step 2: Setting up the data for Training/Testing. Using20% for testing.

In [3]:
X, y = df['text'], df['class']

from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

Xtrain = Xtrain.copy()
Xtest = Xtest.copy()
ytrain = ytrain.copy()
ytest = ytest.copy()

In [4]:
Xtrain.head()

439     well how old is he? just to be safe i would ta...
807     What you may be experiencing is post-concussio...
394     The healing time after the trauma or muscle co...
87                 I HAVE AN ELLIPITICAL AND I DO 30 MIN 
1153    A urinary test for pregnancy should be positiv...
Name: text, dtype: object

### Step 3: Using Spacy to preprocess the data.

In [5]:
import spacy

# once you’ve downloaded and installed a model, you can load it via spacy.load(). 
# spacy.load() returns a Language object containing all components and data needed to process text. \
# the Language object is typically called nlp. 
nlp = spacy.load("en_core_web_md") 

In [6]:

corpus = Xtrain

In [7]:

def custom_tokenizer(doc):

    # clean up text
    tokens = [token.lemma_.lower() # lemmatize and lower-case 
                        for token in doc 
                               if (
                                    len(token) >= 2 and # only preserve tokens that are 2 or more characters long
                                    #token.pos_ in ['PROPN', 'NOUN', 'ADJ', 'VERB', 'ADV'] and # only preserve specific pos
                                    #token.text in nlp.vocab and # check if token in vocab
                                    #token.is_alpha and # only preserve tokens that are fully alpha (not numeric or alpha-numeric)
                                    #not token.is_digit and # get rid of tokens that are fully numeric
                                    not token.is_punct and # get rid of tokens that are punctuations
                                    not token.is_space and # get rid of tokens that are spaces
                                    not token.is_stop # get rid of tokens that are stop words
                                )
                   ]

    # return cleaned-up text
    return ' '.join(tokens)

import spacy
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])
nlpcorpus = nlp.pipe(corpus)
clean_corpus = [custom_tokenizer(text) for text in nlpcorpus]
Xtrain = pd.Series(clean_corpus)
Xtrain.head()

0    old safe er soon possible 106 temp incredibaly...
1    experience post concussion syndrome complex di...
2    healing time trauma muscle contusion vary pati...
3                                   ellipitical 30 min
4    urinary test pregnancy positive pregnancy week...
dtype: object

### Step 4: Setting up a Pipeline with TfidfVectorizer and Naïve Bayes. 

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


clf = Pipeline(steps=[('tfidf', TfidfVectorizer()),
                      ('nb', MultinomialNB())])

clf = clf.fit(Xtrain,ytrain)


### Step 5: Grid Search with 4-fold Cross Validation to search for the best values for the following two hyper-parameters (and any additional hyper parameters you may want to tune):1)	sublinear_tf in TfidfVectorizer	2)alpha in Naïve Bayes 


In [9]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'tfidf__sublinear_tf': (True, False), 
    'nb__alpha':[0.1,0.25,0.5,1] 
}
gscv = GridSearchCV(clf, param_grid, cv=4)


In [10]:
gscv.fit(Xtrain,ytrain)


GridSearchCV(cv=4, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [11]:
print ("-"*100)
print(gscv.best_estimator_, "\n")
print ("-"*100)
print(gscv.best_score_, "\n")
print ("-"*100)
print(gscv.best_params_, "\n")
print ("-"*100)
print(gscv.cv_results_, "\n")
print ("-"*100)

----------------------------------------------------------------------------------------------------
Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=0.1, class

### 6)	Using the Best Estimator resulting from the Grid Search for Prediction/Evaluation. Printing the following evaluation metrics:Accuracy score, Confusion matrix, Classification report


In [12]:
###doing preprocessing on test data
corpus = Xtest
import spacy
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])
nlpcorpus = nlp.pipe(corpus)
clean_corpus = [custom_tokenizer(text) for text in nlpcorpus]
Xtest = pd.Series(clean_corpus)
Xtest.head()

0    presence risk factor sexual history uncertaint...
1    50/50 dude not especially be not ready kid kno...
2    store like gnc shake power bar people try gain...
3    description correspond neuralgia sharp pain fo...
4    exhaustion reduced drive present menopause dep...
dtype: object

In [13]:
##using the best estimators from grid search

ypred = gscv.best_estimator_.predict(Xtest)

from sklearn import metrics
print (metrics.accuracy_score(ytest, ypred))
print (metrics.confusion_matrix(ytest, ypred))
print (metrics.classification_report(ytest, ypred))

0.9440654843110505
[[326  37]
 [  4 366]]
              precision    recall  f1-score   support

           0       0.99      0.90      0.94       363
           1       0.91      0.99      0.95       370

    accuracy                           0.94       733
   macro avg       0.95      0.94      0.94       733
weighted avg       0.95      0.94      0.94       733



### Step 7:	Extracting the true negatives (TN), false positives (FP), false negatives (FN), and true positives (TP) using the following command. TN, FP, FN, TP = metrics.confusion_matrix(y_true=ytest, y_pred=ypred).ravel()Then, using TN/FP/FN/TP  calculating the overall accuracy, the precision (for class 0 and class 1), the recall (for class 0 and class 1), and the f1-score (for class 0 and class 1). These should match what you are seeing in the accuracy_score and classification_report you printed above. 


In [14]:

TN, FP, FN, TP = metrics.confusion_matrix(y_true=ytest, y_pred=ypred).ravel()

In [15]:
Accuracy_score= (TN+TP)/(TN+TP+FP+FN)
Accuracy_score

0.9440654843110505

In [16]:
Precision_class_0= TN/(TN+FN)
Precision_class_0

0.9878787878787879

In [17]:
Precision_class_1= TP/(TP+FP)
Precision_class_1

0.9081885856079405

In [18]:
Recall_class_0= TN/(TN+FP)
Recall_class_0

0.8980716253443526

In [19]:
Recall_class_1= TP/(FN+TP)
Recall_class_1

0.9891891891891892

In [20]:
F1_score_class_0=(2*Precision_class_0*Recall_class_0)/(Precision_class_0+Recall_class_0)
F1_score_class_0

0.9408369408369409

In [21]:
F1_score_class_1=(2*Precision_class_1*Recall_class_1)/(Precision_class_1+Recall_class_1)
F1_score_class_1

0.9469598965071152