<a href="https://colab.research.google.com/github/Only-Mike/ADHD/blob/main/ADHD_Reddit_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
!git clone 'https://github.com/Only-Mike/ADHD.git'

fatal: destination path 'ADHD' already exists and is not an empty directory.


In [3]:
# Location of the raw Reddit export.
# NOTE(review): the clone above creates an `ADHD/` directory, while this path
# points directly at /content — confirm where the CSV actually lives.
csv_path = '/content/ADHD.csv'

# low_memory=False makes pandas read the file in one pass instead of in chunks.
df = pd.read_csv(csv_path, low_memory=False)


In [4]:
#Installing preprocessor to celan our text
!pip install tweet-preprocessor -q

# Installing Gensim and PyLDAvis
!pip install -qq -U gensim
!pip install -qq pyLDAvis

# explainability (why did the model say it's related to this author)
!pip install eli5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import tqdm #progress bar
import preprocessor as prepro # text prepro

import spacy #spacy for quick language prepro
# Load spaCy's small English pipeline once; the preprocessing cells below reuse `nlp`.
nlp = spacy.load('en_core_web_sm')

# sampling, splitting
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split


# loading ML libraries
from sklearn.pipeline import make_pipeline #pipeline creation
from sklearn.feature_extraction.text import TfidfVectorizer #transforms text to sparse matrix
from sklearn.linear_model import LogisticRegression #Logit model
from sklearn.metrics import classification_report #that's self explanatory
from sklearn.decomposition import TruncatedSVD #dimensionality reduction
from xgboost import XGBClassifier

import altair as alt #viz

#explainability
import eli5
from eli5.lime import TextExplainer

# topic modeling

from gensim.corpora.dictionary import Dictionary # Import the dictionary builder
from gensim.models import LdaMulticore # we'll use the faster multicore version of LDA

# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Show matplotlib figures directly below the cell that creates them
%matplotlib inline
# Switch pyLDAvis to notebook mode so its visualizations display in cell output
pyLDAvis.enable_notebook()

  from collections import Iterable
  from collections import Mapping


In [6]:
# Choose which element types tweet-preprocessor handles when `prepro.clean`
# is called: URLs, numbers, reserved words, @mentions and smileys.
prepro.set_options(
  prepro.OPT.URL,
  prepro.OPT.NUMBER,
  prepro.OPT.RESERVED,
  prepro.OPT.MENTION,
  prepro.OPT.SMILEY,
)

In [7]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,title,selftext,score,id,url,num_comments,created_utc,created_datetime
0,Android app to strengthen attention/focus,"Hey /r/ADHD,\n\nCheck out my simple Android ap...",6,k348a,https://www.reddit.com/r/ADHD/comments/k348a/a...,7.0,1315014480.0,2011-09-03 01:48:00
1,Does anyone here have experience with Imipramine?,My doctor has suggested it for ADD with anxiet...,4,k3gdz,https://www.reddit.com/r/ADHD/comments/k3gdz/d...,1.0,1315051084.0,2011-09-03 11:58:04
2,"New study shows that for people with ADHD, \n""...",,22,k4q79,http://www.utexas.edu/features/2011/08/29/adhd...,11.0,1315174374.0,2011-09-04 22:12:54
3,What does the ADHD test look like?,I'm 21 and didn't know about ADHD until recent...,6,k5fvd,https://www.reddit.com/r/ADHD/comments/k5fvd/w...,10.0,1315239683.0,2011-09-05 16:21:23
4,Are you guys good with maps and directions?,It seems like one thing I've always been reall...,0,k6efi,https://www.reddit.com/r/ADHD/comments/k6efi/a...,0.0,1315317191.0,2011-09-06 13:53:11


In [8]:
# Take a random sample of (at most) 5000 posts so the models below run faster.
# - min(...) keeps the cell from raising when fewer than 5000 rows are available.
# - random_state pins the draw, so Restart & Run All works on the same rows.
df = df.sample(n=min(5000, len(df)), random_state=42)

In [9]:
# Copy the post titles into a new "text" column; this is the column that gets cleaned
df = df.assign(text=df['title'])

In [10]:
# write everything into a single function for simplicity later on
def text_prepro(texts):
  """
  Clean and normalize one column of raw text.

  Every entry is first passed through `prepro.clean` (tweet-preprocessor, using
  the options configured with `prepro.set_options`) and then run through the
  notebook-level spaCy pipeline `nlp`. Only alphabetic tokens that are neither
  stop words nor punctuation are kept, each as its lowercased lemma.

  Parameters
  ----------
  texts : pandas.Series
      One text column of a DataFrame. Entries that are not strings (for example
      NaN for a missing value) are treated as empty text instead of raising.

  Returns
  -------
  list of str
      One space-joined string of lemmas per input entry, in input order.
  """
  # Non-string values (e.g. float NaN) cannot be cleaned as text, so map them
  # to an empty string and keep one output row per input row.
  texts_clean = texts.map(lambda t: prepro.clean(t) if isinstance(t, str) else "")

  clean_container = []

  # Skip the heavy pipeline components to speed up processing.
  # NOTE(review): "tagger" is disabled although token.lemma_ is read below;
  # depending on the spaCy version the lemmatizer may rely on tagger output —
  # confirm the lemmas look as expected.
  docs = nlp.pipe(texts_clean, disable=["tagger", "parser", "ner"])

  # The context manager closes the progress bar even if processing fails midway.
  with tqdm.tqdm(total=len(texts_clean), position=0, leave=True) as pbar:
    for doc in docs:
      lemmas = [token.lemma_.lower() for token in doc
                if token.is_alpha
                and not token.is_stop
                and not token.is_punct]

      clean_container.append(" ".join(lemmas))
      pbar.update(1)

  return clean_container

In [11]:
# Store the cleaned text (one lemma string per post) next to the raw text
df = df.assign(text_clean=text_prepro(df['text']))

100%|██████████| 5000/5000 [00:05<00:00, 854.41it/s]


In [12]:
# Second spaCy pass over the cleaned text (only NER switched off): for each post
# keep the lowercased lemmas of nouns, proper nouns, adjectives and adverbs that
# are neither stop words nor punctuation.
tokens = [
  [
    tok.lemma_.lower()
    for tok in doc
    if tok.pos_ in ('NOUN', 'PROPN', 'ADJ', 'ADV')
    and not (tok.is_stop or tok.is_punct)
  ]
  for doc in nlp.pipe(df['text_clean'], disable=["ner"])
]

In [13]:
# Attach the per-post token lists to the frame (one list per row, in row order)
df = df.assign(tokens=tokens)

In [22]:
# Build the gensim vocabulary from the tokenized posts
dictionary = Dictionary(df['tokens'])

# Prune the vocabulary: drop tokens found in fewer than 2 documents or in more
# than 30% of all documents, then keep at most 800 of the remaining tokens
dictionary.filter_extremes(no_below=2, no_above=0.3, keep_n=800)

# Turn every post into its bag-of-words representation under this dictionary
corpus = list(map(dictionary.doc2bow, df['tokens']))

## Visualization

In [23]:
# Fit a 10-topic LDA model on the bag-of-words corpus (10 passes, 4 worker processes).
# random_state seeds the model so the topics do not change arbitrarily between runs.
# NOTE(review): with several workers the result may still vary slightly from
# run to run — confirm if exact reproducibility matters.
lda_model = LdaMulticore(
  corpus,
  id2word=dictionary,
  num_topics=10,
  workers=4,
  passes=10,
  random_state=42,
)

In [24]:
# Prepare the pyLDAvis data for the fitted model; `gensimvis` is the alias
# under which pyLDAvis.gensim_models was imported.
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)

In [25]:
# Render the prepared pyLDAvis visualization as this cell's output
pyLDAvis.display(lda_display)