<a href="https://colab.research.google.com/github/Only-Mike/ADHD/blob/main/ADHD_Reddit_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Clone the project's GitHub repository into the current working directory (creates ./ADHD).
!git clone 'https://github.com/Only-Mike/ADHD.git'

Cloning into 'ADHD'...
remote: Enumerating objects: 831, done.[K
remote: Counting objects: 100% (348/348), done.[K
remote: Compressing objects: 100% (159/159), done.[K
remote: Total 831 (delta 212), reused 302 (delta 189), pack-reused 483[K
Receiving objects: 100% (831/831), 16.99 MiB | 14.29 MiB/s, done.
Resolving deltas: 100% (458/458), done.


In [4]:
# Load the dataset into a DataFrame.
# NOTE(review): the clone above creates an ADHD/ directory, but this reads /content/ADHD.csv —
# confirm the file is actually placed there (e.g. uploaded or copied) on a fresh runtime.
df = pd.read_csv('/content/ADHD.csv', low_memory=False)


In [5]:
#Installing preprocessor to celan our text
!pip install tweet-preprocessor -q

# Installing Gensim and PyLDAvis
!pip install -qq -U gensim
!pip install -qq pyLDAvis

# explainability (why did the model say it's related to this author)
!pip install eli5

[K     |████████████████████████████████| 24.1 MB 65.9 MB/s 
[K     |████████████████████████████████| 1.7 MB 4.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
  Building wheel for pyLDAvis (PEP 517) ... [?25l[?25hdone
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting eli5
  Downloading eli5-0.13.0.tar.gz (216 kB)
[K     |████████████████████████████████| 216 kB 4.1 MB/s 
Collecting jinja2>=3.0.0
  Downloading Jinja2-3.1.2-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 53.5 MB/s 
Building wheels for collected packages: eli5
  Building wheel for eli5 (setup.py) ... [?25l[?25hdone
  Created wheel for eli5: filename=eli5-0.13.0-py2.py3-none-any.whl size=1

In [6]:
import tqdm # progress bar
import preprocessor as prepro # tweet-preprocessor: text cleaning helpers

import spacy # spaCy for quick language preprocessing
nlp = spacy.load('en_core_web_sm') # load the small English pipeline once; reused by the cells below

# sampling, splitting
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split


# loading ML libraries
from sklearn.pipeline import make_pipeline # pipeline creation
from sklearn.feature_extraction.text import TfidfVectorizer # transforms text to a sparse TF-IDF matrix
from sklearn.linear_model import LogisticRegression # logistic regression classifier
from sklearn.metrics import classification_report # per-class precision/recall/F1 summary
from sklearn.decomposition import TruncatedSVD # dimensionality reduction
from xgboost import XGBClassifier

import altair as alt # visualization

# explainability
import eli5
from eli5.lime import TextExplainer

# topic modeling

from gensim.corpora.dictionary import Dictionary # Import the dictionary builder
from gensim.models import LdaMulticore # we'll use the faster multicore version of LDA

# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Render matplotlib figures inline and let pyLDAvis output display inside the notebook.
%matplotlib inline
pyLDAvis.enable_notebook()

  from collections import Iterable
  from collections import Mapping


In [7]:
# Choose which kinds of tokens prepro.clean() handles (per tweet-preprocessor's
# option names): URLs, numbers, reserved words, @-mentions and smileys.
prepro.set_options(
    prepro.OPT.URL,
    prepro.OPT.NUMBER,
    prepro.OPT.RESERVED,
    prepro.OPT.MENTION,
    prepro.OPT.SMILEY,
)

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [8]:
# Work on a random sample of at most 25,000 rows so the later steps run faster.
# random_state fixes the draw so the sample (and everything derived from it) is
# reproducible; capping n at len(df) avoids a ValueError on a smaller dataset.
df = df.sample(n=min(25000, len(df)), random_state=42)

In [9]:
# Copy the "title" column into a new "text" column; the preprocessing below reads from "text".
df['text'] = df['title']

In [10]:
# write everything into a single function for simplicity later on
def text_prepro(texts):
  """
  Clean and normalize a pandas Series of raw texts.

  Each text is first passed through ``prepro.clean`` (using whatever options
  were configured with ``prepro.set_options``) and then through the spaCy
  pipeline ``nlp``. Only alphabetic tokens that are not stop words are kept;
  they are lemmatized, lowercased and joined back with single spaces.

  Parameters
  ----------
  texts : pandas.Series
      One text per row. Missing values are treated as empty strings and
      other non-string values are converted with ``str``.

  Returns
  -------
  list of str
      One cleaned string per input row, in the same order as ``texts``.
  """
  # prepro.clean works on strings, so missing values become "" and anything
  # else is converted with str() instead of crashing the whole run.
  texts_clean = texts.map(lambda t: "" if pd.isna(t) else prepro.clean(str(t)))

  clean_container = []

  # Heavy pipeline components are disabled for speed.
  # NOTE(review): on spaCy v3 the lemmatizer may rely on tags produced by the
  # disabled "tagger" component, which can degrade lemmas — confirm against the
  # installed spaCy version.
  # Using the bar as a context manager closes it even if the pipeline raises.
  with tqdm.tqdm(total=len(texts_clean), position=0, leave=True) as pbar:
    for doc in nlp.pipe(texts_clean, disable=["tagger", "parser", "ner"]):

      lemmas = [token.lemma_.lower() for token in doc
                if token.is_alpha
                and not token.is_stop
                and not token.is_punct]

      clean_container.append(" ".join(lemmas))
      pbar.update(1)

  return clean_container

In [11]:
# Run the cleaning function over every text and store the result next to the raw text.
df['text_clean'] = text_prepro(df['text'])

100%|██████████| 25000/25000 [00:32<00:00, 779.28it/s] 


In [12]:
# Second spaCy pass over the cleaned texts (NER switched off): for every document
# keep the lowercased lemmas of nouns, proper nouns, adjectives and adverbs that
# are neither stop words nor punctuation.
content_pos = ('NOUN', 'PROPN', 'ADJ', 'ADV')

tokens = [
    [
        tok.lemma_.lower()
        for tok in doc
        if tok.pos_ in content_pos and not (tok.is_stop or tok.is_punct)
    ]
    for doc in nlp.pipe(df['text_clean'], disable=["ner"])
]

In [13]:
# Attach the per-document token lists to the frame as a new "tokens" column.
df['tokens'] = tokens

In [41]:
# Build the gensim vocabulary from the token lists of all posts.
token_lists = df['tokens']
dictionary = Dictionary(token_lists)

# Prune the vocabulary: drop tokens found in fewer than 2 documents or in more
# than 20% of them, then keep at most the 1000 most frequent survivors.
# (Author's note: the 0.2 cap is what removes "ADHD" itself from the dictionary.)
dictionary.filter_extremes(no_below=2, no_above=0.2, keep_n=1000)

# Convert every document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, token_lists))

## Visualization

In [42]:
# Fit a 10-topic LDA model on the bag-of-words corpus (10 passes, 4 worker processes).
# A fixed random_state seeds the model so repeated runs produce comparable topics.
# NOTE(review): with several workers results may still vary slightly between runs — confirm.
lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=10, workers=4, passes=10, random_state=42)

In [43]:
# Prepare the pyLDAvis data for the fitted model, its corpus and its dictionary.
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)

In [44]:
# Show the prepared pyLDAvis topic visualization in the notebook output.
pyLDAvis.display(lda_display)