In [None]:
# Colab preprocessing for toronto_air_pollution_and_covid-19.csv
# Output: ready_for_viz.csv (saved back to Google Drive)

# 0) Mount Google Drive (Colab)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Colab setup
!pip -q install spacy nltk scikit-learn pandas numpy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m112.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# 1) Imports
import os
import re
import numpy as np
import pandas as pd

# 2) Paths — EDIT THESE AS NEEDED
DATA_DIR = "/content/drive/My Drive/Data Visualization"  # <-- change to your folder
INPUT_CSV = "Datathon2_drug_review.csv"  # <-- confirm the file name
OUTPUT_CSV = "ready_for_viz_2.csv"

in_path = os.path.join(DATA_DIR, INPUT_CSV)
out_path = os.path.join(DATA_DIR, OUTPUT_CSV)

In [None]:
import nltk
nltk.download('vader_lexicon')

import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
nlp = spacy.load("en_core_web_sm")

# For reproducibility
RANDOM_STATE = 13

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
df = pd.read_csv(in_path)
# Basic cleanup
df = df.rename(columns=str.strip)
df = df.dropna(subset=['rating'])  # ensure rating exists
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

In [None]:
all_conditions = set(df['condition'].dropna().unique())
print(f"Total unique conditions: {len(all_conditions)}")
for cond in sorted(all_conditions):
    print(cond)

In [None]:
# 1) Keywords to flag mental-health conditions (edit/extend as needed)
mh_keywords = [
    # core diagnoses
    "depress", "anxiety", "bipolar", "schizo", "psychosis", "psychotic",
    "ptsd", "ocd", "adhd", "add", "autism", "asperger", "panic",
    "insomnia", "sleep", "manic", "mania", "mood", "pmdd", "pms",
    "borderline", "personality disorder", "eating", "anorexia", "bulimia",
    "substance", "alcohol", "opiate", "opioid", "addiction", "dependence",
    "withdrawal", "tourette", "tics",
    # symptoms/states often used as “condition” labels in reviews
    "agitation", "restless", "nervous", "fear", "phobia",
    "stress", "trauma"
]

# 2) Build the matched mental-health condition set (to review first)
def find_keyword(cond):
    for k in mh_keywords:
        if k in cond:
            return k   # first matching keyword
    return None

df['keyword'] = df['condition_clean'].apply(find_keyword)

# Keep only rows where a keyword matched
df_mh = df[df['keyword'].notna()].copy()

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

custom_stops = {
    # add domain-specific stops here
    "covid", "coronavirus"
}
STOP_WORDS |= custom_stops

def clean_text(text):
    if pd.isna(text):
        return ""
    # normalize
    t = text.lower()
    t = re.sub(r"(f|ht)tp\S+\s*", " ", t)              # URLs
    t = re.sub(r"@[A-Za-z0-9_]+", " ", t)              # at handles
    t = re.sub(r"#[A-Za-z0-9_]+", " ", t)              # hashtags
    t = re.sub(r"[^\x00-\x7F]+", " ", t)               # emojis/non-ascii
    t = re.sub(r"[^a-z\s]", " ", t)                    # keep letters/spaces
    t = re.sub(r"\s+", " ", t).strip()

    doc = nlp(t)
    toks = []
    for tok in doc:
        if tok.is_stop:
            continue
        lemma = tok.lemma_.strip()
        if len(lemma) >= 3:
            toks.append(lemma)
    return " ".join(toks)

for col in ['benefitsReview','sideEffectsReview','commentsReview']:
    if col in df_mh.columns:
        df_mh[f'{col}_clean'] = df_mh[col].apply(clean_text)

In [None]:
df_mh.to_csv(out_path, index=False)
print(f"Saved: {out_path} with {len(df_mh)} rows and {df_mh.shape[1]} columns.")

Saved: /content/drive/My Drive/Data Visualization/ready_for_viz_2.csv with 1092 rows and 12 columns.


In [22]:
df_mh

Unnamed: 0.1,Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview,benefitsReview_clean,sideEffectsReview_clean,commentsReview_clean
1,3724,lamictal,9,Highly Effective,Mild Side Effects,bipolar disorder,Lamictal stabilized my serious mood swings. On...,"Drowsiness, a bit of mental numbness. If you t...",Severe mood swings between hypomania and depre...,lamictal stabilize mood swing minute claw wall...,drowsiness bit mental numbness feel sedated ab...,severe mood swing hypomania depression suicide...
2,3824,depakene,4,Moderately Effective,Severe Side Effects,bipolar disorder,Initial benefits were comparable to the brand ...,"Depakene has a very thin coating, which caused...",Depakote was prescribed to me by a Kaiser psyc...,initial benefit comparable brand version drug ...,depakene thin coating cause severe heart burn ...,depakote prescribe kaiser psychiatrist pleasan...
3,969,sarafem,10,Highly Effective,No Side Effects,bi-polar / anxiety,It controlls my mood swings. It helps me think...,I didnt really notice any side effects.,This drug may not be for everyone but its wond...,controll mood swing help think act speak contr...,not notice effect,drug wonderful make totally different person w...
8,2576,klonopin,10,Highly Effective,No Side Effects,panic disorder,I immediately stopped having panic attacks. I...,I experienced no side effects. I was not tire...,I started both klonopin and prozac together. ...,immediately stop have panic attack miraculous ...,experience effect tired feel medicated,start klonopin prozac take klonopin time day g...
9,1093,effexor,1,Marginally Effective,Extremely Severe Side Effects,depression,the presumed benefits were to help with a seve...,here we go.the initial effect would be dry mou...,family doctor initially prescribed wellbutin b...,presume benefit help severe depression beginni...,initial effect dry mouth sleepiness able think...,family doctor initially prescribe wellbutin ps...
...,...,...,...,...,...,...,...,...,...,...,...,...
4121,801,ambien,7,Moderately Effective,No Side Effects,sleep,to get restful sleep,"sleepiness, drowiness, etc",swallow the pill with water before bed. I was ...,restful sleep,sleepiness drowiness etc,swallow pill water bed ambient wake middle nig...
4130,539,trazodone,10,Highly Effective,No Side Effects,chronic severe insomnia,This medication was the only thing that was ab...,There were no side effects.,I usually took 150 mg. per night. Occasionall...,medication thing able sleep regularly year chr...,effect,usually take night occasionally trouble fall a...
4135,2613,adderall-xr,8,Considerably Effective,Moderate Side Effects,adhd,The medication helped the patient focus on dai...,"The patient experienced some side-effects, inc...",The patient was diagnosed with ADHD after desc...,medication help patient focus daily activity t...,patient experience effect include dry mouth in...,patient diagnose adhd describe pattern focus m...
4138,1039,vyvanse,10,Highly Effective,Mild Side Effects,adhd,"Increased focus, attention, productivity. Bett...","Restless legs at night, insomnia, headache (so...","I took adderall once as a child, and it made m...",increase focus attention productivity well abl...,restless leg night insomnia headache pretty ba...,take adderall child miserable want concerta ad...
