In [None]:
# %pip install pandas numpy matplotlib seaborn scikit-learn

# Dataset Building

In [4]:
import pandas as pd
import re

In [14]:
# Positive class: keep only the tweet body, expose it as "text", tag with label 1.
drug_df = (
    pd.read_csv("../Datasets/Drug_related_tweets.csv")[["tweet"]]
    .rename(columns={"tweet": "text"})
    .assign(label=1)
)

print(f"Drug tweets: {len(drug_df)}")
display(drug_df.head(10))

Drug tweets: 53766


Unnamed: 0,text,label
0,Ive tried a few antidepressants over the years...,1
1,My son has Crohns disease and has done very we...,1
2,Quick reduction of symptoms,1
3,Contrave combines drugs that were used for alc...,1
4,I have been on this birth control for one cycl...,1
5,4 days in on first 2 weeks. Using on arms and...,1
6,Ive had the copper coil for about 3 months now...,1
7,This has been great for me. Ive been on it for...,1
8,Ive been on Methadone for over ten years and c...,1
9,I was on this pill for almost two years. It do...,1


In [None]:
# Negative class (medical tweets): keep only the text column and tag it
# with label 0.
medical_df = (
    pd.read_csv("../Datasets/unique_medical_tweets_dataset.csv")[["text"]]
    .assign(label=0)
)

print(f"Medical tweets: {len(medical_df)}")
display(medical_df.head(10))


Medical tweets: 10000


Unnamed: 0,text,label
0,"Staying consistent with my workouts, it pays off.",0
1,This weather is perfect for a long run.,0
2,"I'm all about that gym life, workouts every day.",0
3,I can't believe I have fitness.,0
4,My headache symptoms are just unbearable.,0
5,"Staying consistent with my workouts, it pays off.",0
6,"Just had an amazing run, feeling so fit.",0
7,Just got diagnosed with diabetes. Anyone have ...,0
8,"Staying consistent with my workouts, it pays off.",0
9,Does anyone know good treatments for diabetes?,0


In [None]:
# The file has no header row, so columns are addressed by position.
# Column 0 holds a class code (presumably the sentiment polarity, 0 or 4 --
# TODO confirm against the dataset description); column 5 holds the tweet text.
sentiment_df = pd.read_csv(
    "../Datasets/1.6-million-tweets-dataset.csv",
    encoding="latin-1",
    header=None,
)

# Both class codes are general-purpose tweets, so both halves get label 0.
# Each half keeps the row index of the original frame.
sentiment_df_0 = (
    sentiment_df.loc[sentiment_df[0] == 0, [5]]
    .rename(columns={5: "text"})
    .assign(label=0)
)
sentiment_df_4 = (
    sentiment_df.loc[sentiment_df[0] == 4, [5]]
    .rename(columns={5: "text"})
    .assign(label=0)
)

print(f"General tweets: {len(sentiment_df_0)} {len(sentiment_df_4)}")
display(sentiment_df_0.head(), sentiment_df_4.head())


General tweets: 800000 800000


Unnamed: 0,text,label
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


Unnamed: 0,text,label
800000,I LOVE @Health4UandPets u guys r the best!!,0
800001,im meeting up with one of my besties tonight! ...,0
800002,"@DaRealSunisaKim Thanks for the Twitter add, S...",0
800003,Being sick can be really cheap when it hurts t...,0
800004,@LovesBrooklyn2 he has that effect on everyone,0


In [31]:
# Down-sample every source with a fixed seed so the classes come out balanced:
# 20,000 label-1 rows vs. 10,000 + 5,000 + 5,000 label-0 rows.
drug_df = drug_df.sample(n=20000, random_state=42)
medical_df = medical_df.sample(n=10000, random_state=42)
sentiment_df_0, sentiment_df_4 = (
    part.sample(n=5000, random_state=42)
    for part in (sentiment_df_0, sentiment_df_4)
)

In [32]:
# Stack the four sources, then shuffle the rows (fixed seed) and renumber
# them so the saved file is not grouped by source.
final_df = (
    pd.concat(
        [drug_df, medical_df, sentiment_df_0, sentiment_df_4],
        ignore_index=True,
    )
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

final_df.to_csv("../Datasets/final_dataset_raw.csv", index=False)

# Displayed as the cell output: rows per label.
final_df["label"].value_counts()


label
0    20000
1    20000
Name: count, dtype: int64

# Data Cleaning & Preprocessing

In [None]:
# %pip install nltk

In [34]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources needed by the stop-word list and the lemmatizer.
for _resource in ("stopwords", "wordnet"):
    nltk.download(_resource)

# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\swaga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\swaga\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [35]:
def clean_text(text):
    """Normalize a raw tweet into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase; remove URLs (anything starting with ``http``
    up to the next whitespace), ``@mentions`` and ``#hashtags`` (the tag word
    is removed together with the ``#``); delete every character that is not
    ``a-z`` or whitespace; drop tokens present in the module-level
    ``stop_words``; lemmatize the remaining tokens with the module-level
    ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (``None``, the float ``NaN``
        pandas yields for an empty CSV cell, numbers, ...) is treated as an
        empty tweet instead of raising ``AttributeError``.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; ``""`` when nothing survives.
    """
    # Guard: .lower() only exists on strings, and missing values reach this
    # function as float NaN when the text column is read back from CSV.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"http\S+", "", text)   # URLs
    text = re.sub(r"@\w+", "", text)      # @mentions
    text = re.sub(r"#\w+", "", text)      # #hashtags, including the tag word
    # Characters are deleted, not replaced by a space, so words separated only
    # by punctuation are fused (e.g. "a/b" becomes "ab").
    text = re.sub(r"[^a-z\s]", "", text)
    text = text.strip()

    tokens = text.split()
    # The stop-word test is applied to the surface form, before lemmatization.
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]

    return " ".join(tokens)


In [40]:
# Load the raw dataset written by the dataset-building step.
df = pd.read_csv("../Datasets/final_dataset_raw.csv")

# Build the cleaned frame as one non-mutating chain, which avoids in-place
# edits on a filtered slice and keeps the cell safe to re-run.
df = (
    df.assign(
        # Empty tweets come back from the CSV as NaN; coerce to "" so
        # clean_text always receives a string.
        clean_text=df["text"].fillna("").astype(str).map(clean_text)
    )
    # Keep only rows whose cleaned text is longer than 3 characters.
    .loc[lambda frame: frame["clean_text"].str.len() > 3]
    # Replace the raw text with the cleaned version; the resulting column
    # order is (label, text).
    .drop(columns=["text"])
    .rename(columns={"clean_text": "text"})
)

df.to_csv("../Datasets/final_dataset_cleaned.csv", index=False)

df.head()


Unnamed: 0,label,text
0,0,like l word best tv show ever mean continue wt...
1,1,hiv since refused treatment till felt fine abs...
2,0,headache symptom unbearable
3,1,started brintellix mg depressionanxiety three ...
4,0,anyone else suffering headache lately


In [41]:
# Class balance after the short-text filter removed some rows.
label_counts = df["label"].value_counts()
label_counts

label
1    19995
0    19911
Name: count, dtype: int64