In [11]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models import LdaModel
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
# Load the dataset
data = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/Fiverr/ak89555/Covid_Clinical_Notes_Data.xlsx')

In [3]:
data

Unnamed: 0,ID,Text
0,1,Right side of epiglottis swelled up and hinder...
1,2,Approximately 30 min post vaccination administ...
2,3,"About 15 minutes after receiving the vaccine, ..."
3,4,"extreme fatigue, dizziness,. could not lift my..."
4,5,"Injection site swelling, redness, warm to the ..."
...,...,...
146617,146618,"10 days after receiving the Janssen vaccine, p..."
146618,146619,2-3 days after receiving the Johnson and Johns...
146619,146620,"headache, chills, and body aches begin on 4/12..."
146620,146621,Acute DVT




In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146622 entries, 0 to 146621
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   ID      146622 non-null  int64 
 1   Text    146571 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.2+ MB


In [19]:
data.isnull().sum()

ID       0
Text    51
dtype: int64

# Data Preprocessing

In [20]:
# Drop rows with missing values in the 'Text' column
data.dropna(subset=['Text'], inplace=True)

In [21]:
# Pre-processing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

In [22]:
# Apply pre-processing to each clinical note
data['Processed_Text'] = data['Text'].apply(preprocess_text)

In [24]:
data['Processed_Text']

0         [right, side, epiglottis, swelled, hinder, swa...
1         [approximately, 30, min, post, vaccination, ad...
2         [15, minute, receiving, vaccine, patient, comp...
3         [extreme, fatigue, dizziness, could, lift, lef...
4         [injection, site, swelling, redness, warm, tou...
                                ...                        
146617    [10, day, receiving, janssen, vaccine, patient...
146618    [23, day, receiving, johnson, johnson, vaccine...
146619    [headache, chill, body, ache, begin, 412, chil...
146620                                         [acute, dvt]
146621    [41021day, shot, small, headache, tired, 41321...
Name: Processed_Text, Length: 146571, dtype: object

# Topic modeling

In [25]:
# Create dictionary and corpus for topic modeling
dictionary = corpora.Dictionary(data['Processed_Text'])
corpus = [dictionary.doc2bow(text) for text in data['Processed_Text']]

# Topic modeling using LDA
lda_model = LdaModel(corpus, num_topics=3, id2word=dictionary, passes=15)

# Print topics
print("Topics:")
for idx, topic in lda_model.print_topics(-1):
    print('Topic {}: {}'.format(idx, topic))

Topics:
Topic 0: 0.055*"patient" + 0.039*"reported" + 0.029*"covid19" + 0.024*"vaccine" + 0.023*"experienced" + 0.022*"received" + 0.019*"dose" + 0.019*"subject" + 0.018*"report" + 0.016*"number"
Topic 1: 0.025*"patient" + 0.013*"vaccine" + 0.011*"blood" + 0.007*"heart" + 0.007*"day" + 0.006*"felt" + 0.006*"hospital" + 0.006*"pressure" + 0.005*"time" + 0.005*"minute"
Topic 2: 0.039*"pain" + 0.028*"headache" + 0.024*"fever" + 0.021*"day" + 0.020*"chill" + 0.019*"body" + 0.018*"arm" + 0.017*"injection" + 0.017*"ache" + 0.013*"site"


# Side effects

In [26]:
# Extract side effects
side_effects = []
# Define a list of common side effect keywords
side_effect_keywords = ['swelling', 'pain', 'chest tightness', 'itchy', 'fatigue', 'dizziness',
                        'redness', 'warm to the touch', 'chills', 'fever', 'abdominal pain',
                        'diarrhea', 'rash', 'hives']

# Identify side effects mentioned in the clinical notes
for note in data['Text']:
    for keyword in side_effect_keywords:
        if keyword in note:
            side_effects.append(keyword)

# Count the occurrences of each side effect
side_effect_counts = pd.Series(side_effects).value_counts()

# Print side effects from most frequent to least frequent
print("\nSide Effects (from most frequent to least frequent):")
print(side_effect_counts)


Side Effects (from most frequent to least frequent):
pain                 33093
fever                25035
chills               23146
fatigue              16568
swelling             12188
rash                 11386
redness               8972
itchy                 7733
dizziness             7317
diarrhea              4580
hives                 3664
abdominal pain        1338
chest tightness       1078
warm to the touch      941
dtype: int64
