In [2]:
import pandas as pd
import re #re stands for the regular expression module, which provides a set of tools for matching and manipulating text patterns.
import spacy

In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
# Loading the dataset
data = pd.read_csv('data_after_EDA.csv')
# Print the first 5 rows of the dataframe.
data.head()

Unnamed: 0,ROWNUM,Hospital,Eligibility Class,Gender,Arrival Time,Severity Level,Deparment,Main Diagnosis,Discharge Time,Waiting Time (Minutes),Length of Stay (Minutes),Treatment Time(Minutes),Cluster,No Treatment
0,1,Royal Commission Health Services Program,ROYAL COMMISSION,Female,2023-12-13 13:17:48,Level Ⅳ,Emergency Medicine,"Pain, unspecified",2023-12-13 16:43:00,14.0,205.0,191.0,2,0
1,2,Royal Commission Health Services Program,ROYAL COMMISSION,Female,2023-12-08 10:59:28,Level Ⅲ,Emergency Medicine,Low back pain,2023-12-08 12:50:00,7.0,111.0,104.0,1,0
2,3,Royal Commission Health Services Program,ROYAL COMMISSION,Female,2023-11-05 14:03:02,Level Ⅲ,Emergency Medicine,"Acute upper respiratory infection, unspecified",2023-11-05 14:54:00,24.0,51.0,27.0,1,0
3,4,Royal Commission Health Services Program,ROYAL COMMISSION,Female,2023-10-07 22:57:41,Level Ⅲ,Emergency Medicine,Epistaxis,2023-10-08 00:09:00,26.0,71.0,0.0,1,1
4,5,Royal Commission Health Services Program,ROYAL COMMISSION,Female,2023-10-21 21:32:17,Level Ⅳ,Emergency Medicine,"Acute upper respiratory infection, unspecified",2023-10-21 23:10:00,56.0,98.0,42.0,0,0


In [5]:

df = pd.DataFrame(data, columns=['Main Diagnosis'])

# Step 1: Lowercase all the entries in the 'Main Diagnosis' column
df['Main Diagnosis'] = df['Main Diagnosis'].str.lower()

# Step 2: Handle missing values by replacing NaN or empty strings with "Unknown"
df['Main Diagnosis'] = df['Main Diagnosis'].fillna('Unknown')

# Step 3: Remove punctuation or special characters from the diagnoses
df['Main Diagnosis'] = df['Main Diagnosis'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Step 4: Standardize some common diagnoses (as an example)
df['Main Diagnosis'] = df['Main Diagnosis'].replace({
    'acute upper respiratory infection unspecified': 'upper respiratory infection',
    'low back pain': 'back pain',
   
})

print(df['Main Diagnosis'])



0                                         pain unspecified
1                                                back pain
2                              upper respiratory infection
3                                                epistaxis
4                              upper respiratory infection
                               ...                        
97085                                   asthma unspecified
97086                          upper respiratory infection
97087                          upper respiratory infection
97088    cutaneous abscess furuncle and carbuncle unspe...
97089                          pain in limb multiple sites
Name: Main Diagnosis, Length: 97090, dtype: object


In [6]:
from tqdm import tqdm
import spacy # to visualiza the process

In [7]:
#Tokenizer
# Function to apply spaCy processing and tokenize the Main Diagnosis column
def spacy_tokenizer(text):
    doc = nlp(text)
    return [token.text for token in doc]

# Apply spaCy tokenizer to the 'Main Diagnosis' column
tqdm.pandas()  
df['Main Diagnosis Tokens'] = df['Main Diagnosis'].progress_apply(lambda x: spacy_tokenizer(str(x).lower()))

# Display the tokenized diagnosis column
print(df[['Main Diagnosis', 'Main Diagnosis Tokens']])



100%|██████████| 97090/97090 [09:30<00:00, 170.05it/s]

                                          Main Diagnosis  \
0                                       pain unspecified   
1                                              back pain   
2                            upper respiratory infection   
3                                              epistaxis   
4                            upper respiratory infection   
...                                                  ...   
97085                                 asthma unspecified   
97086                        upper respiratory infection   
97087                        upper respiratory infection   
97088  cutaneous abscess furuncle and carbuncle unspe...   
97089                        pain in limb multiple sites   

                                   Main Diagnosis Tokens  
0                                    [pain, unspecified]  
1                                           [back, pain]  
2                        [upper, respiratory, infection]  
3                                          




In [8]:
# Remove the stop words
#  Load spaCy stop words
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Function to remove stop words from the tokens
def remove_stop_words(tokens):
    return [token for token in tokens if token not in stop_words]

# Apply tqdm to monitor the process
tqdm.pandas(desc="Removing Stop Words")

# Apply the stop word removal function with a progress bar
df['Main Diagnosis Tokens Without Stopwords'] = df['Main Diagnosis Tokens'].progress_apply(remove_stop_words)

# Display the updated tokenized diagnosis column without stop words
print(df[['Main Diagnosis', 'Main Diagnosis Tokens Without Stopwords']])

Removing Stop Words: 100%|██████████| 97090/97090 [00:00<00:00, 327432.86it/s]

                                          Main Diagnosis  \
0                                       pain unspecified   
1                                              back pain   
2                            upper respiratory infection   
3                                              epistaxis   
4                            upper respiratory infection   
...                                                  ...   
97085                                 asthma unspecified   
97086                        upper respiratory infection   
97087                        upper respiratory infection   
97088  cutaneous abscess furuncle and carbuncle unspe...   
97089                        pain in limb multiple sites   

                 Main Diagnosis Tokens Without Stopwords  
0                                    [pain, unspecified]  
1                                                 [pain]  
2                        [upper, respiratory, infection]  
3                                          




In [25]:
from tqdm import tqdm

# Enable progress bar for pandas
tqdm.pandas(desc="Lemmatizing Tokens")

# Function to apply spaCy lemmatization to tokens
def lemmatize_tokens(tokens):
    doc = nlp(" ".join(tokens))  # Process the tokens into a single string
    return [token.lemma_ for token in doc]  # Return the lemmatized tokens

# Apply the lemmatization function with a progress bar
df['Main Diagnosis Lemmatized'] = df['Main Diagnosis Tokens Without Stopwords'].progress_apply(lemmatize_tokens)

# Display the lemmatized tokens
print(df[['Main Diagnosis', 'Main Diagnosis Lemmatized']])


Lemmatizing Tokens: 100%|██████████| 97090/97090 [10:10<00:00, 159.00it/s]

                                          Main Diagnosis  \
0                                       pain unspecified   
1                                              back pain   
2                            upper respiratory infection   
3                                              epistaxis   
4                            upper respiratory infection   
...                                                  ...   
97085                                 asthma unspecified   
97086                        upper respiratory infection   
97087                        upper respiratory infection   
97088  cutaneous abscess furuncle and carbuncle unspe...   
97089                        pain in limb multiple sites   

                               Main Diagnosis Lemmatized  
0                                    [pain, unspecified]  
1                                                 [pain]  
2                        [upper, respiratory, infection]  
3                                          




In [29]:
import nltk
import heapq
import numpy as np

In [30]:


# 从 DataFrame 提取已经词形还原的诊断数据
dataset = df['Main Diagnosis Lemmatized']

# 创建词频字典
word2count = {}
for data in dataset:
    for word in data:  # 直接迭代已词形还原的单词列表
        if word not in word2count.keys():
            word2count[word] = 1
        else:
            word2count[word] += 1

# 使用 heapq 找出出现频率最高的100个词
freq_words = heapq.nlargest(100, word2count, key=word2count.get)

# 创建词袋模型的向量表示
X = []
for data in dataset:
    vector = []
    for word in freq_words:
        if word in data:  # 直接检查该词是否在已词形还原的列表中
            vector.append(1)
        else:
            vector.append(0)
    X.append(vector)
X = np.asarray(X)


In [31]:
# 将向量添加到 DataFrame
df['BoW_Vector'] = list(X)

# 打印更新后的 DataFrame 的部分内容来查看结果
print(df[['Main Diagnosis', 'Main Diagnosis Lemmatized', 'BoW_Vector']].head())


                Main Diagnosis        Main Diagnosis Lemmatized  \
0             pain unspecified              [pain, unspecified]   
1                    back pain                           [pain]   
2  upper respiratory infection  [upper, respiratory, infection]   
3                    epistaxis                      [epistaxis]   
4  upper respiratory infection  [upper, respiratory, infection]   

                                          BoW_Vector  
0  [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
2  [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
4  [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  


In [35]:


# rewrite the result of nlp to new csv
vector_df = pd.DataFrame(df['BoW_Vector'].tolist(), columns=[f'word_{i}' for i in range(len(df['BoW_Vector'].iloc[0]))])

new_df = pd.concat([df, vector_df], axis=1)

new_df.to_csv('updated_dataset.csv', index=False)


In [36]:
print(new_df.head())

                Main Diagnosis            Main Diagnosis Tokens  \
0             pain unspecified              [pain, unspecified]   
1                    back pain                     [back, pain]   
2  upper respiratory infection  [upper, respiratory, infection]   
3                    epistaxis                      [epistaxis]   
4  upper respiratory infection  [upper, respiratory, infection]   

  Main Diagnosis Tokens Without Stopwords        Main Diagnosis Lemmatized  \
0                     [pain, unspecified]              [pain, unspecified]   
1                                  [pain]                           [pain]   
2         [upper, respiratory, infection]  [upper, respiratory, infection]   
3                             [epistaxis]                      [epistaxis]   
4         [upper, respiratory, infection]  [upper, respiratory, infection]   

  Main Diagnosis Lemmatized Text               Processed Text  \
0               pain unspecified             pain unspecified  