In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from wordcloud import WordCloud
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load structured clinical data
structured_data = pd.DataFrame({
    'PatientID': [1, 2, 3, 4, 5],
    'Age': [34, 45, 23, 50, 40],
    'Gender': ['M', 'F', 'F', 'M', 'M'],
    'Diagnosis': ['Diabetes', 'Hypertension', 'Asthma', 'Diabetes', 'Hypertension']
})
print(structured_data.head())

# Load unstructured clinical data (clinical notes)
unstructured_data = [
    "Patient has a history of hypertension and diabetes. Prescribed medication X.",
    "Asthma diagnosis confirmed. Patient advised to use inhaler daily.",
    "Hypertension patient. Needs regular monitoring of blood pressure.",
    "Diabetes patient. Recommended diet and exercise.",
    "Patient diagnosed with hypertension. Medication Y prescribed."
]
structured_data.head()

# Check for missing values
structured_data.isnull().sum()

# Encode categorical variables
structured_data['Gender'] = structured_data['Gender'].map({'M': 0, 'F': 1})
print(structured_data.head())
structured_data = pd.get_dummies(structured_data, columns=['Diagnosis'])
print(structured_data.head())
structured_data.head()

# Text preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\W+', ' ', text)  # Remove non-word characters
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    return text

# Apply preprocessing to clinical notes
cleaned_notes = [preprocess_text(note) for note in unstructured_data]
print(cleaned_notes)
"""
# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(max_features=10)
X_tfidf = vectorizer.fit_transform(cleaned_notes).toarray()
pd.DataFrame(X_tfidf, columns=vectorizer.get_feature_names_out()).head()

# Apply PCA to structured data
pca = PCA(n_components=2)
structured_data_pca = pca.fit_transform(structured_data.drop('PatienxtID', axis=1))

# Plot PCA results
plt.figure(figsize=(8, 6))
plt.scatter(structured_data_pca[:, 0], structured_data_pca[:, 1], c='blue', marker='o')
plt.title('PCA of Structured Clinical Data')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid(True)
plt.show()

# Apply KMeans clustering to text data
kmeans = KMeans(n_clusters=2, random_state=0).fit(X_tfidf)

# Plot word cloud for each cluster
for i in range(2):
    cluster_words = ' '.join([cleaned_notes[j] for j in range(len(cleaned_notes)) if kmeans.labels_[j] == i])
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(cluster_words)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud for Cluster {i}')
    plt.show()
"""

   PatientID  Age Gender     Diagnosis
0          1   34      M      Diabetes
1          2   45      F  Hypertension
2          3   23      F        Asthma
3          4   50      M      Diabetes
4          5   40      M  Hypertension
   PatientID  Age  Gender     Diagnosis
0          1   34       0      Diabetes
1          2   45       1  Hypertension
2          3   23       1        Asthma
3          4   50       0      Diabetes
4          5   40       0  Hypertension
   PatientID  Age  Gender  Diagnosis_Asthma  Diagnosis_Diabetes  \
0          1   34       0             False                True   
1          2   45       1             False               False   
2          3   23       1              True               False   
3          4   50       0             False                True   
4          5   40       0             False               False   

   Diagnosis_Hypertension  
0                   False  
1                    True  
2                   False  
3          

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sailesh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


"\n# Vectorize the text using TF-IDF\nvectorizer = TfidfVectorizer(max_features=10)\nX_tfidf = vectorizer.fit_transform(cleaned_notes).toarray()\npd.DataFrame(X_tfidf, columns=vectorizer.get_feature_names_out()).head()\n\n# Apply PCA to structured data\npca = PCA(n_components=2)\nstructured_data_pca = pca.fit_transform(structured_data.drop('PatienxtID', axis=1))\n\n# Plot PCA results\nplt.figure(figsize=(8, 6))\nplt.scatter(structured_data_pca[:, 0], structured_data_pca[:, 1], c='blue', marker='o')\nplt.title('PCA of Structured Clinical Data')\nplt.xlabel('Principal Component 1')\nplt.ylabel('Principal Component 2')\nplt.grid(True)\nplt.show()\n\n# Apply KMeans clustering to text data\nkmeans = KMeans(n_clusters=2, random_state=0).fit(X_tfidf)\n\n# Plot word cloud for each cluster\nfor i in range(2):\n    cluster_words = ' '.join([cleaned_notes[j] for j in range(len(cleaned_notes)) if kmeans.labels_[j] == i])\n    wordcloud = WordCloud(width=800, height=400, background_color='white').g