In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import joblib
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  
from imblearn.over_sampling import SMOTE   
from sklearn.decomposition import PCA


In [2]:
data = pd.read_csv('./mtsamples.csv') 


In [3]:
import nltk

data['transcription'] = data['transcription'].astype(str)

# Extract the 'transcription' column from the DataFrame
sentences = data['transcription'].tolist()

# Print the sentences before joining
print(f"Sentences before joining: {sentences}")

# Concatenate all sentences into a single string
all_text = ' '.join(sentences)

# Print the concatenated text
print(f"All text: {all_text}")

# Tokenize the entire text
tokenized_sentences = nltk.sent_tokenize(all_text)

# Limit the output to the first 20 tokenized sentences
tokenized_sentences = tokenized_sentences[:20]

# Print the tokenized sentences
print(f"Tokenized Sentences (limited to 20): {tokenized_sentences}")

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



Tokenized Sentences (limited to 20): ['SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.', 'She used to have allergies when she lived in Seattle but she thinks they are worse here.', 'In the past, she has tried Claritin, and Zyrtec.', 'Both worked for short time but then seemed to lose effectiveness.', 'She has used Allegra also.', 'She used that last summer and she began using it again two weeks ago.', 'It does not appear to be working very well.', 'She has used over-the-counter sprays but no prescription nasal sprays.', 'She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.', 'Nasal mucosa was erythematous and swollen.', 'Only clear drainage was

In [4]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable


In [5]:
print(data.head())

   Unnamed: 0                                        description  \
0           0   A 23-year-old white female presents with comp...   
1           1           Consult for laparoscopic gastric bypass.   
2           2           Consult for laparoscopic gastric bypass.   
3           3                             2-D M-Mode. Doppler.     
4           4                                 2-D Echocardiogram   

             medical_specialty                                sample_name  \
0         Allergy / Immunology                         Allergic Rhinitis    
1                   Bariatrics   Laparoscopic Gastric Bypass Consult - 2    
2                   Bariatrics   Laparoscopic Gastric Bypass Consult - 1    
3   Cardiovascular / Pulmonary                    2-D Echocardiogram - 1    
4   Cardiovascular / Pulmonary                    2-D Echocardiogram - 2    

                                       transcription  \
0  SUBJECTIVE:,  This 23-year-old white female pr...   
1  PAST MEDICAL 

In [6]:
!pip install matplotlib


Defaulting to user installation because normal site-packages is not writeable


In [7]:
# Tokenize each sentence into words
tokenized_words = [nltk.word_tokenize(sentence) for sentence in tokenized_sentences]

# Display the tokenized words for each sentence
for i, words in enumerate(tokenized_words):
    print(f"Words in Sentence {i + 1}: {words}")


Words in Sentence 1: ['SUBJECTIVE', ':', ',', 'This', '23-year-old', 'white', 'female', 'presents', 'with', 'complaint', 'of', 'allergies', '.']
Words in Sentence 2: ['She', 'used', 'to', 'have', 'allergies', 'when', 'she', 'lived', 'in', 'Seattle', 'but', 'she', 'thinks', 'they', 'are', 'worse', 'here', '.']
Words in Sentence 3: ['In', 'the', 'past', ',', 'she', 'has', 'tried', 'Claritin', ',', 'and', 'Zyrtec', '.']
Words in Sentence 4: ['Both', 'worked', 'for', 'short', 'time', 'but', 'then', 'seemed', 'to', 'lose', 'effectiveness', '.']
Words in Sentence 5: ['She', 'has', 'used', 'Allegra', 'also', '.']
Words in Sentence 6: ['She', 'used', 'that', 'last', 'summer', 'and', 'she', 'began', 'using', 'it', 'again', 'two', 'weeks', 'ago', '.']
Words in Sentence 7: ['It', 'does', 'not', 'appear', 'to', 'be', 'working', 'very', 'well', '.']
Words in Sentence 8: ['She', 'has', 'used', 'over-the-counter', 'sprays', 'but', 'no', 'prescription', 'nasal', 'sprays', '.']
Words in Sentence 9: ['S

In [8]:
from nltk.stem import PorterStemmer
import pandas as pd
from IPython.display import display
# Create a PorterStemmer instance
porter_stemmer = PorterStemmer()

# Apply stemming to each word in each sentence
stemmed_words = [[porter_stemmer.stem(word) for word in words] for words in tokenized_words]

# Display the stemmed words for each sentence
for i, words in enumerate(stemmed_words):
    print(f"Stemmed Words in Sentence {i + 1}: {words}")


Stemmed Words in Sentence 1: ['subject', ':', ',', 'thi', '23-year-old', 'white', 'femal', 'present', 'with', 'complaint', 'of', 'allergi', '.']
Stemmed Words in Sentence 2: ['she', 'use', 'to', 'have', 'allergi', 'when', 'she', 'live', 'in', 'seattl', 'but', 'she', 'think', 'they', 'are', 'wors', 'here', '.']
Stemmed Words in Sentence 3: ['in', 'the', 'past', ',', 'she', 'ha', 'tri', 'claritin', ',', 'and', 'zyrtec', '.']
Stemmed Words in Sentence 4: ['both', 'work', 'for', 'short', 'time', 'but', 'then', 'seem', 'to', 'lose', 'effect', '.']
Stemmed Words in Sentence 5: ['she', 'ha', 'use', 'allegra', 'also', '.']
Stemmed Words in Sentence 6: ['she', 'use', 'that', 'last', 'summer', 'and', 'she', 'began', 'use', 'it', 'again', 'two', 'week', 'ago', '.']
Stemmed Words in Sentence 7: ['it', 'doe', 'not', 'appear', 'to', 'be', 'work', 'veri', 'well', '.']
Stemmed Words in Sentence 8: ['she', 'ha', 'use', 'over-the-count', 'spray', 'but', 'no', 'prescript', 'nasal', 'spray', '.']
Stemmed 

In [9]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Create a WordNetLemmatizer instance
lemmatizer = WordNetLemmatizer()

# Apply lemmatization to each word in each sentence
lemmatized_words = [[lemmatizer.lemmatize(word) for word in words] for words in tokenized_words]

# Display the lemmatized words for each sentence
for i, words in enumerate(lemmatized_words):
    print(f"Lemmatized Words in Sentence {i + 1}: {words}")


Lemmatized Words in Sentence 1: ['SUBJECTIVE', ':', ',', 'This', '23-year-old', 'white', 'female', 'present', 'with', 'complaint', 'of', 'allergy', '.']
Lemmatized Words in Sentence 2: ['She', 'used', 'to', 'have', 'allergy', 'when', 'she', 'lived', 'in', 'Seattle', 'but', 'she', 'think', 'they', 'are', 'worse', 'here', '.']
Lemmatized Words in Sentence 3: ['In', 'the', 'past', ',', 'she', 'ha', 'tried', 'Claritin', ',', 'and', 'Zyrtec', '.']
Lemmatized Words in Sentence 4: ['Both', 'worked', 'for', 'short', 'time', 'but', 'then', 'seemed', 'to', 'lose', 'effectiveness', '.']
Lemmatized Words in Sentence 5: ['She', 'ha', 'used', 'Allegra', 'also', '.']
Lemmatized Words in Sentence 6: ['She', 'used', 'that', 'last', 'summer', 'and', 'she', 'began', 'using', 'it', 'again', 'two', 'week', 'ago', '.']
Lemmatized Words in Sentence 7: ['It', 'doe', 'not', 'appear', 'to', 'be', 'working', 'very', 'well', '.']
Lemmatized Words in Sentence 8: ['She', 'ha', 'used', 'over-the-counter', 'spray', '

In [10]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')

def clean_text(text):
    # Check if the text is not NaN
    if isinstance(text, str):
        # get English stopwords
        english_stopwords = set(stopwords.words('english'))

        # change to lower case and remove punctuation
        text = text.lower().translate(str.maketrans('', '', string.punctuation))

        # divide string into individual words
        tokens = word_tokenize(text)

        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        clean_tokens = []
        for tok in tokens:
            tok = tok.strip()  # remove space
            if tok not in english_stopwords:
                clean_tok = lemmatizer.lemmatize(tok)  # lemmatization
                clean_tok = stemmer.stem(clean_tok)  # Stemming
                clean_tokens.append(clean_tok)
        return " ".join(clean_tokens)
    else:
        return ""  # Return an empty string for NaN values

# Read the dataset
data = pd.read_csv('./mtsamples.csv')

# Display the first few rows of the original DataFrame
print("Original DataFrame:")
print(data.head())

# Assuming the column you want to clean is 'transcription'
data['cleaned_transcription'] = data['transcription'].apply(clean_text)

# Display both the original and cleaned text columns
print("\nDataFrame with Cleaned Text:")
print(data[['transcription', 'cleaned_transcription']])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\priya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\priya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original DataFrame:
   Unnamed: 0                                        description  \
0           0   A 23-year-old white female presents with comp...   
1           1           Consult for laparoscopic gastric bypass.   
2           2           Consult for laparoscopic gastric bypass.   
3           3                             2-D M-Mode. Doppler.     
4           4                                 2-D Echocardiogram   

             medical_specialty                                sample_name  \
0         Allergy / Immunology                         Allergic Rhinitis    
1                   Bariatrics   Laparoscopic Gastric Bypass Consult - 2    
2                   Bariatrics   Laparoscopic Gastric Bypass Consult - 1    
3   Cardiovascular / Pulmonary                    2-D Echocardiogram - 1    
4   Cardiovascular / Pulmonary                    2-D Echocardiogram - 2    

                                       transcription  \
0  SUBJECTIVE:,  This 23-year-old white female pr...

In [None]:
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from gensim.models import Word2Vec
import nltk

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')

def clean_text(text):
    # Check if the text is not NaN
    if isinstance(text, str):
        # get English stopwords
        english_stopwords = set(stopwords.words('english'))

        # change to lower case and remove punctuation
        text = text.lower().translate(str.maketrans('', '', string.punctuation))

        # divide string into individual words
        tokens = word_tokenize(text)

        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        clean_tokens = []
        for tok in tokens:
            tok = tok.strip()  # remove space
            if tok not in english_stopwords:
                clean_tok = lemmatizer.lemmatize(tok)  # lemmatization
                clean_tok = stemmer.stem(clean_tok)  # Stemming
                clean_tokens.append(clean_tok)
        return clean_tokens
    else:
        return []  # Return an empty list for NaN values

# Read the dataset
data = pd.read_csv('./mtsamples.csv')

# Clean the text and tokenize
data['tokenized_transcription'] = data['transcription'].apply(clean_text)

# Train a Word2Vec model
word2vec_model = Word2Vec(sentences=data['tokenized_transcription'], vector_size=100, window=5, min_count=1, workers=4)

# Check if the model was trained successfully
if not word2vec_model.wv.key_to_index:
    print("Word2Vec model not trained successfully. Check your data and parameters.")
else:
    # Function to average Word2Vec vectors for a sentence
    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = [model.wv[word] for word in words if word in vocabulary]
        return np.mean(feature_vector, axis=0) if feature_vector else np.zeros(num_features)

    # Function to calculate average Word2Vec vectors for a list of sentences
    def get_average_word_vectors(sentences, model, vocabulary, num_features):
        return np.array([average_word_vectors(sentence, model, vocabulary, num_features) for sentence in sentences])

    # Transform the tokenized text to average Word2Vec vectors
    X_w2v = get_average_word_vectors(data['tokenized_transcription'], word2vec_model, word2vec_model.wv.key_to_index, 100)

    # Display the Word2Vec vectors
    print("\nWord2Vec Vectors:")
    print(X_w2v)

# Print additional information about the Word2Vec model
print("\nWord2Vec Model Information:")
print("Vocabulary size:", len(word2vec_model.wv))
print("Training time (seconds):", word2vec_model.total_train_time)


In [None]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Assuming you already have Word2Vec vectors (X_w2v) and structured data (data) from previous steps

# Step 1: Merge datasets based on a common identifier (e.g., sample_name)
merged_data = pd.merge(data, pd.DataFrame(X_w2v), left_index=True, right_index=True)

# Print the columns of the merged dataset
print("Columns in the Merged Dataset:")
print(merged_data.columns)

# Check if 'target_variable' is present in the merged dataset
if 'target_variable' in merged_data.columns:
    # Step 2: Separate features (X) and target variable (y)
    X = merged_data.drop(['target_variable', 'transcription', 'keywords'], axis=1)  # Exclude non-numeric features
    y = merged_data['target_variable']

    # Encode categorical variables if needed (e.g., 'medical_specialty')
    label_encoder = LabelEncoder()
    X['medical_specialty'] = label_encoder.fit_transform(X['medical_specialty'])

    # Step 3: Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Step 4: Train a model (e.g., RandomForestClassifier) using the combined features
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Step 5: Make predictions and evaluate the model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    # Display evaluation metrics
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_rep)
else:
    print("'target_variable' not found in the merged dataset.")


In [1]:

import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from gensim.models import Word2Vec
import nltk

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')

# Function to clean text and tokenize
def clean_text(text):
    if isinstance(text, str):
        english_stopwords = set(stopwords.words('english'))
        text = text.lower().translate(str.maketrans('', '', string.punctuation))
        tokens = word_tokenize(text)
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        clean_tokens = [lemmatizer.lemmatize(stemmer.stem(tok)) for tok in tokens if tok.strip() not in english_stopwords]
        return clean_tokens
    else:
        return []

# Read the dataset
data = pd.read_csv('./mtsamples.csv')

# Clean and tokenize the text
data['tokenized_transcription'] = data['transcription'].apply(clean_text)

# Fall-related keywords
fall_keywords = [
    'dizziness', 'balance issues', 'unsteady gait', 'mobility problems', 'history of falls',
    'vertigo', 'trip', 'stumble', 'loss of consciousness', 'slip', 'instability', 'clumsiness',
    'lightheadedness', 'faint', 'impaired coordination', 'weakness', 'fracture', 'injury',
    'elderly fall risk', 'fall prevention', 'medication side effects', 'orthostatic hypotension',
    'vision problems', 'neurological disorders', 'muscle weakness', 'altered mental status',
    'environmental hazards', 'polypharmacy', 'foot problems', 'alcohol use', 'cognitive impairment',
    'postural instability', 'sensory deficits', 'age-related changes', 'osteoporosis', 'fear of falling',
    'poor proprioception', 'poor reflexes', 'frequent stumbling', 'poor depth perception',
    'gait abnormalities', 'parkinsonism', 'muscle atrophy', 'syncope', 'hypotension', 'seizures',
    'diabetic neuropathy', 'medication adjustments', 'poor coordination', 'fearful of falling',
    'slow reaction time', 'abnormal posture', 'musculoskeletal problems', 'shuffling gait',
    'impaired vision', 'fearful gait', 'neurological deficits', 'foot pain', 'foot deformities',
    'urinary incontinence', 'impaired proprioception', 'lack of exercise', 'dehydration',
    'inadequate lighting', 'improper footwear', 'cognitive decline', 'gastrointestinal issues',
    'inadequate nutrition', 'joint pain', 'environmental obstacles', 'difficulty rising from a chair',
    'difficulty with stairs', 'sedentary lifestyle', 'poor health status'
]

# Function to check if any fall-related keyword is present in the text
def has_fall_keywords(tokens):
    return any(keyword in tokens for keyword in fall_keywords)

# Create a new column 'fall_risk' indicating the presence of fall-related keywords
data['fall_risk'] = data['tokenized_transcription'].apply(has_fall_keywords).astype(int)

# Display the dataset with the new 'fall_risk' column
print(data[['transcription', 'fall_risk']])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\priya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\priya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                          transcription  fall_risk
0     SUBJECTIVE:,  This 23-year-old white female pr...          0
1     PAST MEDICAL HISTORY:, He has difficulty climb...          0
2     HISTORY OF PRESENT ILLNESS: , I have seen ABC ...          0
3     2-D M-MODE: , ,1.  Left atrial enlargement wit...          0
4     1.  The left ventricular cavity size and wall ...          0
...                                                 ...        ...
4994  HISTORY:,  I had the pleasure of meeting and e...          1
4995  ADMITTING DIAGNOSIS: , Kawasaki disease.,DISCH...          0
4996  SUBJECTIVE: , This is a 42-year-old white fema...          0
4997  CHIEF COMPLAINT: , This 5-year-old male presen...          0
4998  HISTORY: , A 34-year-old male presents today s...          0

[4999 rows x 2 columns]


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib  # Import joblib

# Assuming you have a DataFrame 'data' with features and target variable
# For illustration purposes, we'll create a simple DataFrame
data = pd.DataFrame({
    'feature_1': [1, 2, 3, 4, 5],
    'feature_2': [2, 3, 4, 5, 6],
    'target': [0, 0, 1, 1, 0]
})

# Separate features and target variable
X = data[['feature_1', 'feature_2']]
y = data['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a machine learning model (Random Forest for illustration)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Save the trained model to a file using joblib
joblib.dump(model, 'your_model_file.pkl')

# Later, you can load the model using joblib
loaded_model = joblib.load('your_model_file.pkl')

# Now, you can use the loaded_model for making predictions on new data
# For example:
new_data = pd.DataFrame({'feature_1': [6], 'feature_2': [7]})
prediction = loaded_model.predict(new_data)
print(f"Prediction for new data: {prediction}")
