In [1]:
"""
This notebook builds TF-IDF bag-of-words features from the note sections, to be used for distinguishing between the female and male classes.
"""

import numpy as np
import pandas as pd
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import tqdm
import json
from json import JSONEncoder

# NLTK resources used by this notebook: `stopwords` backs stopwords.words(),
# the Punkt models back word_tokenize(), and `wordnet` backs WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# NLTK >= 3.8.2 loads the Punkt models from `punkt_tab` instead of the pickled
# `punkt` package; fetch both so word_tokenize works on old and new releases.
nltk.download('punkt_tab')
# NOTE(review): omw-1.4 is presumably required by the WordNet corpus reader in
# recent NLTK releases -- confirm before removing.
nltk.download('omw-1.4')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/priyankashrestha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/priyankashrestha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/priyankashrestha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/priyankashrestha/nltk_data...
[nltk_data]   Package omw-1.4 is alre

True

In [2]:
class NumpyArrayEncoder(JSONEncoder):
    """JSON encoder that also understands NumPy arrays and NumPy scalars.

    Pass it as ``cls=NumpyArrayEncoder`` to ``json.dump`` / ``json.dumps``.
    """

    def default(self, obj):
        """Convert objects the stock encoder cannot serialize.

        Parameters
        ----------
        obj : object
            The value the base encoder failed to serialize.

        Returns
        -------
        object
            A (nested) Python list for ``np.ndarray`` input, or the matching
            built-in Python scalar for NumPy scalar input.

        Raises
        ------
        TypeError
            Raised by the base class for any other unsupported type.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # NumPy scalars such as np.int64 / np.float32 / np.bool_ are not
        # subclasses of the built-in types, so the base encoder rejects them.
        if isinstance(obj, np.generic):
            return obj.item()
        return super().default(obj)

# tokenizing (with stemming and lemmatizing)
# These helpers do not depend on the input text and the tokenizer runs once
# per document, so they are built a single time instead of on every call.
_PUNCT_DIGIT_TO_SPACE = str.maketrans(
    dict.fromkeys(string.punctuation + '0123456789', " ")
)
_PORTER_STEMMER = PorterStemmer()
_WORDNET_LEMMATIZER = WordNetLemmatizer()


def custom_tokenizer(text):
    """Split ``text`` into stemmed and lemmatized lowercase tokens.

    Steps, in order:
      1. lowercase the text and replace every punctuation character and
         digit with a space;
      2. tokenize with NLTK's ``word_tokenize``;
      3. drop tokens of length 1;
      4. Porter-stem each remaining token, then WordNet-lemmatize the stem.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    list
        The processed tokens, in document order.
    """
    cleaned = text.lower().translate(_PUNCT_DIGIT_TO_SPACE)
    tokens = []
    for word in word_tokenize(cleaned):
        # filter out short words
        if len(word) <= 1:
            continue
        stem = _PORTER_STEMMER.stem(word)
        tokens.append(_WORDNET_LEMMATIZER.lemmatize(stem))
    return tokens

## Filtered Data

In [3]:
# PART 1: read the filtered sections file and split it by sex label
# Only the first 10000 rows of the CSV are kept.
df = pd.read_csv('./processed_data/sections_processed_filtered.csv').head(10000)
df_f = df.loc[df['1'].eq('F')]  # female notes
df_m = df.loc[df['1'].eq('M')]  # male notes

# column '0' carries the section entries; every entry is coerced to str
section_list = list(map(str, df['0'].values.tolist()))


In [6]:
# Hand-picked stop words, merged below with NLTK's English stop list.
my_stop_words = [
    'the', 'and', 'to', 'of', 'was', 'with', 'a', 'on',
    'in', 'for', 'name', 'is', 'patient', 's', 'he', 'at',
    'as', 'or', 'one', 'she', 'his', 'her', 'am', 'were',
    'you', 'pt', 'pm', 'by', 'be', 'had', 'your', 'this',
    'date', 'from', 'there', 'an', 'that', 'p', 'are', 'have',
    'has', 'h', 'but', 'o', 'namepattern', 'which', 'every', 'also',
]
standard_stop = stopwords.words("english")
# De-duplicated union of both lists (set order is arbitrary).
# NOTE(review): the vectorizer cells visible in this notebook pass
# `my_stop_words`, not `total_stop_words` -- confirm which one is intended.
total_stop_words = list(set([*my_stop_words, *standard_stop]))

In [10]:
# Fit TF-IDF on the filtered sections and save the feature names.
# The vectorizer removes stop words *after* custom_tokenizer has stemmed and
# lemmatized the text, so the stop list is run through the same tokenizer;
# otherwise entries whose stem differs from the raw word (e.g. Porter maps
# 'was' to 'wa') would never match and would leak into the vocabulary.
normalized_stop_words = sorted(
    {token for word in my_stop_words for token in custom_tokenizer(word)}
)
tfidf = TfidfVectorizer(
    tokenizer=custom_tokenizer,
    token_pattern=None,  # ignored when a tokenizer is given; None silences the warning
    max_features=3000,
    stop_words=normalized_stop_words,
)
# fit_transform is equivalent to fit() followed by transform() on the same
# data, without tokenizing the corpus twice.
Y = tfidf.fit_transform(section_list)
tfidf_names = list(tfidf.get_feature_names_out())

# One feature name per line. Each name is already a string, so it is written
# as-is (joining it with " " would put a space between every character).
with open("tfidf_labels_16000.txt", "w") as txt_file:
    for feature_name in tfidf_names:
        txt_file.write(feature_name + "\n")
# Dense copy of the sparse TF-IDF matrix, consumed by the serialization cell.
numpyArray = Y.toarray()



In [13]:
# Serialization: write the dense TF-IDF matrix to a JSON file.
numpyData = {"array": numpyArray}
print("serialize NumPy array into JSON and write into a file")
# json.dump encodes straight into the file; the extra json.dumps call that
# used to precede it built the whole matrix as an unused in-memory string,
# doubling the encoding work and memory for no effect on the output file.
with open("numpyData_tfidf_16000.json", "w") as write_file:
    json.dump(numpyData, write_file, cls=NumpyArrayEncoder)
print("Done writing serialized NumPy array into file")

serialize NumPy array into JSON and write into a file
Done writing serialized NumPy array into file


In [14]:
# Wrap the dense TF-IDF matrix in a DataFrame whose columns are the feature
# names, then export that frame as JSON.
results = pd.DataFrame(data=Y.toarray(), columns=tfidf.get_feature_names_out())
results.to_json(path_or_buf=r'tfidf.json')

## Unfiltered Data

In [16]:
# PART 1: read the unfiltered sections file and split it by sex label
# Only the first 10000 rows of the CSV are kept.
df = pd.read_csv('./processed_data/sections.csv').head(10000)
df_f = df.loc[df['1'].eq('F')]  # female notes
df_m = df.loc[df['1'].eq('M')]  # male notes

# column '0' carries the section entries; every entry is coerced to str
section_list = list(map(str, df['0'].values.tolist()))

In [17]:
# Fit TF-IDF on the unfiltered sections and save the feature names.
# The vectorizer removes stop words *after* custom_tokenizer has stemmed and
# lemmatized the text, so the stop list is run through the same tokenizer;
# otherwise entries whose stem differs from the raw word (e.g. Porter maps
# 'was' to 'wa') would never match and would leak into the vocabulary.
normalized_stop_words = sorted(
    {token for word in my_stop_words for token in custom_tokenizer(word)}
)
tfidf = TfidfVectorizer(
    tokenizer=custom_tokenizer,
    token_pattern=None,  # ignored when a tokenizer is given; None silences the warning
    max_features=3000,
    stop_words=normalized_stop_words,
)
# fit_transform is equivalent to fit() followed by transform() on the same
# data, without tokenizing the corpus twice.
Y = tfidf.fit_transform(section_list)
tfidf_names = list(tfidf.get_feature_names_out())

# One feature name per line. Each name is already a string, so it is written
# as-is (joining it with " " would put a space between every character).
with open("tfidf_labels_16000_unfiltered.txt", "w") as txt_file:
    for feature_name in tfidf_names:
        txt_file.write(feature_name + "\n")
# Dense copy of the sparse TF-IDF matrix, consumed by the serialization cell.
numpyArray = Y.toarray()



In [18]:
# Serialization: write the dense TF-IDF matrix to a JSON file.
numpyData = {"array": numpyArray}
print("serialize NumPy array into JSON and write into a file")
# json.dump encodes straight into the file; the extra json.dumps call that
# used to precede it built the whole matrix as an unused in-memory string,
# doubling the encoding work and memory for no effect on the output file.
with open("numpyData_tfidf_16000_unfiltered.json", "w") as write_file:
    json.dump(numpyData, write_file, cls=NumpyArrayEncoder)
print("Done writing serialized NumPy array into file")

serialize NumPy array into JSON and write into a file
Done writing serialized NumPy array into file


In [None]:
# Serialization: write the dense TF-IDF matrix to a JSON file.
# NOTE(review): this cell writes the same `numpyArray` to the same file as the
# preceding serialization cell and has never been executed -- it looks like a
# leftover copy that can be deleted.
numpyData = {"array": numpyArray}
print("serialize NumPy array into JSON and write into a file")
# json.dump encodes straight into the file; no separate json.dumps call is
# needed (it would only build an unused in-memory copy of the JSON text).
with open("numpyData_tfidf_16000_unfiltered.json", "w") as write_file:
    json.dump(numpyData, write_file, cls=NumpyArrayEncoder)
print("Done writing serialized NumPy array into file")