In [4]:
''' This file has our baseline model implementation'''
import pandas as pd
import re
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import json
import sys
import json
from joblib import dump, load

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [47]:
def save_dict(filepath, metrics_dict):
    """Serialize ``metrics_dict`` as JSON into the file at ``filepath``.

    The file is opened in write mode, so existing content is replaced.
    A progress message is printed before and after the write.
    """
    print("starting writing dictionary to a file")
    with open(filepath, 'w') as out_file:
        json.dump(metrics_dict, out_file)
    print("done writing dict into .txt file")

In [1]:
def _as_feature_matrix(features):
  """Stack a Series of per-row feature vectors into a 2-D array.

  Anything that is not a non-empty, object-dtype pandas Series is returned
  unchanged.
  """
  if isinstance(features, pd.Series) and features.dtype == object and len(features) > 0:
    return np.stack(features.tolist())
  return features


def run_baseline(model, x_train, y_train, x_test, y_test, save_path):
  """Fit ``model``, evaluate it on the test split and persist the results.

  Args:
    model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
    x_train, x_test: feature matrices. A pandas Series holding one feature
      vector per row (the form produced by selecting the 'tk' column in this
      notebook) is stacked into a 2-D array first; other inputs are passed
      through untouched.
    y_train, y_test: labels for the two splits.
    save_path: path prefix; the metrics go to ``f'{save_path}_metrics.txt'``
      and the fitted model to ``f'{save_path}.joblib'``.

  Returns:
    The dictionary produced by ``classification_report(..., output_dict=True)``.
  """
  model.fit(_as_feature_matrix(x_train), y_train)
  y_pred = model.predict(_as_feature_matrix(x_test))
  # The report already carries an 'accuracy' entry, so no separate
  # accuracy_score call is needed.
  metric_dict = classification_report(y_test, y_pred, output_dict=True)
  save_dict(f'{save_path}_metrics.txt', metric_dict)
  dump(model, f'{save_path}.joblib')
  return metric_dict

#### Filtered

In [4]:
# Load the feature matrix: the JSON document stores it under the "array" key.
with open("[CHANGE].json", "r") as read_file:
    data = json.load(read_file)

# Build the array once the file has been parsed and closed.
X = np.array(data["array"])

In [6]:
# get Y data
# Source table for the labels; per the original note, all mentions of sex
# were taken out of this file. Column '1' is used as the class label later on.
sections_filtered = pd.read_csv('[CHANGE].csv')

ValueError: Expected a 1D array, got an array with shape (331156, 3000)

In [10]:
# Store one row of X per table row in a new 'tk' column.
# NOTE(review): assumes X has exactly as many rows as the table - confirm.
sections_filtered['tk'] = list(X)
sections_filtered = sections_filtered.dropna() # drop every row containing a NaN
# Index labels of the rows that survived dropna.
indices_kept = sections_filtered.index.tolist()

In [11]:
# Report the class distribution of column '1', then downsample every class to
# the size of the smallest one and shuffle the rows (fixed seeds throughout).
class_counts = sections_filtered['1'].value_counts()
print(class_counts)
smallest_class = min(class_counts)
sections_filtered = (
    sections_filtered
    .groupby('1')
    .sample(n=smallest_class, random_state=21)  # balance classes
    .sample(frac=1, random_state=21)  # shuffle the data
)

1
F    165681
M    158814
Name: count, dtype: int64


In [12]:
# Hold out 20% of the balanced rows as the test split; the seed is fixed.
sections_filtered_tune, sections_filtered_test = train_test_split(sections_filtered, test_size=0.2, random_state=21)

In [13]:
# Shrink both splits to fixed sizes: 10000 tuning rows and 5000 test rows.
sections_filtered_tune = sections_filtered_tune.sample(10000, random_state=21)
sections_filtered_test = sections_filtered_test.sample(5000, random_state=21)

In [1]:
# Display the frame for inspection; it must already be defined by the cells
# above, otherwise this raises NameError.
sections_filtered

NameError: name 'sections_filtered' is not defined

In [14]:
# Persist the tuning and test subsets without the index column.
# NOTE(review): 'tk' holds one array per row (assigned from list(X)); confirm
# that the CSV text form of those arrays can be read back as needed.
sections_filtered_tune.to_csv('./processed_data/sections_filtered_tune_tk.csv', index=False)
sections_filtered_test.to_csv('./processed_data/sections_filtered_test_tk.csv', index=False)

In [15]:
# Keep a 100000-row subset; seeded like every other sampling step in this
# notebook so that re-running the cell yields the same rows.
sections_filtered = sections_filtered.sample(100000, random_state=21)

In [16]:
# Write the 100000-row subset to disk without the index column.
sections_filtered.to_csv('./processed_data/sections_filtered_100000.csv', index=False)

In [None]:
# np.savetxt formats every cell with a numeric format string, which cannot
# handle this frame (string labels in column '1', array cells in 'tk').
# Use DataFrame.to_csv, as the other export cells do.
sections_filtered.to_csv('sections_filtered_test.csv', index=False)

In [5]:
# Reload the frame from the working directory; the file must have been written
# beforehand, otherwise this raises FileNotFoundError.
sections_filtered = pd.read_csv('sections_filtered_test.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'sections_filtered_test.csv'

#### CC

In [46]:
# Load the chief-complaint feature matrix stored under the "array" key.
with open("../cs229_final_project_data/project_data/cc_numpyData_tfidf_16000.json", "r") as read_file:
    data = json.load(read_file)

# Build the array once the file has been parsed and closed.
X = np.array(data["array"])

In [28]:
# Read the chief-complaint table from the same relative data directory as the
# feature JSON, instead of a machine-specific absolute path.
cc = pd.read_csv('../cs229_final_project_data/project_data/chief_complaint_processed_filtered.csv')
# Store one row of X per table row in a new 'tk' column.
# NOTE(review): assumes X has exactly as many rows as the table - confirm.
cc['tk'] = list(X)
cc = cc.dropna() # drop every row containing a NaN
# Index labels of the rows that survived dropna.
indices_kept = cc.index.tolist()

In [29]:
# Report the class distribution of column '1', then downsample every class to
# the size of the smallest one and shuffle the rows (fixed seeds throughout).
cc_class_counts = cc['1'].value_counts()
print(cc_class_counts)
cc_smallest_class = min(cc_class_counts)
cc = (
    cc
    .groupby('1')
    .sample(n=cc_smallest_class, random_state=21)  # balance classes
    .sample(frac=1, random_state=21)  # shuffle the data
)

1
F    162310
M    156084
Name: count, dtype: int64


In [32]:
# Hold out 20% of the balanced rows, then shrink both splits to fixed sizes
# (10000 tuning rows, 5000 test rows).
cc_tune, cc_test = train_test_split(cc, test_size=0.2, random_state=21)
# Assign back to cc_tune: the previous misspelled target left cc_tune at the
# full 80% split, which is what the export cell then wrote to disk.
cc_tune = cc_tune.sample(10000, random_state=21)
cc_test = cc_test.sample(5000, random_state=21)

In [33]:
# Persist the tuning and test subsets without the index column.
# NOTE(review): 'tk' holds one array per row (assigned from list(X)); confirm
# that the CSV text form of those arrays can be read back as needed.
cc_tune.to_csv('./processed_data/cc_tune_tk.csv', index=False)
cc_test.to_csv('./processed_data/cc_test_tk.csv', index=False)

In [34]:
# Keep a 100000-row subset and write it out; seeded like every other sampling
# step in this notebook so that re-running the cell yields the same rows.
cc = cc.sample(100000, random_state=21)
cc.to_csv('./processed_data/cc_100000.csv', index=False)

# Baselines with Gender Filtering

In [2]:
# Features are the per-row vectors in 'tk'; labels are column '1'.
# Note that X is a Series here, so estimators below receive list(x_train).
X = sections_filtered['tk']
Y = sections_filtered['1']
# 80/20 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=21)

NameError: name 'sections_filtered' is not defined

## Logistic Regression

In [43]:
# x_train / x_test hold one feature vector per row, so turn each into a plain
# list of vectors once and reuse it for fitting, predicting and scoring.
train_rows = list(x_train)
test_rows = list(x_test)

clf = LogisticRegression()
clf.fit(train_rows, y_train)
predictions = clf.predict(test_rows)

# Accuracy Score
score = clf.score(test_rows, y_test)
print(f"Logistic Regression Accuracy Score: {score}")

# Per-class precision / recall / f1 as a nested dictionary
metric_dict = classification_report(y_test, predictions, output_dict=True)

metric_dict

Logistic Regression Accuracy Score: 0.76155


{'F': {'precision': 0.7520653218059559,
  'recall': 0.7815713287411401,
  'f1-score': 0.7665344886669604,
  'support': 10017.0},
 'M': {'precision': 0.7718456725755996,
  'recall': 0.7414604828207954,
  'f1-score': 0.7563480304501099,
  'support': 9983.0},
 'accuracy': 0.76155,
 'macro avg': {'precision': 0.7619554971907777,
  'recall': 0.7615159057809677,
  'f1-score': 0.7614412595585351,
  'support': 20000.0},
 'weighted avg': {'precision': 0.7619386838926235,
  'recall': 0.76155,
  'f1-score': 0.7614499180480195,
  'support': 20000.0}}

In [48]:
# Write the logistic-regression report to disk as JSON.
save_dict('./baseline_experiments/lr_pl_filtered_metrics.txt', metric_dict)

starting writing dictionary to a file
done writing dict into .txt file


In [51]:
# Persist the fitted classifier with joblib.
# NOTE(review): the model file is named '..._metrics.joblib' and lands in the
# working directory, while the metrics went to ./baseline_experiments - confirm
# that this name and location are intended.
dump(clf, 'lr_pl_filtered_metrics.joblib')

['lr_pl_filtered_metrics.joblib']

## SVM

In [54]:
from sklearn import svm

In [None]:
# Train and evaluate an SVC through run_baseline, which also writes
# 'svm_pi_filtered_metrics.txt' and 'svm_pi_filtered.joblib'.
# NOTE(review): other artifacts here use the 'pl' infix ('lr_pl_filtered_...');
# confirm that 'pi' is intended.
svc = svm.SVC(gamma=0.001, C=100)
metrics = run_baseline(svc, x_train, y_train, x_test, y_test, 'svm_pi_filtered')
metrics

In [30]:
# Fit the SVC on the training rows (list() unpacks the per-row vectors).
svc.fit(list(x_train), y_train)

In [None]:
# Predict labels for the held-out rows.
y_pred = svc.predict(list(x_test))

In [None]:
# Accuracy Score of the SVC on the held-out rows
score = svc.score(list(x_test), y_test)
print(f"SVC Accuracy Score: {score}")

SVC Accuracy Score: 0.525


In [None]:
# Classification report for the SVC predictions, as a nested dictionary.
metric_dict = classification_report(y_test, y_pred, output_dict=True)

## Decision Trees

In [139]:
from sklearn.tree import DecisionTreeClassifier

In [140]:
# Gini-criterion decision tree; the seed is fixed (same value as the data
# split) so that repeated runs build the same tree.
dt = DecisionTreeClassifier(criterion = 'gini', random_state=21)

In [141]:
# x_train holds one feature vector per row in this section, so pass it as a
# list of vectors, exactly like the logistic-regression and SVM cells do.
dt.fit(list(x_train), y_train)

In [142]:
# x_test holds one feature vector per row in this section, so pass it as a
# list of vectors, exactly like the logistic-regression and SVM cells do.
y_pred = dt.predict(list(x_test))

In [143]:
# Accuracy of the decision-tree predictions against the held-out labels.
score = accuracy_score(y_test, y_pred)
print(f"Decision Tree Accuracy Score: {score}")

Decision Tree Accuracy Score: 0.488


In [144]:
# Classification report for the decision tree, as a nested dictionary;
# the trailing bare name displays it.
metric_dict = classification_report(y_test, y_pred, output_dict=True)
metric_dict

{'F': {'precision': 0.4643249176728869,
  'recall': 0.4410844629822732,
  'f1-score': 0.45240641711229945,
  'support': 959.0},
 'M': {'precision': 0.5078053259871441,
  'recall': 0.5312199807877042,
  'f1-score': 0.5192488262910798,
  'support': 1041.0},
 'accuracy': 0.488,
 'macro avg': {'precision': 0.4860651218300155,
  'recall': 0.4861522218849887,
  'f1-score': 0.48582762170168964,
  'support': 2000.0},
 'weighted avg': {'precision': 0.48695647020045785,
  'recall': 0.488,
  'f1-score': 0.4871978910898546,
  'support': 2000.0}}

## Random Forest Classifier

In [145]:
from sklearn.ensemble import RandomForestClassifier

In [147]:
# Random forest with default settings; the seed is fixed (same value as the
# data split) so that repeated runs build the same forest.
rc = RandomForestClassifier(random_state=21)

In [148]:
# x_train holds one feature vector per row in this section, so pass it as a
# list of vectors, exactly like the logistic-regression and SVM cells do.
rc.fit(list(x_train), y_train)

In [149]:
# x_test holds one feature vector per row in this section, so pass it as a
# list of vectors, exactly like the logistic-regression and SVM cells do.
y_pred = rc.predict(list(x_test))

In [150]:
# Accuracy of the random-forest predictions against the held-out labels.
score = accuracy_score(y_test, y_pred)
print(f'Random Forest Accuracy: {score}')

Random Forest Accuracy: 0.5275


In [151]:
# Classification report for the random forest, as a nested dictionary;
# the trailing bare name displays it.
metric_dict = classification_report(y_test, y_pred, output_dict=True)
metric_dict 

{'F': {'precision': 0.5084134615384616,
  'recall': 0.4410844629822732,
  'f1-score': 0.4723618090452261,
  'support': 959.0},
 'M': {'precision': 0.541095890410959,
  'recall': 0.6071085494716618,
  'f1-score': 0.5722046174739701,
  'support': 1041.0},
 'accuracy': 0.5275,
 'macro avg': {'precision': 0.5247546759747103,
  'recall': 0.5240965062269676,
  'f1-score': 0.5222832132595981,
  'support': 2000.0},
 'weighted avg': {'precision': 0.5254246657665965,
  'recall': 0.5275,
  'f1-score': 0.5243299908323874,
  'support': 2000.0}}

# Baselines without gender filtering

In [152]:
# Load the unfiltered feature matrix stored under the "array" key.
with open("numpyData_tfidf_16000_unfiltered.json", "r") as read_file:
    data = json.load(read_file)

# Build the array once the file has been parsed and closed.
X = np.array(data["array"])

In [153]:
# get Y data
# This section works on the unfiltered sections table (sections.csv). The
# variable keeps its historical name `sections_filtered` even though the data
# is not filtered here.
# The path is relative to the notebook directory, matching the '../' layout
# used for the other project data files, instead of a machine-specific
# absolute path.
sections_filtered = pd.read_csv('../cs229_final_project/processed_data/sections.csv')
sections_filtered = sections_filtered.dropna() # drop every row containing a NaN
# Labels: column '1' of the first 10000 remaining rows.
# NOTE(review): assumes these rows line up one-to-one with the rows of X - confirm.
Y = sections_filtered.head(10000)['1']

In [154]:
# 80/20 train/test split of the unfiltered data; note the seed here is 0.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

## Logistic Regression

In [155]:
# Default logistic regression on the unfiltered features: construct, fit on
# the training split, then predict the held-out split.
clf = LogisticRegression()
clf = clf.fit(x_train, y_train)  # fit() hands back the fitted estimator itself
predictions = clf.predict(x_test)

In [156]:
# Accuracy Score of the logistic regression on the held-out split
score = clf.score(x_test, y_test)
print(f"Logistic Regression Accuracy Score: {score}")

Logistic Regression Accuracy Score: 0.5125


In [157]:
# Classification report for the logistic-regression predictions, as a nested dictionary.
metric_dict = classification_report(y_test, predictions, output_dict=True)

In [158]:
# Display the most recently computed classification report.
metric_dict

{'F': {'precision': 0.49067599067599066,
  'recall': 0.43899895724713245,
  'f1-score': 0.4634012107870116,
  'support': 959.0},
 'M': {'precision': 0.5288966725043783,
  'recall': 0.5802113352545629,
  'f1-score': 0.5533669262482822,
  'support': 1041.0},
 'accuracy': 0.5125,
 'macro avg': {'precision': 0.5097863315901845,
  'recall': 0.5096051462508477,
  'f1-score': 0.5083840685176468,
  'support': 2000.0},
 'weighted avg': {'precision': 0.5105698555676664,
  'recall': 0.5125,
  'f1-score': 0.510228365684603,
  'support': 2000.0}}

## SVM

In [159]:
# SVC with gamma=0.001 and C=100 on the unfiltered features: construct, fit on
# the training split, then predict the held-out split.
svc = svm.SVC(C=100, gamma=0.001)
svc = svc.fit(x_train, y_train)  # fit() hands back the fitted estimator itself
y_pred = svc.predict(x_test)

In [160]:
# Accuracy Score of the SVC on the held-out split
score = svc.score(x_test, y_test)
print(f"SVC Accuracy Score: {score}")

SVC Accuracy Score: 0.5195


In [161]:
# Classification report for the SVC predictions, as a nested dictionary;
# the trailing bare name displays it.
metric_dict = classification_report(y_test, y_pred, output_dict=True)
metric_dict

{'F': {'precision': 0.4959677419354839,
  'recall': 0.12825860271115747,
  'f1-score': 0.20381110190555096,
  'support': 959.0},
 'M': {'precision': 0.5228310502283106,
  'recall': 0.8799231508165226,
  'f1-score': 0.6559255281059793,
  'support': 1041.0},
 'accuracy': 0.5195,
 'macro avg': {'precision': 0.5093993960818972,
  'recall': 0.50409087676384,
  'f1-score': 0.4298683150057651,
  'support': 2000.0},
 'weighted avg': {'precision': 0.5099500939019002,
  'recall': 0.5195,
  'f1-score': 0.43913666074287394,
  'support': 2000.0}}

## Decision Tree

In [162]:
# Gini-criterion decision tree on the unfiltered features. The seed is fixed
# (same value as this section's data split) so that repeated runs build the
# same tree.
dt = DecisionTreeClassifier(criterion = 'gini', random_state=0)
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)

In [163]:
# Accuracy of the decision-tree predictions against the held-out labels.
score = accuracy_score(y_test, y_pred)
print(f"Decision Tree Accuracy Score: {score}")

Decision Tree Accuracy Score: 0.5165


In [164]:
# Classification report for the decision tree, as a nested dictionary;
# the trailing bare name displays it.
metric_dict = classification_report(y_test, y_pred, output_dict=True)
metric_dict

{'F': {'precision': 0.4957627118644068,
  'recall': 0.4880083420229406,
  'f1-score': 0.49185496584340516,
  'support': 959.0},
 'M': {'precision': 0.5350378787878788,
  'recall': 0.542747358309318,
  'f1-score': 0.5388650453028135,
  'support': 1041.0},
 'accuracy': 0.5165,
 'macro avg': {'precision': 0.5154002953261427,
  'recall': 0.5153778501661292,
  'f1-score': 0.5153600055731093,
  'support': 2000.0},
 'weighted avg': {'precision': 0.5162054362480739,
  'recall': 0.5165,
  'f1-score': 0.5163237122020272,
  'support': 2000.0}}