In [109]:
''' This file has our baseline model implementation'''
import pandas as pd
import re
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import json

# Baselines with Gender Filtering

In [110]:
# Load the feature matrix used by the "with gender filtering" baselines.
# (Presumably TF-IDF vectors, judging by the file name -- confirm against the export step.)
with open("numpyData_tfidf_16000.json", "r") as read_file:
    data = json.load(read_file)

# The JSON payload stores the matrix under the "array" key; convert it to an ndarray.
X = np.array(data["array"])

In [119]:
# Load the labels from the filtered sections file (all mentions of sex taken out).
# Rows containing any NaN are dropped, then the first 10,000 values of column '1'
# become the label vector.
# NOTE(review): X is loaded separately from JSON -- confirm its rows line up with
# these labels after the dropna().
sections_filtered = pd.read_csv(
    '/Users/priyankashrestha/Documents/stanford/courses_senior/cs229/cs229_final_project/processed_data/sections_processed_filtered.csv'
).dropna()
Y = sections_filtered['1'].head(10000)

In [125]:
# Hold out 20% of the rows as the test split; random_state=0 fixes the shuffle
# so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

## Logistic Regression

In [126]:
# Logistic-regression baseline, constructed with scikit-learn's default hyperparameters.
clf = LogisticRegression()

In [127]:
# Fit the logistic-regression model on the training split.
clf.fit(x_train, y_train)

In [128]:
# Predicted labels for the held-out test split (used by the classification report below).
predictions = clf.predict(x_test)

In [129]:
# Accuracy score of the fitted logistic-regression model on the test split.
score = clf.score(x_test, y_test)
print(f"Logistic Regression Accuracy Score: {score}")

Logistic Regression Accuracy Score: 0.5295


In [130]:
# Precision / recall / F1 / support per label, returned as a nested dict
# (output_dict=True) instead of a formatted string.
metric_dict = classification_report(y_test, predictions, output_dict=True)

In [131]:
# Show the logistic-regression metrics dict as this cell's output.
metric_dict

{'F': {'precision': 0.5105633802816901,
  'recall': 0.45359749739311783,
  'f1-score': 0.4803975704030922,
  'support': 959.0},
 'M': {'precision': 0.5435540069686411,
  'recall': 0.5994236311239193,
  'f1-score': 0.5701233439926907,
  'support': 1041.0},
 'accuracy': 0.5295,
 'macro avg': {'precision': 0.5270586936251656,
  'recall': 0.5265105642585186,
  'f1-score': 0.5252604571978915,
  'support': 2000.0},
 'weighted avg': {'precision': 0.527735001472248,
  'recall': 0.5295,
  'f1-score': 0.5270998355564782,
  'support': 2000.0}}

## SVM

In [132]:
from sklearn import svm

In [133]:
# Support-vector classifier with explicit gamma and C; no kernel argument is passed,
# so scikit-learn's default kernel is used.
# NOTE(review): gamma=0.001 / C=100 look hand-picked -- no tuning step is visible here.
svc = svm.SVC(gamma=0.001, C=100)

In [134]:
# Fit the SVC on the training split.
svc.fit(x_train, y_train)

In [135]:
# Predicted labels for the test split; `y_pred` is reused by later model sections.
y_pred = svc.predict(x_test)

In [136]:
# Accuracy score of the fitted SVC on the test split.
score = svc.score(x_test, y_test)
print(f"SVC Accuracy Score: {score}")

SVC Accuracy Score: 0.525


In [137]:
# Per-label precision / recall / F1 / support for the SVC predictions, as a nested dict.
# This overwrites the `metric_dict` produced for the previous model.
metric_dict = classification_report(y_test, y_pred, output_dict=True)

In [138]:
# Show the SVC metrics dict as this cell's output.
metric_dict

{'F': {'precision': 0.5146579804560261,
  'recall': 0.16475495307612095,
  'f1-score': 0.24960505529225907,
  'support': 959.0},
 'M': {'precision': 0.5268753691671589,
  'recall': 0.8568683957732949,
  'f1-score': 0.6525237746891002,
  'support': 1041.0},
 'accuracy': 0.525,
 'macro avg': {'precision': 0.5207666748115924,
  'recall': 0.5108116744247079,
  'f1-score': 0.4510644149906796,
  'support': 2000.0},
 'weighted avg': {'precision': 0.5210171312801707,
  'recall': 0.525,
  'f1-score': 0.4593242487383149,
  'support': 2000.0}}

## Decision Trees

In [139]:
from sklearn.tree import DecisionTreeClassifier

In [140]:
# Decision-tree baseline using the Gini impurity criterion.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split, so an unseeded tree can differ from run to run even
# though the train/test split itself is seeded.
# NOTE: scores recorded before the seed was added may differ after a re-run.
dt = DecisionTreeClassifier(criterion='gini', random_state=0)

In [141]:
# Fit the decision tree on the training split.
dt.fit(x_train, y_train)

In [142]:
# Predicted labels for the test split (replaces the previous model's `y_pred`).
y_pred = dt.predict(x_test)

In [143]:
# Accuracy of the decision-tree predictions against the test labels.
score = accuracy_score(y_test, y_pred)
print(f"Decision Tree Accuracy Score: {score}")

Decision Tree Accuracy Score: 0.488


In [144]:
# Per-label precision / recall / F1 / support for the decision tree, as a nested dict;
# the bare name on the last line displays it as the cell output.
metric_dict = classification_report(y_test, y_pred, output_dict=True)
metric_dict

{'F': {'precision': 0.4643249176728869,
  'recall': 0.4410844629822732,
  'f1-score': 0.45240641711229945,
  'support': 959.0},
 'M': {'precision': 0.5078053259871441,
  'recall': 0.5312199807877042,
  'f1-score': 0.5192488262910798,
  'support': 1041.0},
 'accuracy': 0.488,
 'macro avg': {'precision': 0.4860651218300155,
  'recall': 0.4861522218849887,
  'f1-score': 0.48582762170168964,
  'support': 2000.0},
 'weighted avg': {'precision': 0.48695647020045785,
  'recall': 0.488,
  'f1-score': 0.4871978910898546,
  'support': 2000.0}}

## Random Forest Classifier

In [145]:
from sklearn.ensemble import RandomForestClassifier

In [147]:
# Random-forest baseline with scikit-learn's default hyperparameters.
# random_state is pinned because the forest bootstraps samples and samples
# features at random; without a seed the reported scores change on every run.
# NOTE: scores recorded before the seed was added may differ after a re-run.
rc = RandomForestClassifier(random_state=0)

In [148]:
# Fit the random forest on the training split.
rc.fit(x_train, y_train)

In [149]:
# Predicted labels for the test split (replaces the previous model's `y_pred`).
y_pred = rc.predict(x_test)

In [150]:
# Accuracy of the random-forest predictions against the test labels.
score = accuracy_score(y_test, y_pred)
print(f'Random Forest Accuracy: {score}')

Random Forest Accuracy: 0.5275


In [151]:
# Per-label precision / recall / F1 / support for the random forest, as a nested dict;
# the bare name on the last line displays it as the cell output.
metric_dict = classification_report(y_test, y_pred, output_dict=True)
metric_dict 

{'F': {'precision': 0.5084134615384616,
  'recall': 0.4410844629822732,
  'f1-score': 0.4723618090452261,
  'support': 959.0},
 'M': {'precision': 0.541095890410959,
  'recall': 0.6071085494716618,
  'f1-score': 0.5722046174739701,
  'support': 1041.0},
 'accuracy': 0.5275,
 'macro avg': {'precision': 0.5247546759747103,
  'recall': 0.5240965062269676,
  'f1-score': 0.5222832132595981,
  'support': 2000.0},
 'weighted avg': {'precision': 0.5254246657665965,
  'recall': 0.5275,
  'f1-score': 0.5243299908323874,
  'support': 2000.0}}

# Baselines without gender filtering

In [152]:
# Load the feature matrix used by the "without gender filtering" baselines.
# (Presumably TF-IDF vectors, judging by the file name -- confirm against the export step.)
# This replaces the X that was loaded for the filtered run.
with open("numpyData_tfidf_16000_unfiltered.json", "r") as read_file:
    data = json.load(read_file)

# The JSON payload stores the matrix under the "array" key; convert it to an ndarray.
X = np.array(data["array"])

In [153]:
# Load the labels from the unfiltered sections file (sections.csv).
# Rows containing any NaN are dropped, then the first 10,000 values of column '1'
# become the label vector.
# NOTE(review): the frame keeps the name `sections_filtered` even though this file is
# not the filtered one, and it overwrites the frame loaded for the filtered run.
# NOTE(review): X is loaded separately from JSON -- confirm its rows line up with
# these labels after the dropna().
sections_filtered = pd.read_csv(
    '/Users/priyankashrestha/Documents/stanford/courses_senior/cs229/cs229_final_project/processed_data/sections.csv'
).dropna()
Y = sections_filtered['1'].head(10000)

In [154]:
# Hold out 20% of the unfiltered rows as the test split; random_state=0 fixes the
# shuffle so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

## Logistic Regression

In [155]:
# Logistic-regression baseline on the unfiltered data: build with default
# hyperparameters and fit in one expression (fit() returns the estimator itself),
# then predict labels for the held-out test split.
clf = LogisticRegression().fit(x_train, y_train)
predictions = clf.predict(x_test)

In [156]:
# Accuracy score of the fitted logistic-regression model on the unfiltered test split.
score = clf.score(x_test, y_test)
print(f"Logistic Regression Accuracy Score: {score}")

Logistic Regression Accuracy Score: 0.5125


In [157]:
# Per-label precision / recall / F1 / support for the logistic-regression
# predictions, returned as a nested dict (output_dict=True).
metric_dict = classification_report(y_test, predictions, output_dict=True)

In [158]:
# Show the logistic-regression metrics dict as this cell's output.
metric_dict

{'F': {'precision': 0.49067599067599066,
  'recall': 0.43899895724713245,
  'f1-score': 0.4634012107870116,
  'support': 959.0},
 'M': {'precision': 0.5288966725043783,
  'recall': 0.5802113352545629,
  'f1-score': 0.5533669262482822,
  'support': 1041.0},
 'accuracy': 0.5125,
 'macro avg': {'precision': 0.5097863315901845,
  'recall': 0.5096051462508477,
  'f1-score': 0.5083840685176468,
  'support': 2000.0},
 'weighted avg': {'precision': 0.5105698555676664,
  'recall': 0.5125,
  'f1-score': 0.510228365684603,
  'support': 2000.0}}

## SVM

In [159]:
# SVC baseline on the unfiltered data, using the same C / gamma values as the
# filtered run. Construction and fitting are chained because fit() returns the
# estimator itself; the test-split predictions are stored in `y_pred`.
svc = svm.SVC(C=100, gamma=0.001).fit(x_train, y_train)
y_pred = svc.predict(x_test)

In [160]:
# Accuracy score of the fitted SVC on the unfiltered test split.
score = svc.score(x_test, y_test)
print(f"SVC Accuracy Score: {score}")

SVC Accuracy Score: 0.5195


In [161]:
# Per-label precision / recall / F1 / support for the SVC, as a nested dict;
# the bare name on the last line displays it as the cell output.
metric_dict = classification_report(y_test, y_pred, output_dict=True)
metric_dict

{'F': {'precision': 0.4959677419354839,
  'recall': 0.12825860271115747,
  'f1-score': 0.20381110190555096,
  'support': 959.0},
 'M': {'precision': 0.5228310502283106,
  'recall': 0.8799231508165226,
  'f1-score': 0.6559255281059793,
  'support': 1041.0},
 'accuracy': 0.5195,
 'macro avg': {'precision': 0.5093993960818972,
  'recall': 0.50409087676384,
  'f1-score': 0.4298683150057651,
  'support': 2000.0},
 'weighted avg': {'precision': 0.5099500939019002,
  'recall': 0.5195,
  'f1-score': 0.43913666074287394,
  'support': 2000.0}}

## Decision Trees

In [162]:
# Decision-tree baseline (Gini impurity) on the unfiltered data.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split, so an unseeded tree can differ from run to run even
# though the train/test split itself is seeded.
# NOTE: scores recorded before the seed was added may differ after a re-run.
dt = DecisionTreeClassifier(criterion='gini', random_state=0)
dt.fit(x_train, y_train)
# Predicted labels for the test split (replaces the previous model's `y_pred`).
y_pred = dt.predict(x_test)

In [163]:
# Accuracy of the decision-tree predictions against the unfiltered test labels.
score = accuracy_score(y_test, y_pred)
print(f"Decision Tree Accuracy Score: {score}")

Decision Tree Accuracy Score: 0.5165


In [164]:
# Per-label precision / recall / F1 / support for the decision tree, as a nested dict;
# the bare name on the last line displays it as the cell output.
metric_dict = classification_report(y_test, y_pred, output_dict=True)
metric_dict

{'F': {'precision': 0.4957627118644068,
  'recall': 0.4880083420229406,
  'f1-score': 0.49185496584340516,
  'support': 959.0},
 'M': {'precision': 0.5350378787878788,
  'recall': 0.542747358309318,
  'f1-score': 0.5388650453028135,
  'support': 1041.0},
 'accuracy': 0.5165,
 'macro avg': {'precision': 0.5154002953261427,
  'recall': 0.5153778501661292,
  'f1-score': 0.5153600055731093,
  'support': 2000.0},
 'weighted avg': {'precision': 0.5162054362480739,
  'recall': 0.5165,
  'f1-score': 0.5163237122020272,
  'support': 2000.0}}