# Median Baseline

In [1]:
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
import torch
from collections import defaultdict, Counter
import random
import math
import pickle
import string

import wordfreq
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

import src.eval_metric

%matplotlib inline
%load_ext autoreload
%autoreload 2
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [2]:
# Load the training and validation splits from their CSV files.
train_csv = "../data/training_data/train.csv"
valid_csv = "../data/training_data/valid.csv"
train_df, valid_df = (pd.read_csv(csv_path) for csv_path in (train_csv, valid_csv))

In [3]:
output_var_names = ['nFix', 'FFD', 'GPT', 'TRT', 'fixProp']

# Baseline: predict, for every validation row, the training-set median of each
# output variable. All medians are computed in one call, then broadcast onto a
# copy of valid_df (assign returns a new frame; valid_df itself is untouched).
train_medians = train_df[output_var_names].median()
predict_df = valid_df.assign(**train_medians.to_dict())

In [4]:
# Score the median-baseline predictions against valid_df. As the recorded
# output shows, evaluate prints the MAE per output variable plus the overall
# MAE, and its return value (the overall MAE) is displayed as the cell result.
src.eval_metric.evaluate(predict_df, valid_df)

MAE for nFix: 7.207899377948148
MAE for FFD: 1.1622212627064243
MAE for GPT: 3.5465590928641344
MAE for TRT: 2.7315040522053993
MAE for fixProp: 21.17857366126201
Overall MAE: 7.165351489397224


7.165351489397224

## Simple Feature-based Regression

In [5]:
# Names of the hand-crafted input features; must match the keys that
# get_features returns.
input_var_names = ['length', 'logfreq', 'has_upper', 'has_punct']


def get_features(token):
  """Compute the hand-crafted lexical features for a single token.

  Any '<EOS>' marker is stripped from the token before the features are
  computed.

  Args:
    token: the token text (a string).

  Returns:
    A pd.Series with four entries:
      length:    number of characters in the stripped token
      logfreq:   wordfreq.zipf_frequency of the stripped token for 'en'
      has_upper: 1 if lower-casing changes the token, else 0
      has_punct: 1 if any character is in string.punctuation, else 0
  """
  word = token.replace('<EOS>', '')
  contains_upper = word.lower() != word
  contains_punct = any(ch in string.punctuation for ch in word)
  features = {
    'length': len(word),
    'logfreq': wordfreq.zipf_frequency(word, 'en'),
    'has_upper': int(contains_upper),
    'has_punct': int(contains_punct),
  }
  return pd.Series(features)

def clip_to_100(val):
  """Clamp a scalar to the closed interval [0, 100].

  Values above 100 become 100, values below 0 become 0, and anything else
  (including NaN, for which both comparisons are false) is returned unchanged.
  """
  if val > 100:
    return 100
  return 0 if val < 0 else val

In [6]:
# Compute the lexical features for every training word, then attach them to
# train_df as new columns named by input_var_names.
train_features = train_df["word"].apply(get_features)
train_df[input_var_names] = train_features

In [7]:
# Compute the lexical features for every validation word, then attach them to
# valid_df as new columns named by input_var_names.
valid_features = valid_df["word"].apply(get_features)
valid_df[input_var_names] = valid_features

In [11]:
# Fit one SVR (default hyperparameters) per output variable on the hand-crafted
# features, and overwrite that variable's column in a copy of valid_df with the
# model's prediction.
# NOTE(review): the features are passed to SVR unscaled; SVMs are sensitive to
# feature scale, so standardising the inputs may help, but doing so would
# change the MAE figures recorded below. Confirm before changing.
predict_df = valid_df.copy()
for feat_name in output_var_names:
  model = SVR()
  model.fit(train_df[input_var_names], train_df[feat_name])

  # Clamp the predictions to [0, 100] in a single vectorised call instead of a
  # per-element Python callback; the resulting values are the same.
  raw_prediction = model.predict(predict_df[input_var_names])
  predict_df[feat_name] = np.clip(raw_prediction, 0, 100)

In [12]:
# Score the feature-based regression predictions against valid_df; the overall
# MAE returned by evaluate is displayed as the cell result.
# NOTE(review): by this point valid_df also carries the engineered feature
# columns (length, logfreq, has_upper, has_punct) -- confirm evaluate only
# reads the output-variable columns.
src.eval_metric.evaluate(predict_df, valid_df)

MAE for nFix: 4.4395289012500205
MAE for FFD: 0.7225204687081921
MAE for GPT: 2.7281853027584533
MAE for TRT: 1.72790986942936
MAE for fixProp: 12.076806098390312
Overall MAE: 4.338990128107268


4.338990128107268