Fix wide benchmark (#476)

Quantco · Oct 27, 2021 · b6a3391 · b6a3391
1 parent 5b40ae1
commit b6a3391
Show file tree

Hide file tree

Showing 25 changed files with 7,078 additions and 7,220 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -14,6 +14,7 @@ Unreleased
 
 - Fixed the sign of the log likelihood of the Gaussian distribution (not used for fitting coefficients).
 - Renamed functions checking for qc.matrix compliance to refer to tabmat.
+- Fixed the wide benchmarks which had duplicated columns (categorical and numerical).
 
 2.0.1 - 2021-10-11
 ------------------

diff --git a/docs/_static/headline_benchmark.pdf b/docs/_static/headline_benchmark.pdf
diff --git a/docs/_static/headline_benchmark.png b/docs/_static/headline_benchmark.png
diff --git a/docs/_static/intermediate-housing-l2.pdf b/docs/_static/intermediate-housing-l2.pdf
diff --git a/docs/_static/intermediate-housing-l2.png b/docs/_static/intermediate-housing-l2.png
diff --git a/docs/_static/intermediate-housing-lasso.pdf b/docs/_static/intermediate-housing-lasso.pdf
diff --git a/docs/_static/intermediate-housing-lasso.png b/docs/_static/intermediate-housing-lasso.png
diff --git a/docs/_static/intermediate-insurance-l2.pdf b/docs/_static/intermediate-insurance-l2.pdf
diff --git a/docs/_static/intermediate-insurance-l2.png b/docs/_static/intermediate-insurance-l2.png
diff --git a/docs/_static/intermediate-insurance-lasso.pdf b/docs/_static/intermediate-insurance-lasso.pdf
diff --git a/docs/_static/intermediate-insurance-lasso.png b/docs/_static/intermediate-insurance-lasso.png
diff --git a/docs/_static/narrow-insurance-l2.pdf b/docs/_static/narrow-insurance-l2.pdf
diff --git a/docs/_static/narrow-insurance-l2.png b/docs/_static/narrow-insurance-l2.png
diff --git a/docs/_static/narrow-insurance-lasso.pdf b/docs/_static/narrow-insurance-lasso.pdf
diff --git a/docs/_static/narrow-insurance-lasso.png b/docs/_static/narrow-insurance-lasso.png
diff --git a/docs/_static/wide-insurance-l2.pdf b/docs/_static/wide-insurance-l2.pdf
diff --git a/docs/_static/wide-insurance-l2.png b/docs/_static/wide-insurance-l2.png
diff --git a/docs/_static/wide-insurance-lasso.pdf b/docs/_static/wide-insurance-lasso.pdf
diff --git a/docs/_static/wide-insurance-lasso.png b/docs/_static/wide-insurance-lasso.png
diff --git a/docs/benchmarks.rst b/docs/benchmarks.rst
@@ -1,7 +1,7 @@
 Benchmarks against glmnet and H2O
 =================================
 
-The following benchmarks were run on an Ubuntu 20.04 desktop with a six core Intel i7-4930k processor. 
+The following benchmarks were run on a MacBook Pro laptop with a quad-core Intel Core i5.
 
 The title of each plot refers to both which dataset the benchmark was run on and whether an L2 ridge regression penalty or an L1 lasso penalty was included. For example "Narrow-Insurance-Ridge" was run on the ``narrow-insurance`` dataset with a ridge regression penalty. Each dataset/penalty pair is tested on five distributions that cover most of the common GLM types. The outcome variable is modified appropriately so that the behavior is similar to that expected for the distribution. For example, for the Poisson regression, we predict the number of claims per person. And for the binomial regression, we predict whether any given individual has ever had a claim. For the ``housing`` dataset, we only test three distributions because it does not contain count data that can be used as an outcome.
 

diff --git a/docs/benchmarks/benchmark_data.csv b/docs/benchmarks/benchmark_data.csv
diff --git a/docs/benchmarks/benchmark_figure.py b/docs/benchmarks/benchmark_figure.py
@@ -100,10 +100,8 @@
         )
         plot_df = plot_df.pivot(columns="library_name")
         plot_df.columns = plot_df.columns.get_level_values(1)
+        plot_df = plot_df.sort_index(axis=1).rename(columns={"r-glmnet": "glmnet"})
         plot_df.index = [x.title() for x in plot_df.index]
-        plot_df = plot_df[["h2o", "glum", "r-glmnet"]].rename(
-            columns={"r-glmnet": "glmnet"}
-        )
 
         title = prob_name.title() + "-" + ("Lasso" if reg == "lasso" else "Ridge")
         plot_df.plot.bar(
@@ -170,6 +168,7 @@
         )
         plot_df = plot_df.pivot(columns="library_name")
         plot_df.columns = plot_df.columns.get_level_values(1)
+        plot_df = plot_df.sort_index(axis=1).rename(columns={"r-glmnet": "glmnet"})
         plot_df.index = [x.title() for x in plot_df.index]
 
         title = prob_name.title() + "-" + ("Lasso" if reg == "lasso" else "Ridge")
@@ -237,6 +236,7 @@
 )
 plot_df = plot_df.pivot(columns="library_name")
 plot_df.columns = plot_df.columns.get_level_values(1)
+plot_df = plot_df.sort_index(axis=1).rename(columns={"r-glmnet": "glmnet"})
 plot_df.index = [x.title() for x in plot_df.index]
 
 plot_df.plot.bar(

diff --git a/src/glum_benchmarks/data/create_insurance.py b/src/glum_benchmarks/data/create_insurance.py
@@ -406,7 +406,11 @@ def generate_wide_insurance_dataset(
     transformer = make_column_transformer(
         (
             FunctionTransformer(),
-            lambda x: x.select_dtypes(["number"]).columns,
+            lambda x: [
+                elmt
+                for elmt in x.select_dtypes(["number"]).columns
+                if elmt not in cat_cols
+            ],
         ),
         (
             Pipeline([get_categorizer(col, "cat_" + col) for col in cat_cols]),