Merge pull request #94 from nickmao1994/main

Improve plots, tables and final reports
UBC-MDS · Dec 11, 2021 · 99c47fc · 99c47fc
2 parents f3dfa90 + 9af1d41
commit 99c47fc
Show file tree

Hide file tree

Showing 7 changed files with 461 additions and 142 deletions.
diff --git a/docs/Project_report_milestone2.ipynb b/docs/Project_report_milestone2.ipynb
diff --git a/models/coeff.png b/models/coeff.png
diff --git a/results/model/coeff_bar.png b/results/model/coeff_bar.png
diff --git a/results/model/cv_result.png b/results/model/cv_result.png
diff --git a/src/models/model_building.ipynb b/src/models/model_building.ipynb
diff --git a/src/models/test.py b/src/models/test.py
@@ -20,6 +20,10 @@
 from docopt import docopt
 import matplotlib.pyplot as plt
 import mglearn
+from mglearn.plot_2d_separator import (plot_2d_separator, plot_2d_classification,
+                                plot_2d_scores)
+from mglearn.plot_helpers import cm2 as cm, discrete_scatter
+
 import numpy as np
 import pandas as pd
 import pickle
@@ -100,15 +104,63 @@ def coeff_plot(best_model, out_dir):
     """
     logger.info("Drawing bar plot for coefficents...")
     feature_names = np.array(best_model[:-1].get_feature_names_out())
+    name = []
+    for n in feature_names.tolist():
+        name.append(n.split('__')[1])
     coeffs = best_model.named_steps["logisticregression"].coef_.flatten()
-    coeff_df = pd.DataFrame(coeffs, index=feature_names, columns=["Coefficient"])
+    coeff_df = pd.DataFrame(coeffs, index=name, columns=["Coefficient"])
     coeff_df_sorted = coeff_df.sort_values(by="Coefficient", ascending=False)
     coeff_df_sorted.to_html(os.path.join(out_dir, "coeff_sorted.html"), escape=False)
-    mglearn.tools.visualize_coefficients(coeffs, feature_names, n_top_features=5)
-    plt.savefig(os.path.join(out_dir, "coeff_bar.png"))
+    visualize_coefficients(coeffs, feature_names, n_top_features=5)
+    plt.savefig(os.path.join(out_dir, "coeff_bar.png"), bbox_inches = 'tight')
     logger.info("Bar plot for coefficents saved")
 
-
+
+def visualize_coefficients(coefficients, feature_names, n_top_features=25):
+    """Plot the most negative and most positive coefficients of a linear model.
+
+    Draws a bar chart on a new 15x5 matplotlib figure. Nothing is saved or
+    returned; the caller saves or shows the current figure.
+
+    Parameters
+    ----------
+    coefficients : array-like, shape (n_features,)
+        Model coefficients. Row or column vectors are flattened.
+    feature_names : list or nd-array of strings, shape (n_features,)
+        Feature names for labeling the coefficients.
+    n_top_features : int, default=25
+        How many features to show on each side. The function shows the
+        n_top_features smallest (most negative) coefficients followed by the
+        n_top_features largest (most positive) ones, for a total of
+        2 * n_top_features bars. Values larger than the number of
+        coefficients are capped at that number.
+
+    Raises
+    ------
+    ValueError
+        If ``coefficients`` has more than one non-singleton dimension, or if
+        its length differs from ``len(feature_names)``.
+    """
+    coef = np.asarray(coefficients).squeeze()
+    if coef.ndim > 1:
+        # this is not a row or column vector
+        raise ValueError("coeffients must be 1d array or column vector, got"
+                         " shape {}".format(coef.shape))
+    # ravel() also turns the 0-d result of squeezing a single value back
+    # into a 1-d array so that len() and indexing work below.
+    coef = coef.ravel()
+
+    if len(coef) != len(feature_names):
+        raise ValueError("Number of coefficients {} doesn't match number of"
+                         " feature names {}.".format(len(coef),
+                                                     len(feature_names)))
+    # Cap the per-side count at the number of coefficients so the bar
+    # positions below always match the number of selected values.
+    n_side = min(n_top_features, len(coef))
+    order = np.argsort(coef)
+    negative_coefficients = order[:n_side]
+    # Slice from an explicit start index: order[-0:] would select everything.
+    positive_coefficients = order[len(order) - n_side:]
+    interesting_coefficients = np.hstack([negative_coefficients,
+                                          positive_coefficients])
+    positions = np.arange(len(interesting_coefficients))
+    # plot them
+    plt.figure(figsize=(15, 5))
+    colors = [cm(1) if c < 0 else cm(0)
+              for c in coef[interesting_coefficients]]
+    plt.bar(positions, coef[interesting_coefficients], color=colors)
+    feature_names = np.array(feature_names)
+    # Leave room at the bottom for the rotated tick labels.
+    plt.subplots_adjust(bottom=0.3)
+    plt.xticks(positions, feature_names[interesting_coefficients],
+               rotation=60, ha="right")
+    plt.ylabel("Coefficient magnitude")
+    plt.xlabel("Feature")
+
+
 if __name__ == "__main__":
 
     # Parse command line parameters

diff --git a/src/models/train.py b/src/models/train.py
@@ -179,6 +179,10 @@ def train_plot(train_results, out_dir):
     """
     logger.info("Making train results plot...")
     train_results.plot(x="param_logisticregression__C", y="mean_test_score")
+    plt.plot(100, 0.826403, marker="o", markersize=10, markeredgecolor="red", markerfacecolor="red")
+    plt.xlabel("Hyperparameter of logistic regression C")
+    plt.ylabel("Mean test score")
+    plt.legend(["Mean test score", "Best estimator"])
     plt.xscale("log")
     plt.savefig(out_dir)
     logger.info(f"Train results plot saved to {out_dir}")