Merge pull request statsmodels#1039 from jseabold/fix-arima-int-name

BUG: Fix ARIMA bugs for small data and data with integer names. Closes statsmodels#1038.
PierreBdR · Aug 14, 2013 · 2d5503a · 2d5503a
2 parents 966e85f + 98afc4d
commit 2d5503a
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 0 deletions.
diff --git a/statsmodels/tsa/arima_model.py b/statsmodels/tsa/arima_model.py
@@ -399,6 +399,13 @@ def _fit_start_params_hr(self, order):
                 armod = AR(endog).fit(ic='bic', trend='nc')
                 arcoefs_tmp = armod.params
                 p_tmp = armod.k_ar
+                # In small samples the BIC-chosen AR lag order plus q can meet
+                # or exceed the number of obs. No consistent way to fix that.
+                if p_tmp + q >= len(endog):
+                    raise ValueError("Proper starting parameters cannot"
+                            " be found for this order with this number "
+                            "of observations. Use the start_params "
+                            "argument.")
                 resid = endog[p_tmp:] - np.dot(lagmat(endog, p_tmp,
                                 trim='both'), arcoefs_tmp)
                 if p < p_tmp + q:

diff --git a/statsmodels/tsa/tests/test_arima.py b/statsmodels/tsa/tests/test_arima.py
@@ -1796,6 +1796,32 @@ def test_bad_start_params():
     arima_mod = ARIMA(np.log(inv), (1,1,2))
     assert_raises(ValueError, mod.fit)
 
+def test_arima_small_data_bug():
+    # Issue 1038: too few observations for the given order; fit must raise
+    from datetime import datetime
+    import statsmodels.api as sm
+
+    vals = [96.2, 98.3, 99.1, 95.5, 94.0, 87.1, 87.9, 86.7402777504474]
+
+    dr = dates_from_range("1990q1", length=len(vals))
+    ts = pandas.TimeSeries(vals, index=dr)
+    df = pandas.DataFrame(ts)
+    mod = sm.tsa.ARIMA(df, (2, 0, 2))
+    assert_raises(ValueError, mod.fit)
+
+def test_arima_dataframe_integer_name():
+    # Smoke test for Issue 1038: DataFrame whose column name is an integer
+    from datetime import datetime
+    import statsmodels.api as sm
+
+    vals = [96.2, 98.3, 99.1, 95.5, 94.0, 87.1, 87.9, 86.7402777504474,
+            94.0, 96.5, 93.3, 97.5, 96.3, 92.]
+
+    dr = dates_from_range("1990q1", length=len(vals))
+    ts = pandas.TimeSeries(vals, index=dr)
+    df = pandas.DataFrame(ts)
+    mod = sm.tsa.ARIMA(df, (2, 0, 2))
+
 if __name__ == "__main__":
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb'], exit=False)
diff --git a/statsmodels/tsa/vector_ar/util.py b/statsmodels/tsa/vector_ar/util.py
@@ -60,6 +60,8 @@ def make_lag_names(names, lag_order, trendorder=1):
     # take care of lagged endogenous names
     for i in range(1, lag_order + 1):
         for name in names:
+            if not isinstance(name, basestring):
+                name = str(name) # will need consistent unicode handling
             lag_names.append('L'+str(i)+'.'+name)
 
     # handle the constant name