Skip to content

Commit

Permalink
Merge pull request #1 from OscarEngelbrektson/DifferencedSyntheticCon…
Browse files Browse the repository at this point in the history
…trol

Added Differenced Synthetic Control
  • Loading branch information
OscarEngelbrektson committed Dec 7, 2020
2 parents b465590 + 0ad7690 commit a98f9fd
Show file tree
Hide file tree
Showing 7 changed files with 399 additions and 246 deletions.
2 changes: 1 addition & 1 deletion SyntheticControlMethods/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@
# limitations under the License.

from SyntheticControlMethods.__version__ import __version__
from SyntheticControlMethods.main import Synth
from SyntheticControlMethods.main import Synth, DiffSynth
2 changes: 1 addition & 1 deletion SyntheticControlMethods/__version__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

VERSION = (1, 0, 6)
VERSION = (1, 1, 0)

__version__ = '.'.join([str(e) for e in VERSION])
384 changes: 236 additions & 148 deletions SyntheticControlMethods/inferences.py

Large diffs are not rendered by default.

124 changes: 95 additions & 29 deletions SyntheticControlMethods/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@

import pandas as pd
import numpy as np
import copy

from SyntheticControlMethods.plot import Plot
from SyntheticControlMethods.inferences import Inferences

class SynthBase(Inferences, Plot):
class SynthBase(object):

def __init__(self, dataset, outcome_var, id_var, time_var, treatment_period, treated_unit,
covariates, periods_all, periods_pre_treatment, n_controls, n_covariates,
Expand Down Expand Up @@ -110,6 +111,7 @@ def __init__(self, dataset, outcome_var, id_var, time_var, treatment_period, tre
self.v = None
self.treatment_effect = treatment_effect #If known
self.synth_outcome = None
self.synth_constant = None
self.synth_covariates = None
self.min_loss = float("inf")
self.fail_count = 0 #Used to limit number of optimization attempts
Expand All @@ -119,28 +121,13 @@ def __init__(self, dataset, outcome_var, id_var, time_var, treatment_period, tre
self.in_space_placebo_w = None
self.pre_post_rmspe_ratio = None
self.in_time_placebo_outcome = None
self.in_time_placebo_treated_outcome = None
self.in_time_placebo_w = None
self.placebo_treatment_period = None
self.placebo_periods_pre_treatment = None


class Synth(SynthBase):

def __init__(self, dataset, outcome_var, id_var, time_var, treatment_period, treated_unit, **kwargs):

checked_input = self._process_input_data(
dataset, outcome_var, id_var, time_var, treatment_period, treated_unit, **kwargs
)
super(Synth, self).__init__(**checked_input)

#Get synthetic Control
self.optimize(self.treated_outcome, self.treated_covariates,
self.control_outcome, self.control_covariates,
False, 8)

#Visualize synthetic control
self.plot(["original", "pointwise", "cumulative"],
(15, 12), self.treated_unit)
class DataProcessor(object):

def _process_input_data(self, dataset, outcome_var, id_var, time_var, treatment_period, treated_unit, **kwargs):
'''
Expand Down Expand Up @@ -234,28 +221,107 @@ def _process_control_data(self, dataset, outcome_var, id_var, time_var, treatmen
set_index(np.arange(len(control_data[covariates])) // periods_pre_treatment).mean(level=0)).T

return control_outcome_all, control_outcome, control_covariates

class Synth(Inferences, Plot, DataProcessor):
    '''
    Standard Synthetic Control ("SC") estimator.

    Construction processes the raw panel, fits the synthetic control and
    immediately draws the default three-panel figure.
    '''

    def __init__(self, dataset, outcome_var, id_var, time_var, treatment_period, treated_unit, **kwargs):
        '''
        Process the input data, fit the synthetic control and plot it.

        Args:
          dataset, outcome_var, id_var, time_var, treatment_period, treated_unit:
            forwarded unchanged to _process_input_data.
          **kwargs: forwarded unchanged to _process_input_data.
        '''
        #Tag used to identify the estimation method
        self.method = "SC"

        #Validate/reshape the raw input and store it in a SynthBase container
        processed_input = self._process_input_data(
            dataset, outcome_var, id_var, time_var, treatment_period, treated_unit, **kwargs
        )
        original = SynthBase(**processed_input)
        self.original_data = original

        #Fit the synthetic control on the processed data
        #NOTE(review): the trailing False and 8 are positional optimize() arguments
        #whose meaning is not visible here - confirm against Inferences.optimize
        self.optimize(
            original.treated_outcome,
            original.treated_covariates,
            original.control_outcome,
            original.control_covariates,
            original,
            False,
            8,
        )

        #Draw the default panels, labelling the treated series with the treated unit
        default_panels = ["original", "pointwise", "cumulative"]
        default_figsize = (15, 12)
        self.plot(default_panels, default_figsize, original.treated_unit)

class DiffSynth(Inferences, Plot, DataProcessor):

def __init__(self, dataset, outcome_var, id_var, time_var, treatment_period, treated_unit,
             not_diff_cols=None, **kwargs):
    '''
    Process the input data twice (as given and first-differenced) and fit a
    Differenced Synthetic Control ("DSC") on the differenced version.

    Args:
      dataset, outcome_var, id_var, time_var, treatment_period, treated_unit:
        forwarded unchanged to _process_input_data.
      not_diff_cols: forwarded to difference_data together with dataset.
      **kwargs: forwarded unchanged to _process_input_data.

    Side effects:
      Sets self.method, self.original_data and self.modified_data, then calls
      self.optimize. No files are written and no figure is drawn.
    '''
    #Tag used to identify the estimation method
    self.method = "DSC"

    #Process original data - will be used in plotting
    original_checked_input = self._process_input_data(
        dataset, outcome_var, id_var, time_var, treatment_period, treated_unit, **kwargs
    )
    self.original_data = SynthBase(**original_checked_input)

    #Process differenced data - will be used in inference
    modified_dataset = self.difference_data(dataset, not_diff_cols)
    modified_checked_input = self._process_input_data(
        modified_dataset, outcome_var, id_var, time_var, treatment_period, treated_unit, **kwargs
    )
    self.modified_data = SynthBase(**modified_checked_input)

    #Get synthetic Control
    #NOTE(review): the trailing False and 1 are positional optimize() arguments
    #whose meaning is not visible here - confirm against Inferences.optimize
    self.optimize(self.modified_data.treated_outcome, self.modified_data.treated_covariates,
                  self.modified_data.control_outcome, self.modified_data.control_covariates,
                  self.modified_data, False, 1)

    #NOTE: unlike Synth, the constructor does not plot; call self.plot(...) explicitly

def difference_data(self, dataset, not_diff_cols):
    '''
    Takes an appropriately formatted, unprocessed dataset and
    returns a new dataset with first-difference values (change from previous time period)
    computed unitwise for the outcome and the covariates.
    Ready to fit a Differenced Synthetic Control.

    Transformation method - First Differencing:
      Every value is replaced by its change from the previous row of the same unit.
      Rows are assumed to be in chronological order within each unit.

    Additional processing:
      1. Missing values are imputed unitwise using linear interpolation before
         differencing (the first difference is undefined if two consecutive
         periods are not present).
      2. The first time period of every unit is dropped, as the change from the
         previous period is undefined there.

    Args:
      dataset: pandas DataFrame holding the id, time, outcome and covariate columns
        named by self.original_data (id, time, outcome_var, covariates).
      not_diff_cols: covariate column name(s) that are interpolated but NOT
        differenced. None (or an empty collection) means every covariate is
        differenced. The outcome variable is always differenced.

    Returns:
      A new DataFrame; the input dataset is not modified.
    '''
    data = self.original_data

    #Normalise the exclusion list; None means "no covariate is excluded"
    if not_diff_cols is None:
        excluded_cols = set()
    elif isinstance(not_diff_cols, str):
        #A bare column name must not be split into characters by set()
        excluded_cols = {not_diff_cols}
    else:
        excluded_cols = set(not_diff_cols)

    def _interpolate(unit):
        #Fill in missing values using unitwise linear interpolation
        return unit.interpolate(method='linear', limit_direction="both")

    def _first_difference(unit):
        #Differencing inside the group keeps one unit's values out of the next unit's change
        return _interpolate(unit).diff()

    #Work on a copy so the caller's data stays untouched
    modified_dataset = dataset.copy(deep=True)

    #Outcome first, then every covariate; excluded covariates are only interpolated.
    #transform() returns a result aligned to the original index on every pandas version,
    #unlike apply(), which may prepend the group key to the index
    columns = [(data.outcome_var, _first_difference)]
    for col in data.covariates:
        columns.append((col, _interpolate if col in excluded_cols else _first_difference))

    for col, transformation in columns:
        modified_dataset[col] = modified_dataset.groupby(data.id)[col].transform(transformation)

    #Drop first time period for every unit as the change from the previous period is undefined
    first_period = modified_dataset.groupby(data.id)[data.time].transform("min")
    modified_dataset = modified_dataset.loc[modified_dataset[data.time] != first_period].copy()

    #Return resulting dataframe
    return modified_dataset

def difference_data(self):
def demean_data(self):
    '''
    Placeholder for the mean-subtraction transformation; not implemented yet.

    Intended transformation - MeanSubtraction:
      Subtracting the mean of the corresponding variable and unit from every observation,
      i.e. demeaned values computed unitwise for the outcome and all covariates.

    Raises:
      NotImplementedError: always.
    '''
    #The previous draft body after the raise was unreachable and referenced
    #names that do not exist in this scope, so it has been removed
    raise NotImplementedError("demean_data is not implemented yet")
87 changes: 45 additions & 42 deletions SyntheticControlMethods/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,13 @@ class Plot(object):
Class responsible for all plotting functionality in package
'''

def plot(self, panels, figsize=(15, 12),
def plot(self,
panels,
figsize=(15, 12),
treated_label="Treated Unit",
synth_label="Synthetic Control",
treatment_label="Treatment",
in_space_exclusion_multiple=5):
in_space_exclusion_multiple=5):
'''
Supported plots:
original:
Expand Down Expand Up @@ -84,16 +86,17 @@ def plot(self, panels, figsize=(15, 12),
Returns:
'''
data = self.original_data

#Extract Synthetic Control
synth = self.synth_outcome
time = self.dataset[self.time].unique()
synth = data.synth_outcome
time = data.dataset[data.time].unique()

plt = self._get_plotter()
fig = plt.figure(figsize=figsize)
valid_panels = ['original', 'pointwise', 'cumulative',
'in-space placebo', 'pre/post rmspe', 'in-time placebo']
solo_panels = ['pre/post rmspe']
solo_panels = ['pre/post rmspe'] #plots with different axes
for panel in panels:
if panel not in valid_panels:
raise ValueError(
Expand All @@ -113,17 +116,17 @@ def plot(self, panels, figsize=(15, 12),
if 'original' in panels:
ax.set_title("{} vs. {}".format(treated_label, synth_label))
ax.plot(time, synth.T, 'r--', label=synth_label)
ax.plot(time ,self.treated_outcome_all, 'b-', label=treated_label)
ax.axvline(self.treatment_period-1, linestyle=':', color="gray")
ax.plot(time ,data.treated_outcome_all, 'b-', label=treated_label)
ax.axvline(data.treatment_period-1, linestyle=':', color="gray")
ax.annotate(treatment_label,
xy=(self.treatment_period-1, self.treated_outcome[-1]*1.2),
xy=(data.treatment_period-1, data.treated_outcome[-1]*1.2),
xytext=(-160, -4),
xycoords='data',
#textcoords="data",
textcoords='offset points',
arrowprops=dict(arrowstyle="->"))
ax.set_ylabel(self.outcome_var)
ax.set_xlabel(self.time)
ax.set_ylabel(data.outcome_var)
ax.set_xlabel(data.time)
ax.legend()
if idx != n_panels:
plt.setp(ax.get_xticklabels(), visible=False)
Expand All @@ -133,23 +136,23 @@ def plot(self, panels, figsize=(15, 12),

ax = plt.subplot(n_panels, 1, idx, sharex=ax)
#Subtract outcome of synth from both synth and treated outcome
normalized_treated_outcome = self.treated_outcome_all - synth.T
normalized_synth = np.zeros(self.periods_all)
normalized_treated_outcome = data.treated_outcome_all - synth.T
normalized_synth = np.zeros(data.periods_all)
most_extreme_value = np.max(np.absolute(normalized_treated_outcome))

ax.set_title("Pointwise Effects")
ax.plot(time, normalized_synth, 'r--', label=synth_label)
ax.plot(time ,normalized_treated_outcome, 'b-', label=treated_label)
ax.axvline(self.treatment_period-1, linestyle=':', color="gray")
ax.axvline(data.treatment_period-1, linestyle=':', color="gray")
ax.set_ylim(-1.1*most_extreme_value, 1.1*most_extreme_value)
ax.annotate(treatment_label,
xy=(self.treatment_period-1, 0.5*most_extreme_value),
xy=(data.treatment_period-1, 0.5*most_extreme_value),
xycoords='data',
xytext=(-160, -4),
textcoords='offset points',
arrowprops=dict(arrowstyle="->"))
ax.set_ylabel(self.outcome_var)
ax.set_xlabel(self.time)
ax.set_ylabel(data.outcome_var)
ax.set_xlabel(data.time)
ax.legend()
if idx != n_panels:
plt.setp(ax.get_xticklabels(), visible=False)
Expand All @@ -158,23 +161,23 @@ def plot(self, panels, figsize=(15, 12),
if 'cumulative' in panels:
ax = plt.subplot(n_panels, 1, idx, sharex=ax)
#Compute cumulative treatment effect as cumulative sum of pointwise effects
cumulative_effect = np.cumsum(normalized_treated_outcome[self.periods_pre_treatment:])
cummulative_treated_outcome = np.concatenate((np.zeros(self.periods_pre_treatment), cumulative_effect), axis=None)
normalized_synth = np.zeros(self.periods_all)
cumulative_effect = np.cumsum(normalized_treated_outcome[data.periods_pre_treatment:])
cummulative_treated_outcome = np.concatenate((np.zeros(data.periods_pre_treatment), cumulative_effect), axis=None)
normalized_synth = np.zeros(data.periods_all)

ax.set_title("Cumulative Effects")
ax.plot(time, normalized_synth, 'r--', label=synth_label)
ax.plot(time ,cummulative_treated_outcome, 'b-', label=treated_label)
ax.axvline(self.treatment_period-1, linestyle=':', color="gray")
ax.axvline(data.treatment_period-1, linestyle=':', color="gray")
#ax.set_ylim(-1.1*most_extreme_value, 1.1*most_extreme_value)
ax.annotate(treatment_label,
xy=(self.treatment_period-1, cummulative_treated_outcome[-1]*0.3),
xy=(data.treatment_period-1, cummulative_treated_outcome[-1]*0.3),
xycoords='data',
xytext=(-160, -4),
textcoords='offset points',
arrowprops=dict(arrowstyle="->"))
ax.set_ylabel(self.outcome_var)
ax.set_xlabel(self.time)
ax.set_ylabel(data.outcome_var)
ax.set_xlabel(data.time)
ax.legend()
if idx != n_panels:
plt.setp(ax.get_xticklabels(), visible=False)
Expand All @@ -184,25 +187,25 @@ def plot(self, panels, figsize=(15, 12),
#assert self.in_space_placebos != None, "Must run in_space_placebo() before you can plot!"

ax = plt.subplot(n_panels, 1, idx)
zero_line = np.zeros(self.periods_all)
normalized_treated_outcome = self.treated_outcome_all - synth.T
zero_line = np.zeros(data.periods_all)
normalized_treated_outcome = data.treated_outcome_all - synth.T

ax.set_title("In-space placebo's")
ax.plot(time, zero_line, 'k--')

#Plot each placebo
ax.plot(time, self.in_space_placebos[0], ('0.7'), label="Placebos")
for i in range(1, self.n_controls):
ax.plot(time, data.in_space_placebos[0], ('0.7'), label="Placebos")
for i in range(1, data.n_controls):

#If the pre rmspe is not more than
#in_space_exclusion_multiple times larger than synth pre rmspe
if in_space_exclusion_multiple is not None:
if self.pre_post_rmspe_ratio["pre_rmspe"][i] < in_space_exclusion_multiple*self.pre_post_rmspe_ratio["pre_rmspe"][0]:
ax.plot(time, self.in_space_placebos[i], ('0.7'))
if data.pre_post_rmspe_ratio["pre_rmspe"][i] < in_space_exclusion_multiple*data.pre_post_rmspe_ratio["pre_rmspe"][0]:
ax.plot(time, data.in_space_placebos[i], ('0.7'))
else:
ax.plot(time, self.in_space_placebos[i], ('0.7'))
ax.plot(time, data.in_space_placebos[i], ('0.7'))

ax.axvline(self.treatment_period-1, linestyle=':', color="gray")
ax.axvline(data.treatment_period-1, linestyle=':', color="gray")
ax.plot(time, normalized_treated_outcome, 'b-', label=treated_label)

#ax.set_ylim(-1.1*most_extreme_value, 1.1*most_extreme_value)
Expand All @@ -214,8 +217,8 @@ def plot(self, panels, figsize=(15, 12),
textcoords='offset points',
arrowprops=dict(arrowstyle="->"))
'''
ax.set_ylabel(self.outcome_var)
ax.set_xlabel(self.time)
ax.set_ylabel(data.outcome_var)
ax.set_xlabel(data.time)
ax.legend()
if idx != n_panels:
plt.setp(ax.get_xticklabels(), visible=False)
Expand All @@ -228,11 +231,11 @@ def plot(self, panels, figsize=(15, 12),

ax.set_title("Pre/post treatment root mean square prediction error")

ax.hist(self.pre_post_rmspe_ratio["post/pre"], bins=int(max(self.pre_post_rmspe_ratio["post/pre"])),
ax.hist(data.pre_post_rmspe_ratio["post/pre"], bins=int(max(data.pre_post_rmspe_ratio["post/pre"])),
color="#3F5D7D", histtype='bar', ec='black')

ax.annotate(self.treated_unit,
xy=(self.pre_post_rmspe_ratio["post/pre"][0]-0.5, 1),
ax.annotate(data.treated_unit,
xy=(data.pre_post_rmspe_ratio["post/pre"][0]-0.5, 1),
xycoords='data',
xytext=(-100, 80),
textcoords='offset points',
Expand All @@ -249,19 +252,19 @@ def plot(self, panels, figsize=(15, 12),
ax = plt.subplot(n_panels, 1, idx)
ax.set_title("In-time placebo: {} vs. {}".format(treated_label, synth_label))

ax.plot(time, self.in_time_placebo_outcome.T, 'r--', label=synth_label)
ax.plot(time ,self.treated_outcome_all, 'b-', label=treated_label)
ax.plot(time, data.in_time_placebo_outcome.T, 'r--', label=synth_label)
ax.plot(time, data.treated_outcome_all, 'b-', label=treated_label)

ax.axvline(self.placebo_treatment_period, linestyle=':', color="gray")
ax.axvline(data.placebo_treatment_period, linestyle=':', color="gray")
ax.annotate('Placebo Treatment',
xy=(self.placebo_treatment_period, self.treated_outcome_all[self.placebo_periods_pre_treatment]*1.2),
xy=(data.placebo_treatment_period, data.treated_outcome_all[data.placebo_periods_pre_treatment]*1.2),
xytext=(-160, -4),
xycoords='data',
textcoords='offset points',

arrowprops=dict(arrowstyle="->"))
ax.set_ylabel(self.outcome_var)
ax.set_xlabel(self.time)
ax.set_ylabel(data.outcome_var)
ax.set_xlabel(data.time)
ax.legend()

if idx != n_panels:
Expand Down

0 comments on commit a98f9fd

Please sign in to comment.