# Pystata example:
### Using Python to call Stata to run regression(s) and report results

In [1]:
## 1. generate random sample of data
import numpy as np
import pandas as pd

# Seed the global NumPy generator so that a fresh "Restart & Run All"
# regenerates exactly the same samples (and hence the same regression tables).
SEED = 0
np.random.seed(SEED)

# Dataset 1: one dependent variable with four independent variables,
# all drawn uniformly from [0, 1)
data1 = pd.DataFrame(np.random.rand(1000,5), columns = ['y','x1','x2','x3','x4'])
# some potential fixed effects or clusters (integer group ids)
data1['fx1'] = np.random.randint(10, size=1000)
data1['fx2'] = np.random.randint(5, size=1000)
data1['fx3'] = np.random.randint(3, size=1000)

# Dataset 2: a larger sample with the same column layout
data2 = pd.DataFrame(np.random.rand(10000,5), columns = ['y','x1','x2','x3','x4'])
# add some modifications so the regressors differ in scale from Dataset 1
data2['x1'] = data2['x1']*15
data2['x2'] = data2['x2'] + 1
# The original expression was `np.log(data2['x3']** + 1)`, which Python parses as
# `x3 ** (+1)`, i.e. x3 itself. The effective transform is therefore log(x3)
# (non-positive values), which is written explicitly here.
# NOTE(review): if log(x3 + 1) was intended, use np.log1p(data2['x3']) instead.
data2['x3'] = np.log(data2['x3'])


# some potential fixed effects or clusters (integer group ids)
data2['fx1'] = np.random.randint(20, size=10000)
data2['fx2'] = np.random.randint(15, size=10000)
data2['fx3'] = np.random.randint(13, size=10000)


### Dataset 1

In [2]:
# Preview the first rows of the 1,000-row sample built above
data1.head()

Unnamed: 0,y,x1,x2,x3,x4,fx1,fx2,fx3
0,0.313594,0.30376,0.841619,0.576463,0.409276,4,3,0
1,0.02398,0.254685,0.36305,0.401601,0.092643,9,3,1
2,0.178209,0.773265,0.336436,0.125324,0.994106,2,0,2
3,0.774804,0.223002,0.983858,0.549169,0.571693,8,4,1
4,0.760475,0.508586,0.0134,0.72027,0.364342,3,4,2


### Dataset 2

In [3]:
# Preview the first rows of the 10,000-row sample (x1, x2 and x3 were transformed above)
data2.head()

Unnamed: 0,y,x1,x2,x3,x4,fx1,fx2,fx3
0,0.268163,2.15008,1.790252,-0.716516,0.889858,8,14,12
1,0.854698,5.406692,1.361118,-1.034033,0.785388,18,10,6
2,0.025087,12.510336,1.1562,-2.269767,0.894517,16,6,8
3,0.161995,5.444088,1.02035,-0.424356,0.698452,12,9,12
4,0.542729,5.060802,1.333644,-2.600836,0.059506,15,0,9


In [4]:
from src.pystata import summary_col

# Arbitrary combinations of fixed effects for the demo, written as {label: column name}
fx_1 = {
    'Stock fixed effects': 'fx1',
    'Year fixed effects': 'fx2',
}
fx_2 = {
    'Stock fixed effects': 'fx1',
    'Industry Fixed effects': 'fx3',
}
# stock + year (as in fx_1) plus industry, in that key order
fx_3 = {**fx_1, 'Industry Fixed effects': 'fx3'}

# One row per regression, laid out as
#   [data, regression specification, covariance type (or a list of cluster columns), fixed effects]
reg_inputs = [
    [data1, 'y  ~ 1   + x1+ x2',           'robust',       fx_1],
    [data1, 'y  ~ 1   + x1+ x2 ',          'robust',       fx_1],
    [data2, 'y  ~ 1   + x1+ x2 + x3 + x4', 'robust',       fx_2],
    [data2, 'y  ~ 1   + x1+ x2 + x3 ',     'robust',       fx_2],
    [data1, 'y  ~ 1   + x1+ x2 + x4',      ['fx1', 'fx2'], fx_2],
    [data2, 'y  ~ 1   + x1+ x2 + x3 + x4', ['fx1', 'fx2'], fx_3],
]

In [5]:
# Build the regression table from reg_inputs and have Stata estimate it.
# NOTE(review): outputDir is a machine-specific absolute path — point it at a
# directory that exists on your machine before running.
outputDir = '/home/shinc/Desktop/example/test/' # directory where Stata output (log and results) is saved
table = summary_col(reg_inputs) # read regression specification
table.set_dir(outputDir) # set the directory to save Stata output (log and results)
table.name = 'table_pystata' # set the name of the table (also used for the generated file names, see the tree listing below)
table.modelname = [ "Y1", "Y1","Var","Variable","Model name","Y",] # column names, one per regression in reg_inputs (six here)
table.order = ['x1', 'x2' , 'x3', 'x4'] # order in which the independent variables are listed
table._main_() # transfer data from Python to Stata and write the Stata do file accordingly
table.run_do() # run the Stata do file

### Output 
1. the temporary data in the input folder is wiped out (default option)
2. the log folder contains the Stata log
3. the output folder contains (a) the Stata do script and (b) the regression output (default format is tex)

In [6]:
! tree /home/shinc/Desktop/example/test/

[01;34m/home/shinc/Desktop/example/test/[00m
├── [01;34minput[00m
├── [01;34mlog[00m
│   └── table_pystata.log
└── [01;34moutput[00m
    ├── table_pystata.do
    └── table_pystata.tex

3 directories, 3 files


### The default output format is tex; however, it is hard to show a LaTeX table in Jupyter. Here, I provide a customized print function

In [7]:
# Display the regression table inline using the package's customized print function
table.print()

Unnamed: 0,Var,Y1 (1),Y1 (2),Var (3),Variable (4),Model name (5),Y (6)
0,x1,0.0380,0.0380,-0.000306,-0.000307,0.0380,-0.000275
1,,(1.20),(1.20),(-0.46),(-0.46),(1.05),(-0.40)
2,x2,-0.0302,-0.0302,0.00399,0.00407,-0.0286,0.00395
3,,(-0.96),(-0.96),(0.40),(0.41),(-0.92),(0.42)
4,x3,,,0.00230,0.00232,,0.00221
5,,,,(0.79),(0.79),,(0.76)
6,x4,,,-0.00782,,-0.00373,-0.00767
7,,,,(-0.79),,(-0.13),(-0.78)
8,---,---,---,---,---,---,---
0,Stock fixed effects,Yes,Yes,Yes,Yes,Yes,Yes


### Or simply save as html format (easy to print out in jupyter)

In [8]:
# Same table specification as before, but saved to a different directory and
# exported as HTML instead of the default tex.
# NOTE(review): this rebinds `outputDir` and `table`; later cells see the HTML version.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
outputDir = '/home/shinc/Desktop/example/data/'
table = summary_col(reg_inputs)
table.set_dir(outputDir)
table.name = 'table_pystata'
table.modelname = [ "Y1", "Y1","Var","Variable","Model name","Y",]
table.order = ['x1', 'x2' , 'x3', 'x4']
table.outtype = 'html' # switch the output format from the default (tex) to html
table._main_()
table.run_do()

In [9]:
# Display the HTML version of the table
# (presumably read back from the file written by run_do() — confirm in src.pystata)
table._readhtml_()

0,1,2,3,4,5,6
,,,,,,
,(1),(2),(3),(4),(5),(6)
,Y1,Y1,Var,Variable,Model name,Y
,,,,,,
x1,0.0380,0.0380,-0.000306,-0.000307,0.0380,-0.000275
,(1.20),(1.20),(-0.46),(-0.46),(1.05),(-0.40)
,,,,,,
x2,-0.0302,-0.0302,0.00399,0.00407,-0.0286,0.00395
,(-0.96),(-0.96),(0.40),(0.41),(-0.92),(0.42)
,,,,,,
