In [None]:
#As always, we import everything
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

### 1. A naive analysis

In [None]:
# Load the dataset; 'lalonde.csv' is read relative to the notebook's working directory.
data = pd.read_csv('lalonde.csv')
# Bare last expression so the notebook renders the frame.
data

In [None]:
# 1978 revenue (re78) of each group as flat 1-D arrays.
# A single column label is used instead of [['re78']]: the double-bracket form yields
# (n, 1)-shaped arrays, which recent Matplotlib versions reject when they are passed
# inside a list to boxplot.
data_non_treatment = data.loc[data.treat == 0, 're78'].values
data_treatment = data.loc[data.treat == 1, 're78'].values

# Group sizes, reused later as denominators for the per-group shares.
non_treatment_count = len(data_non_treatment)
treatment_count = len(data_treatment)

In [None]:
# Distribution of the 1978 revenue for the non-treatment group, binned over [0, 40000].
plt.hist(data_non_treatment, range=[0, 40000])
plt.gca().set(
    title="Histogram of revenue for non-treatment",
    xlabel="Revenue in 1978",
    ylabel="Frequency",
)
plt.show()

In [None]:
# Distribution of the 1978 revenue for the treatment group, binned over [0, 40000].
plt.hist(data_treatment, range=[0, 40000])
plt.gca().set(
    title="Histogram of revenue for treatment",
    xlabel="Revenue in 1978",
    ylabel="Frequency",
)
plt.show()

In [None]:
# Side-by-side box plots of the 1978 revenue for both groups.
# Each sample is flattened first: the arrays may be (n, 1)-shaped column slices, and
# recent Matplotlib versions raise a ValueError for list elements with more than one
# dimension. ravel() is a no-op on arrays that are already 1-D.
plt.boxplot([data_non_treatment.ravel(), data_treatment.ravel()])
plt.xticks([1, 2], ['non-treatment', 'treatment'])
plt.ylabel('Revenue in 1978')
plt.title('Revenue in 1978 by group')
plt.show()

As we can see from those basic histograms and the box plot, the 'treatment' population has a lower income (except for a few outliers).
A naïve researcher could conclude that the treatment is not only ineffective, but also diminishes the potential income of the participants.

### 2. A closer look at the data

Now, let's look at each variable to see if there are differences between the two groups.

In [None]:
# Age of each group as flat 1-D arrays. A single column label is used instead of
# [['age']] because the (n, 1)-shaped arrays produced by the double-bracket form are
# rejected by recent Matplotlib versions when passed inside a list to boxplot.
age_non_treatment = data.loc[data.treat == 0, 'age'].values
age_treatment = data.loc[data.treat == 1, 'age'].values

plt.boxplot([age_non_treatment, age_treatment])
plt.xticks([1, 2], ['non-treatment', 'treatment'])
plt.ylabel('Age')
plt.title('Age by group')
plt.show()

First, we can see that the treated group is generally younger.

In [None]:
# Education ('educ' column) of each group as flat 1-D arrays. A single column label is
# used instead of [['educ']] because the (n, 1)-shaped arrays produced by the
# double-bracket form are rejected by recent Matplotlib versions when passed inside a
# list to boxplot.
educ_non_treatment = data.loc[data.treat == 0, 'educ'].values
educ_treatment = data.loc[data.treat == 1, 'educ'].values

plt.boxplot([educ_non_treatment, educ_treatment])
plt.xticks([1, 2], ['non-treatment', 'treatment'])
plt.ylabel('Education (educ)')
plt.title('Education by group')
plt.show()

For the education, the two groups are very similar, except for a few outliers.

In [None]:
# Head-counts per race within the non-treatment group. Summing a boolean mask counts
# the rows that satisfy it; "white" is whatever is neither flagged black nor hispan.
black_non_treatment = int(((data.treat == 0) & (data.black == 1)).sum())
hispan_non_treatment = int(((data.treat == 0) & (data.hispan == 1)).sum())
white_non_treatment = non_treatment_count - (black_non_treatment + hispan_non_treatment)

# Same head-counts within the treatment group.
black_treatment = int(((data.treat == 1) & (data.black == 1)).sum())
hispan_treatment = int(((data.treat == 1) & (data.hispan == 1)).sum())
white_treatment = treatment_count - (black_treatment + hispan_treatment)

In [None]:
# Grouped bar chart: share of each race within the non-treatment and treatment groups.
width = 1/4
colors = ['#b2182b','#d6604d','#f0b572']
fig, ax = plt.subplots(figsize=(20,7))
pos = list(range(2))

# (legend label, count in non-treatment group, count in treatment group)
race_counts = [
    ('Black', black_non_treatment, black_treatment),
    ('Hispanic', hispan_non_treatment, hispan_treatment),
    ('White', white_non_treatment, white_treatment),
]
for offset, (label, n_non_treatment, n_treatment) in enumerate(race_counts):
    # Shares are multiplied by 100 so the bar heights match the '%' unit announced on
    # the y-axis (the raw ratios lie in [0, 1]).
    shares = [100 * n_non_treatment / non_treatment_count,
              100 * n_treatment / treatment_count]
    ax.bar([p + offset * width for p in pos], shares, width,
           color=colors[offset], label=label)

ax.set_ylabel('Race (%)')
ax.set_title('Race of participants')

# Three bars per group: the middle bar (offset = width) marks the centre of the group.
ax.set_xticks([p + width for p in pos])
ax.set_xticklabels(['Non-treatment', 'Treatment'])

ax.legend(loc='upper left')
ax.grid(axis='y')
plt.show()

We can see here that there is a huge difference in racial composition between the two groups. The majority of the non-treatment group is white, whereas the overwhelming majority of the treatment group is black. This can influence the study, as race in the US is significantly correlated with a person's socio-economic conditions.

In [None]:
# Married / not-married head-counts in the non-treatment group
# (summing a boolean mask counts the rows that satisfy it).
married_non_treatment = int(((data.treat == 0) & (data.married == 1)).sum())
not_married_non_treatment = non_treatment_count - married_non_treatment

# Same head-counts in the treatment group.
married_treatment = int(((data.treat == 1) & (data.married == 1)).sum())
not_married_treatment = treatment_count - married_treatment

In [None]:
# Grouped bar chart: share of married / not-married participants within each group.
width = 1/3
colors = ['#b2182b','#d6604d']
fig, ax = plt.subplots(figsize=(20,7))
pos = list(range(2))

# (legend label, count in non-treatment group, count in treatment group)
marital_counts = [
    ('Married', married_non_treatment, married_treatment),
    ('Not Married', not_married_non_treatment, not_married_treatment),
]
for offset, (label, n_non_treatment, n_treatment) in enumerate(marital_counts):
    # Shares are multiplied by 100 so the bar heights match the '%' unit announced on
    # the y-axis (the raw ratios lie in [0, 1]).
    shares = [100 * n_non_treatment / non_treatment_count,
              100 * n_treatment / treatment_count]
    ax.bar([p + offset * width for p in pos], shares, width,
           color=colors[offset], label=label)

ax.set_ylabel('Married (%)')
ax.set_title('Marital status of participants')

# Two bars per group: the tick sits halfway between them.
ax.set_xticks([p + width/2 for p in pos])
ax.set_xticklabels(['Non-treatment', 'Treatment'])

ax.legend(loc='upper left')
ax.grid(axis='y')
plt.show()

There is a much bigger share of the treated group which is not married, which can partly be explained by the fact that the treated group is younger.

In [None]:
# Degree / no-degree head-counts in the non-treatment group. A participant "has a
# degree" when the nodegree flag is 0 (summing a boolean mask counts matching rows).
degree_non_treatment = int(((data.treat == 0) & (data.nodegree == 0)).sum())
nodegree_non_treatment = non_treatment_count - degree_non_treatment

# Same head-counts in the treatment group.
degree_treatment = int(((data.treat == 1) & (data.nodegree == 0)).sum())
nodegree_treatment = treatment_count - degree_treatment

In [None]:
# Grouped bar chart: share of participants with / without a degree within each group.
width = 1/3
colors = ['#b2182b','#d6604d']
fig, ax = plt.subplots(figsize=(20,7))
pos = list(range(2))

# (legend label, count in non-treatment group, count in treatment group)
degree_counts = [
    ('Degree', degree_non_treatment, degree_treatment),
    ('No Degree', nodegree_non_treatment, nodegree_treatment),
]
for offset, (label, n_non_treatment, n_treatment) in enumerate(degree_counts):
    # Shares are multiplied by 100 so the bar heights match the '%' unit announced on
    # the y-axis (the raw ratios lie in [0, 1]).
    shares = [100 * n_non_treatment / non_treatment_count,
              100 * n_treatment / treatment_count]
    ax.bar([p + offset * width for p in pos], shares, width,
           color=colors[offset], label=label)

ax.set_ylabel('Degree (%)')
ax.set_title('Share of participants who have a degree')

# Two bars per group: the tick sits halfway between them.
ax.set_xticks([p + width/2 for p in pos])
ax.set_xticklabels(['Non-treatment', 'Treatment'])

ax.legend(loc='upper left')
ax.grid(axis='y')
plt.show()

There is a small difference between the two groups: about 10% more people in the treatment group don't have a degree.

In [None]:
# 1974 revenue (re74) of each group as flat 1-D arrays. A single column label is used
# instead of [['re74']] because the (n, 1)-shaped arrays produced by the double-bracket
# form are rejected by recent Matplotlib versions when passed inside a list to boxplot.
re74_non_treatment = data.loc[data.treat == 0, 're74'].values
re74_treatment = data.loc[data.treat == 1, 're74'].values

plt.boxplot([re74_non_treatment, re74_treatment])
plt.xticks([1, 2], ['non-treatment', 'treatment'])
plt.ylabel('Revenue in 1974')
plt.title('Revenue in 1974 by group')
plt.show()

For the revenue in 1974, a large share of the participants seem to have an income of $0, which seems to indicate that there is missing data.

In [None]:
# 1975 revenue (re75) of each group as flat 1-D arrays. A single column label is used
# instead of [['re75']] because the (n, 1)-shaped arrays produced by the double-bracket
# form are rejected by recent Matplotlib versions when passed inside a list to boxplot.
re75_non_treatment = data.loc[data.treat == 0, 're75'].values
re75_treatment = data.loc[data.treat == 1, 're75'].values

plt.boxplot([re75_non_treatment, re75_treatment])
plt.xticks([1, 2], ['non-treatment', 'treatment'])
plt.ylabel('Revenue in 1975')
plt.title('Revenue in 1975 by group')
plt.show()

The results are similar to the ones for 1974.

In [None]:
# scikit-learn's linear_model module provides the logistic-regression estimator.
from sklearn import linear_model
# Unfitted estimator (fitted in a later cell): very tight stopping tolerance (1e-9) and
# at most 100 solver iterations.
# NOTE(review): with unscaled features the solver may stop at max_iter before reaching
# this tolerance — check for a ConvergenceWarning when fitting.
logistic = linear_model.LogisticRegression(max_iter=100, tol=1e-9)

In [None]:
# Target: the treatment indicator.
y = data['treat'].values
# Features: every column except the identifier, the treatment flag and the 1978 outcome.
X = data.drop(columns=['id', 'treat', 're78']).values
X

In [None]:
# Fit the logistic regression of treatment assignment (y) on the feature matrix (X).
logistic.fit(X, y)

In [None]:
# Per-class predicted probabilities for every row of X, one column per class.
# NOTE(review): column 1 is presumably P(treat == 1), assuming treat only takes the
# values 0 and 1 — confirm against logistic.classes_.
probas = logistic.predict_proba(X)

In [None]:
# Column 1 of the predicted probabilities, placed next to the actual treatment flag.
# The probability column keeps the integer name 1, which is why the sort key is [1].
class_1_proba = pd.DataFrame(probas)[1]
score_dfs = pd.concat([class_1_proba, data.treat], axis=1)
# Highest scores first, so rows with similar scores end up next to each other.
score_dfs = score_dfs.sort_values(by=[1], ascending=False)
score_dfs

In [None]:
# Greedy 1:1 matching on the predicted score.
# score_dfs is sorted by score, so neighbouring rows have the closest scores. Walk the
# rows in order and pair a row with its immediate predecessor whenever that predecessor
# is still unmatched and belongs to the other group. Each subject is used at most once.
# NOTE(review): no caliper is applied, so two neighbours are paired however far apart
# their scores are.
matching = []        # (treated_index, control_index) pairs
prev_label = None    # index label of the preceding row while it is still unmatched
prev_treat = None
for label, treat in zip(score_dfs.index, score_dfs['treat']):
    if prev_label is not None and treat != prev_treat:
        # Always store the treated subject first so that each output column holds a
        # single group.
        matching.append((label, prev_label) if treat == 1 else (prev_label, label))
        prev_label = None    # both rows are consumed; the next row starts afresh
    else:
        prev_label, prev_treat = label, treat

treated_ids = [treated for treated, _ in matching]
control_ids = [control for _, control in matching]

# Build the comparison table positionally (.values): concatenating the two
# index-labelled Series with axis=1 would align them on their disjoint indices and
# produce NaN-padded rows instead of one row per matched pair.
pd.DataFrame({
    're78_treated': data.loc[treated_ids, 're78'].values,
    're78_control': data.loc[control_ids, 're78'].values,
})