In [91]:
%matplotlib inline
from pathlib import Path
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, KNeighborsRegressor
from dmba import regressionSummary, classificationSummary, liftChart, gainsChart
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
import matplotlib.pylab as plt
import numpy as np

In [92]:
# Working directory:
#
# We assume that data are kept in the same directory as the notebook. If you keep your 
# data in a different folder, replace the argument of the `Path`
DATA = Path('C:\\Users\\tanve\\Documents\\206\\dmba\\')
# DATA = Path('C:/Users/profm/OneDrive/teaching/DS110/dmba/')
# and then load data using 
#
# pd.read_csv(DATA / ‘filename.csv’)

### Hair Care Product—Uplift Modeling. This problem uses the data set in Hair-Care- Product.csv, courtesy of SAS. In this hypothetical case, a promotion for a hair care product was sent to some members of a buyers club. Purchases were then recorded for both the members who got the promotion and those who did not.

In [93]:
hair_df = pd.read_csv(DATA / 'Hair-Care-Product.csv')
hair_df.dtypes

Purchase          int64
Age               int64
Hair Color       object
U.S. Region      object
Validation        int64
Promotion_ord     int64
Gender_ord        int64
Residence_ord     int64
dtype: object

#### a. What is the purchase propensity

In [94]:
tbl = pd.crosstab(hair_df["Purchase"], hair_df["Promotion_ord"])
tbl

Promotion_ord,0,1
Purchase,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4992,4896
1,32,80


In [95]:
propTbl = tbl / tbl.sum()
propTbl.round(2)

Promotion_ord,0,1
Purchase,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.99,0.98
1,0.01,0.02


##### i. among those who received the promotion

2% 

##### ii. among those who did not receive the promotion?

1%

#### b. Partition the data into training (60%) and validation (40%) and fit:

In [96]:
hair_df.columns

Index(['Purchase', 'Age', 'Hair Color', 'U.S. Region', 'Validation',
       'Promotion_ord', 'Gender_ord', 'Residence_ord'],
      dtype='object')

##### i. Uplift using a Random Forest.

In [97]:
hair_df['Hair Color'] = hair_df['Hair Color'].astype('category')
hair_df['U.S. Region'] = hair_df['U.S. Region'].astype('category')

#X = pd.get_dummies(hair_df['Purchase'],prefix_sep='_', drop_first=False)
#X = pd.get_dummies(hair_df.drop(columns=['Purchase', 'Hair Color', 'U.S. Region', 'Validation']),prefix_sep='_', drop_first=False)
X = pd.get_dummies(hair_df.drop(columns=['Purchase', 'Validation']),prefix_sep='_', drop_first=False)
y = hair_df['Purchase'] 
train_X, valid_X, train_y, valid_y = train_test_split(X,y, train_size=0.6, random_state=1)

In [98]:
X.columns

Index(['Age', 'Promotion_ord', 'Gender_ord', 'Residence_ord',
       'Hair Color_Black', 'Hair Color_Blond', 'Hair Color_Brown',
       'Hair Color_Red', 'U.S. Region_Northeast', 'U.S. Region_Northwest',
       'U.S. Region_Southeast', 'U.S. Region_Southwest'],
      dtype='object')

In [99]:
rf = RandomForestClassifier(n_estimators=100, random_state=1)
rf.fit(train_X, train_y)

In [100]:
uplift_df = valid_X.copy() # Need to create a copy to allow modifying data
uplift_df.Promotion_ord = 1
predTreatment = rf.predict_proba(uplift_df)
uplift_df.Promotion_ord = 0
predControl = rf.predict_proba(uplift_df)
upliftResult_df = pd.DataFrame({
'probPromote': predTreatment[:,1],
'probNoPromote': predControl[:,1],
'uplift': predTreatment[:,1] - predControl[:,1],
}, index=uplift_df.index)
upliftResult_df.head(3)

Unnamed: 0,probPromote,probNoPromote,uplift
9953,0.0,0.0,0.0
3850,0.0,0.0,0.0
4962,0.0,0.0,0.0


##### ii. Uplift using k-NN.

In [101]:
hair_knn_df = pd.get_dummies(hair_df.drop(columns=['Validation']),prefix_sep='_', drop_first=False)
hair_knn_df.columns

Index(['Purchase', 'Age', 'Promotion_ord', 'Gender_ord', 'Residence_ord',
       'Hair Color_Black', 'Hair Color_Blond', 'Hair Color_Brown',
       'Hair Color_Red', 'U.S. Region_Northeast', 'U.S. Region_Northwest',
       'U.S. Region_Southeast', 'U.S. Region_Southwest'],
      dtype='object')

In [102]:
train_df, valid_df= train_test_split(hair_knn_df, train_size=0.6, random_state=1)
predictor = X.columns
zpredictor = ['zAge', 'zPromotion_ord', 'zGender_ord', 'zResidence_ord',
       'zHair Color_Black', 'zHair Color_Blond', 'zHair Color_Brown',
       'zHair Color_Red', 'zU.S. Region_Northeast', 'zU.S. Region_Northwest',
       'zU.S. Region_Southeast', 'zU.S. Region_Southwest']
outcome = 'Purchase'

In [103]:
scaler = preprocessing.StandardScaler()
scaler.fit(train_df[predictor])

In [104]:
# Transform the full dataset
hairNorm = pd.concat([pd.DataFrame(scaler.transform(hair_knn_df[predictor]), 
                                    columns=zpredictor),
                       hair_knn_df[outcome]], axis=1)
hairNorm = hairNorm.dropna()
train_df, valid_df= train_test_split(hairNorm, train_size=0.6, random_state=1)

In [105]:
train_X = train_df[zpredictor]
train_y = train_df[outcome]
valid_X = valid_df[zpredictor]
valid_y = valid_df[outcome]

In [106]:
results = []
for k in range(1, 21):
    knn = KNeighborsRegressor(n_neighbors=k).fit(train_X, train_y)
    results.append({
        'k': k,
        'accuracy': knn.score(valid_X, valid_y)
        #'accuracy': accuracy_score(valid_y, knn.predict(valid_X))
        #just do a three way split and use test partition as th elast one
    })
    # Convert results to a pandas data frame
results = pd.DataFrame(results)
print(results)

     k  accuracy
0    1 -1.045827
1    2 -0.451563
2    3 -0.253475
3    4 -0.189746
4    5 -0.131634
5    6 -0.106803
6    7 -0.082757
7    8 -0.072841
8    9 -0.074029
9   10 -0.064219
10  11 -0.060030
11  12 -0.064182
12  13 -0.053753
13  14 -0.045479
14  15 -0.043307
15  16 -0.041027
16  17 -0.040476
17  18 -0.040172
18  19 -0.040211
19  20 -0.036503


In [107]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(train_X, train_y)
valid_X

Unnamed: 0,zAge,zPromotion_ord,zGender_ord,zResidence_ord,zHair Color_Black,zHair Color_Blond,zHair Color_Brown,zHair Color_Red,zU.S. Region_Northeast,zU.S. Region_Northwest,zU.S. Region_Southeast,zU.S. Region_Southwest
9953,-0.717832,0.998002,1.465590,0.640654,-0.471164,1.281409,-0.777353,-0.259665,-0.586845,-0.577863,-0.563492,1.720580
3850,0.918792,0.998002,-0.682319,0.640654,2.122401,-0.780391,-0.777353,-0.259665,-0.586845,-0.577863,1.774649,-0.581199
4962,-0.976246,0.998002,1.465590,0.640654,-0.471164,1.281409,-0.777353,-0.259665,1.704027,-0.577863,-0.563492,-0.581199
3886,1.349483,0.998002,1.465590,0.640654,-0.471164,-0.780391,1.286416,-0.259665,-0.586845,1.730513,-0.563492,-0.581199
5437,1.349483,0.998002,-0.682319,-1.560906,-0.471164,-0.780391,1.286416,-0.259665,-0.586845,-0.577863,1.774649,-0.581199
...,...,...,...,...,...,...,...,...,...,...,...,...
9274,1.521759,0.998002,-0.682319,-1.560906,-0.471164,-0.780391,1.286416,-0.259665,-0.586845,-0.577863,-0.563492,1.720580
8946,-1.234661,0.998002,1.465590,0.640654,-0.471164,-0.780391,1.286416,-0.259665,1.704027,-0.577863,-0.563492,-0.581199
7637,-0.201003,-1.002002,-0.682319,0.640654,-0.471164,-0.780391,1.286416,-0.259665,1.704027,-0.577863,-0.563492,-0.581199
6377,0.746516,0.998002,-0.682319,0.640654,-0.471164,-0.780391,1.286416,-0.259665,-0.586845,-0.577863,1.774649,-0.581199


In [108]:
uplift_df = valid_X.copy() # Need to create a copy to allow modifying data
uplift_df.zPromotion_ord = 1
predTreatment = knn.predict_proba(uplift_df)
uplift_df.zPromotion_ord = 0
predControl = knn.predict_proba(uplift_df)
upliftResult_df = pd.DataFrame({
'probPromote': predTreatment[:,1],
'probNoPromote': predControl[:,1],
'uplift': predTreatment[:,1] - predControl[:,1],
}, index=uplift_df.index)
upliftResult_df.head(3)

Unnamed: 0,probPromote,probNoPromote,uplift
9953,0.0,0.0,0.0
3850,0.0,0.0,0.0
4962,0.0,0.0,0.0


#### c. Report the two models’ recommendations for the first three members

Both models report that there is a very small level of uplift with a promotion. The first three memembers showed no uplift, but that might be due to the model them selves not going into enough detail. That beign said a direct examination of the number of records shows that there was only 1% of uplift between those who received a promotion and those who did not. 
Given the low amount of return on the promotion, the models lead me to stop giving out promotions. 