In [47]:
%matplotlib inline
from pathlib import Path
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, KNeighborsRegressor
from dmba import regressionSummary, classificationSummary, liftChart, gainsChart
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
import matplotlib.pylab as plt
import numpy as np

In [48]:
# Working directory:
#
# We assume that data are kept in the same directory as the notebook. If you keep your 
# data in a different folder, replace the argument of the `Path`
DATA = Path('C:\\Users\\tanve\\Documents\\206\\dmba\\')
# DATA = Path('C:/Users/profm/OneDrive/teaching/DS110/dmba/')
# and then load data using 
#
# pd.read_csv(DATA / ‘filename.csv’)

### Hair Care Product—Uplift Modeling. This problem uses the data set in Hair-Care- Product.csv, courtesy of SAS. In this hypothetical case, a promotion for a hair care product was sent to some members of a buyers club. Purchases were then recorded for both the members who got the promotion and those who did not.

In [49]:
hair_df = pd.read_csv(DATA / 'Hair-Care-Product.csv')
hair_df.dtypes

Purchase          int64
Age               int64
Hair Color       object
U.S. Region      object
Validation        int64
Promotion_ord     int64
Gender_ord        int64
Residence_ord     int64
dtype: object

#### a. What is the purchase propensity

In [50]:
tbl = pd.crosstab(hair_df["Purchase"], hair_df["Promotion_ord"])
tbl

Promotion_ord,0,1
Purchase,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4992,4896
1,32,80


In [51]:
propTbl = tbl / tbl.sum()
propTbl.round(2)

Promotion_ord,0,1
Purchase,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.99,0.98
1,0.01,0.02


##### i. among those who received the promotion

2% 

##### ii. among those who did not receive the promotion?

1%

#### b. Partition the data into training (60%) and validation (40%) and fit:

In [52]:
hair_df.columns

Index(['Purchase', 'Age', 'Hair Color', 'U.S. Region', 'Validation',
       'Promotion_ord', 'Gender_ord', 'Residence_ord'],
      dtype='object')

##### i. Uplift using a Random Forest.

In [53]:
hair_df['Hair Color'] = hair_df['Hair Color'].astype('category')
hair_df['U.S. Region'] = hair_df['U.S. Region'].astype('category')
#hair_df['Promotion_ord'] = hair_df['Promotion_ord'].astype('category')
hair_df['Gender_ord'] = hair_df['Gender_ord'].astype('category')
hair_df['Residence_ord'] = hair_df['Residence_ord'].astype('category')
#X = pd.get_dummies(hair_df['Purchase'],prefix_sep='_', drop_first=False)
#X = pd.get_dummies(hair_df.drop(columns=['Purchase', 'Hair Color', 'U.S. Region', 'Validation']),prefix_sep='_', drop_first=False)
X = pd.get_dummies(hair_df.drop(columns=['Purchase', 'Validation']),prefix_sep='_', drop_first=False)
y = hair_df['Purchase'] 
train_X, valid_X, train_y, valid_y = train_test_split(X,y, train_size=0.6, random_state=1)

In [54]:
X.columns

Index(['Age', 'Promotion_ord', 'Hair Color_Black', 'Hair Color_Blond',
       'Hair Color_Brown', 'Hair Color_Red', 'U.S. Region_Northeast',
       'U.S. Region_Northwest', 'U.S. Region_Southeast',
       'U.S. Region_Southwest', 'Gender_ord_0', 'Gender_ord_1',
       'Residence_ord_0', 'Residence_ord_1'],
      dtype='object')

In [55]:
rf = RandomForestClassifier(n_estimators=100, random_state=1)
rf.fit(train_X, train_y)

In [56]:
pred = rf.predict(valid_X)
classificationSummary(valid_y, pred)

Confusion Matrix (Accuracy 0.9828)

       Prediction
Actual    0    1
     0 3931   17
     1   52    0


In [57]:
uplift_df = valid_X.copy() # Need to create a copy to allow modifying data
uplift_df.Promotion_ord = 1
predTreatment = rf.predict_proba(uplift_df)
uplift_df.Promotion_ord = 0
predControl = rf.predict_proba(uplift_df)
upliftResult_df = pd.DataFrame({
'probPromote': predTreatment[:,1],
'probNoPromote': predControl[:,1],
'uplift': predTreatment[:,1] - predControl[:,1],
}, index=uplift_df.index)
upliftResult_df = upliftResult_df.sort_values(by=['uplift'], ascending=False)
upliftResult_df.head(3)

Unnamed: 0,probPromote,probNoPromote,uplift
5525,0.67,0.02,0.65
1390,0.686667,0.06,0.626667
1414,0.635833,0.01,0.625833


##### ii. Uplift using k-NN.

In [58]:
hair_knn_df = pd.get_dummies(hair_df.drop(columns=['Validation']),prefix_sep='_', drop_first=False)
hair_knn_df.columns

Index(['Purchase', 'Age', 'Promotion_ord', 'Hair Color_Black',
       'Hair Color_Blond', 'Hair Color_Brown', 'Hair Color_Red',
       'U.S. Region_Northeast', 'U.S. Region_Northwest',
       'U.S. Region_Southeast', 'U.S. Region_Southwest', 'Gender_ord_0',
       'Gender_ord_1', 'Residence_ord_0', 'Residence_ord_1'],
      dtype='object')

In [59]:
train_df, valid_df= train_test_split(hair_knn_df, train_size=0.6, random_state=1)
predictor = X.columns
zpredictor = ['zAge', 'zPromotion_ord', 'zHair Color_Black', 'zHair Color_Blond',
       'zHair Color_Brown', 'zHair Color_Red', 'zU.S. Region_Northeast',
       'zU.S. Region_Northwest', 'zU.S. Region_Southeast',
       'zU.S. Region_Southwest', 'zGender_ord_0', 'zGender_ord_1',
       'zResidence_ord_0', 'zResidence_ord_1']
outcome = 'Purchase'



In [60]:
train_df, valid_df = train_test_split(hair_knn_df, test_size= 0.4, random_state= 1)
kvalues = list(range(1, 21))
accuracy_dict = {}
scaler = preprocessing.StandardScaler()

In [61]:
for k in kvalues:
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    kf = KFold(n_splits= 10, shuffle = True, random_state = 1)
    accuracies = []

    for train_index, valid_index in kf.split(train_df):
        X_train, X_valid = train_df[predictor].iloc[train_index], train_df[predictor].iloc[valid_index]
        y_train, y_valid = train_df[outcome].iloc[train_index], train_df[outcome].iloc[valid_index]
        scaler.fit(X_train)
        X_train_norm = scaler.transform(X_train)
        X_valid_norm = scaler.transform(X_valid)
        knn_classifier.fit(X_train_norm, y_train)
        y_pred = knn_classifier.predict(X_valid_norm)
        accuracy = accuracy_score(y_valid, y_pred)
        accuracies.append(accuracy)
    
    mean_accuracy = np.mean(accuracies)
    accuracy_dict[k] =  mean_accuracy

In [62]:
for k, accuracy in accuracy_dict.items():
    print(f"k = {k}, Accuracy: {accuracy}")

best_k = max(accuracy_dict, key=accuracy_dict.get)
print("best k value:", best_k)

k = 1, Accuracy: 0.9803333333333333
k = 2, Accuracy: 0.9898333333333333
k = 3, Accuracy: 0.9896666666666667
k = 4, Accuracy: 0.9899999999999999
k = 5, Accuracy: 0.9899999999999999
k = 6, Accuracy: 0.9899999999999999
k = 7, Accuracy: 0.9899999999999999
k = 8, Accuracy: 0.9899999999999999
k = 9, Accuracy: 0.9899999999999999
k = 10, Accuracy: 0.9899999999999999
k = 11, Accuracy: 0.9899999999999999
k = 12, Accuracy: 0.9899999999999999
k = 13, Accuracy: 0.9899999999999999
k = 14, Accuracy: 0.9899999999999999
k = 15, Accuracy: 0.9899999999999999
k = 16, Accuracy: 0.9899999999999999
k = 17, Accuracy: 0.9899999999999999
k = 18, Accuracy: 0.9899999999999999
k = 19, Accuracy: 0.9899999999999999
k = 20, Accuracy: 0.9899999999999999
best k value: 4


In [63]:
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(train_X, train_y)
valid_X

Unnamed: 0,Age,Promotion_ord,Hair Color_Black,Hair Color_Blond,Hair Color_Brown,Hair Color_Red,U.S. Region_Northeast,U.S. Region_Northwest,U.S. Region_Southeast,U.S. Region_Southwest,Gender_ord_0,Gender_ord_1,Residence_ord_0,Residence_ord_1
9953,32,1,False,True,False,False,False,False,False,True,False,True,False,True
3850,51,1,True,False,False,False,False,False,True,False,True,False,False,True
4962,29,1,False,True,False,False,True,False,False,False,False,True,False,True
3886,56,1,False,False,True,False,False,True,False,False,False,True,False,True
5437,56,1,False,False,True,False,False,False,True,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9274,58,1,False,False,True,False,False,False,False,True,True,False,True,False
8946,26,1,False,False,True,False,True,False,False,False,False,True,False,True
7637,38,0,False,False,True,False,True,False,False,False,True,False,False,True
6377,49,1,False,False,True,False,False,False,True,False,True,False,False,True


In [64]:
uplift_df = valid_X.copy() # Need to create a copy to allow modifying data
uplift_df.Promotion_ord = 1
predTreatment = knn.predict_proba(uplift_df)
uplift_df.Promotion_ord = 0
predControl = knn.predict_proba(uplift_df)
upliftResult_df = pd.DataFrame({
'probPromote': predTreatment[:,1],
'probNoPromote': predControl[:,1],
'uplift': predTreatment[:,1] - predControl[:,1],
}, index=uplift_df.index)
upliftResult_df = upliftResult_df.sort_values(by=['uplift'], ascending=False)
upliftResult_df.head(3)

Unnamed: 0,probPromote,probNoPromote,uplift
2694,0.5,0.25,0.25
4554,0.25,0.0,0.25
301,0.25,0.0,0.25


#### c. Report the two models’ recommendations for the first three members

The random forrest model shows us an average uplift of about 63% while KNN with n = 4, shows us that the average uplift is 25%. Based on random forrest i would recomend that we send out more promotional material as it has a high return by mosty standards. 
I am a bit more apprehensive about wether or not to recomend the same course of action based on the results of the knn model. I would need more information as to what is the cutoff of uplift for us to recomend sending promotional materials. 