In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

%matplotlib inline 

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.metrics import RocCurveDisplay

In [104]:
train_df = pd.read_csv('data/Project2Data.csv',low_memory = False)

In [105]:
train_df.shape

(1780140, 17)

In [106]:
train_df.head()

Unnamed: 0,SupporterID,Age_Bucket,Gender,State,PostCode,Have_Phone,Have_Email,Gift_ID,GiftDate,IsEmergencyGift,Is_First_Gift,ProductType_Group,GiftSolicitationChannel,CampaignSubtype_Group,AppealSeason,GiftAmount,ConvertedTo_RG_Within_6M
0,A-990430482,Unknown,,NSW,2166.0,No,No,0062u00000AFwB5AAL,2020-11-04,,Is_First_Gift,Cash - One off,Web,Appeal,Non Seasonal,56.0,0
1,A-990434539,Unknown,,NSW,2027.0,No,No,0062u00000AZ70jAAD,2020-12-03,,Is_First_Gift,Cash - One off,DM,Appeal,Christmas,140.0,0
2,A-990435870,Unknown,,,,No,No,0062u00000Aa42AAAR,2020-12-16,,Is_First_Gift,Cash - Inspired Gifts,DM,Appeal,Non Seasonal,70.0,0
3,A-990435901,Unknown,,,,No,No,0062u00000Aa997AAB,2020-12-17,,Is_First_Gift,Cash - One off,General,Appeal,Non Seasonal,35.0,0
4,C-990018085,Unknown,,VIC,3144.0,No,No,0062u000009YQXHAA4,2015-05-11,Yes,Is_First_Gift,Cash - One off,Inserts,Appeal,Non Seasonal,175.0,0


In [107]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1780140 entries, 0 to 1780139
Data columns (total 17 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   SupporterID               object 
 1   Age_Bucket                object 
 2   Gender                    object 
 3   State                     object 
 4   PostCode                  float64
 5   Have_Phone                object 
 6   Have_Email                object 
 7   Gift_ID                   object 
 8   GiftDate                  object 
 9   IsEmergencyGift           object 
 10  Is_First_Gift             object 
 11  ProductType_Group         object 
 12  GiftSolicitationChannel   object 
 13  CampaignSubtype_Group     object 
 14  AppealSeason              object 
 15  GiftAmount                float64
 16  ConvertedTo_RG_Within_6M  int64  
dtypes: float64(2), int64(1), object(14)
memory usage: 230.9+ MB


In [108]:
train_df.isna().sum()

SupporterID                       0
Age_Bucket                        0
Gender                       473180
State                         84891
PostCode                     104015
Have_Phone                        0
Have_Email                        0
Gift_ID                           0
GiftDate                          0
IsEmergencyGift             1630762
Is_First_Gift                     0
ProductType_Group                 0
GiftSolicitationChannel       21053
CampaignSubtype_Group             0
AppealSeason                      0
GiftAmount                        0
ConvertedTo_RG_Within_6M          0
dtype: int64

In [109]:
train_df.columns

Index(['SupporterID', 'Age_Bucket', 'Gender', 'State', 'PostCode',
       'Have_Phone', 'Have_Email', 'Gift_ID', 'GiftDate', 'IsEmergencyGift',
       'Is_First_Gift', 'ProductType_Group', 'GiftSolicitationChannel',
       'CampaignSubtype_Group', 'AppealSeason', 'GiftAmount',
       'ConvertedTo_RG_Within_6M'],
      dtype='object')

In [110]:
train_df.Age_Bucket.value_counts()

Age_Bucket
Unknown    896559
31-40      243238
41-50      193695
51-60      144225
71+        130145
61-70      113465
19 - 30     58813
Name: count, dtype: int64

In [111]:
train_df.nlargest(10, 'GiftAmount') 

Unnamed: 0,SupporterID,Age_Bucket,Gender,State,PostCode,Have_Phone,Have_Email,Gift_ID,GiftDate,IsEmergencyGift,Is_First_Gift,ProductType_Group,GiftSolicitationChannel,CampaignSubtype_Group,AppealSeason,GiftAmount,ConvertedTo_RG_Within_6M
1558484,C-990438993,Unknown,,VIC,3175.0,No,Yes,0062u00000CcDtRAAV,2021-06-08,,Not_First_Gift,Cash - One off,,Appeal,Tax,52500.0,0
1558485,C-990438993,Unknown,,VIC,3175.0,No,Yes,0062u00000Cb4a8AAB,2021-06-08,,Not_First_Gift,Cash - One off,General,Appeal,Tax,52500.0,0
249432,C-990281901,Unknown,Female,QLD,4122.0,Yes,Yes,0062u000009T6FkAAK,2019-06-30,,Not_First_Gift,Cash - One off,Web,Appeal,Tax,28000.0,0
1570719,C-990441507,Unknown,,,,No,Yes,0062u00000C287QAAR,2021-03-17,,Is_First_Gift,Cash - One off,General,Appeal,Non Seasonal,17500.0,0
249433,C-990281901,Unknown,Female,QLD,4122.0,Yes,Yes,0062u000009T6FmAAK,2019-06-30,,Not_First_Gift,Cash - One off,Web,Appeal,Tax,14000.0,0
249434,C-990281901,Unknown,Female,QLD,4122.0,Yes,Yes,0062u000009T6FlAAK,2019-06-30,,Not_First_Gift,Cash - One off,Web,Appeal,Tax,14000.0,0
249435,C-990281901,Unknown,Female,QLD,4122.0,Yes,Yes,0062u000009T6FnAAK,2019-06-30,,Not_First_Gift,Cash - One off,Web,Appeal,Tax,14000.0,0
1313778,C-990393107,Unknown,Female,NSW,2047.0,Yes,Yes,0062u00000DCxSWAA1,2021-10-14,,Not_First_Gift,Cash - One off,DM,Appeal,Autumn,14000.0,0
1604277,C-990454390,Unknown,,QLD,4061.0,Yes,Yes,0062u00000ENmTnAAL,2022-01-31,Yes,Not_First_Gift,Cash - One off,,Appeal,Non Seasonal,14000.0,0
15013,C-990244714,Unknown,Female,NSW,2067.0,No,Yes,0062u0000092fGmAAI,2014-06-05,,Is_First_Gift,Cash - One off,Web,Appeal,Non Seasonal,11200.0,0


In [112]:
train_df.IsEmergencyGift.value_counts()

IsEmergencyGift
Yes    149378
Name: count, dtype: int64

# Data Processing

In [113]:
df = pd.read_csv(filepath_or_buffer="data/Project2Data.csv",
                 low_memory=False,  
                 parse_dates=["GiftDate"]) 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1780140 entries, 0 to 1780139
Data columns (total 17 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   SupporterID               object        
 1   Age_Bucket                object        
 2   Gender                    object        
 3   State                     object        
 4   PostCode                  float64       
 5   Have_Phone                object        
 6   Have_Email                object        
 7   Gift_ID                   object        
 8   GiftDate                  datetime64[ns]
 9   IsEmergencyGift           object        
 10  Is_First_Gift             object        
 11  ProductType_Group         object        
 12  GiftSolicitationChannel   object        
 13  CampaignSubtype_Group     object        
 14  AppealSeason              object        
 15  GiftAmount                float64       
 16  ConvertedTo_RG_Within_6M  int64         
dtypes: datet

In [114]:
dup_vals = df["SupporterID"].value_counts()[lambda s: s > 1]
dup_vals

SupporterID
C-990323876    558
C-990325291    469
C-990521347    463
C-990416858    346
C-990285240    325
              ... 
C-990512098      2
C-990346064      2
C-990471567      2
C-990512090      2
C-990346363      2
Name: count, Length: 87367, dtype: int64

In [115]:
col = 'Age_Bucket'
idx_unk = train_df.index[df[col] == 'Unknown']
counts  = train_df.loc[df[col] != 'Unknown', col].value_counts()
cats    = counts.index.to_list()
probs   = (counts / counts.sum()).reindex(cats).values

rng = np.random.default_rng(42)  
choices = rng.choice(cats, size=len(idx_unk), p=probs)
train_df.loc[idx_unk, col] = choices
train_df = train_df.loc[train_df[col].notna() & (train_df[col] != 'Unknown')].copy()

In [116]:
train_df.Age_Bucket.value_counts()

Age_Bucket
31-40      489902
41-50      390372
51-60      290811
71+        261985
61-70      228778
19 - 30    118292
Name: count, dtype: int64

In [117]:
train_df.State.value_counts()

State
NSW        582239
VIC        404205
WA         232220
QLD        214292
ACT        160259
            ...  
18              1
9               1
london          1
DU              1
Gujarat         1
Name: count, Length: 229, dtype: int64

In [118]:
train_df['State'] = train_df['State'].replace({'Nsw':'NSW'})
train_df['State'] = train_df['State'].replace({'Qld':'QLD'})
train_df['State'] = train_df['State'].replace({'Vic':'VIC'})
train_df['State'] = train_df['State'].replace({'Tas':'TAS'})

In [119]:
top9_states = train_df["State"].value_counts().head(9).index
ct_top9 = pd.crosstab(train_df.loc[train_df["State"].isin(top9_states), "State"],
                      train_df["ConvertedTo_RG_Within_6M"])
# 按出现频次顺序展示：
states_top9 = ct_top9.reindex(top9_states)
states_top9

ConvertedTo_RG_Within_6M,0,1
State,Unnamed: 1_level_1,Unnamed: 2_level_1
NSW,531872,50371
VIC,360917,43298
WA,218530,13690
QLD,195359,18939
ACT,156024,4235
SA,45941,7105
TAS,25260,2392
NT,11309,681
INT,6920,358


In [120]:
top9 = df["State"].value_counts().nlargest(9).index
# 生成新列：前9保留，其他归为 Other
train_df["State"] = train_df["State"].where(train_df["State"].isin(top9), "Other")

# （可选）固定显示顺序：前9 + Other
order = list(top9) + ["Other"]
train_df["State"] = pd.Categorical(train_df["State"], categories=order, ordered=True)

In [121]:
train_df.State.value_counts()

State
NSW      582243
VIC      404215
WA       232220
QLD      214298
ACT      160259
Other     86939
SA        53046
TAS       27652
NT        11990
INT        7278
Name: count, dtype: int64

In [122]:
train_df.head()

Unnamed: 0,SupporterID,Age_Bucket,Gender,State,PostCode,Have_Phone,Have_Email,Gift_ID,GiftDate,IsEmergencyGift,Is_First_Gift,ProductType_Group,GiftSolicitationChannel,CampaignSubtype_Group,AppealSeason,GiftAmount,ConvertedTo_RG_Within_6M
0,A-990430482,71+,,NSW,2166.0,No,No,0062u00000AFwB5AAL,2020-11-04,,Is_First_Gift,Cash - One off,Web,Appeal,Non Seasonal,56.0,0
1,A-990434539,41-50,,NSW,2027.0,No,No,0062u00000AZ70jAAD,2020-12-03,,Is_First_Gift,Cash - One off,DM,Appeal,Christmas,140.0,0
2,A-990435870,61-70,,Other,,No,No,0062u00000Aa42AAAR,2020-12-16,,Is_First_Gift,Cash - Inspired Gifts,DM,Appeal,Non Seasonal,70.0,0
3,A-990435901,71+,,Other,,No,No,0062u00000Aa997AAB,2020-12-17,,Is_First_Gift,Cash - One off,General,Appeal,Non Seasonal,35.0,0
4,C-990018085,31-40,,VIC,3144.0,No,No,0062u000009YQXHAA4,2015-05-11,Yes,Is_First_Gift,Cash - One off,Inserts,Appeal,Non Seasonal,175.0,0


In [123]:
train_df.isna().sum()

SupporterID                       0
Age_Bucket                        0
Gender                       473180
State                             0
PostCode                     104015
Have_Phone                        0
Have_Email                        0
Gift_ID                           0
GiftDate                          0
IsEmergencyGift             1630762
Is_First_Gift                     0
ProductType_Group                 0
GiftSolicitationChannel       21053
CampaignSubtype_Group             0
AppealSeason                      0
GiftAmount                        0
ConvertedTo_RG_Within_6M          0
dtype: int64

In [124]:
train_df["PostCode"] = train_df["PostCode"].fillna("").astype(str).str.split('.').str[0]

In [125]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1780140 entries, 0 to 1780139
Data columns (total 17 columns):
 #   Column                    Dtype   
---  ------                    -----   
 0   SupporterID               object  
 1   Age_Bucket                object  
 2   Gender                    object  
 3   State                     category
 4   PostCode                  object  
 5   Have_Phone                object  
 6   Have_Email                object  
 7   Gift_ID                   object  
 8   GiftDate                  object  
 9   IsEmergencyGift           object  
 10  Is_First_Gift             object  
 11  ProductType_Group         object  
 12  GiftSolicitationChannel   object  
 13  CampaignSubtype_Group     object  
 14  AppealSeason              object  
 15  GiftAmount                float64 
 16  ConvertedTo_RG_Within_6M  int64   
dtypes: category(1), float64(1), int64(1), object(14)
memory usage: 297.1+ MB


Mosaic Data

In [126]:
postcode_df = pd.read_csv('data/PostcodeData.csv')

In [127]:
postcode_df.head()

Unnamed: 0,POSTCODE,TOTAL_HOUSEHOLD_COUNT,STATE,DOMINANT_MOSAIC_GROUP,DOMINANT_MOSAIC_TYPE,MOSAIC_GROUP_A,MOSAIC_GROUP_B,MOSAIC_GROUP_C,MOSAIC_GROUP_D,MOSAIC_GROUP_E,...,MOSAIC_TYPE_L43,MOSAIC_TYPE_L44,MOSAIC_TYPE_M45,MOSAIC_TYPE_M46,MOSAIC_TYPE_M47,MOSAIC_TYPE_M48,MOSAIC_TYPE_N49,MOSAIC_TYPE_N50,MOSAIC_TYPE_N51,MOSAIC_TYPE_N52
0,800,4179,NT,G,G22,139,1659,0,0,89,...,0,0,0,0,0,0,0,0,0,66
1,810,13511,NT,E,F17,370,599,1749,114,3165,...,0,10,190,41,4,2,0,246,101,111
2,812,6979,NT,F,H26,109,9,421,38,414,...,0,71,154,120,0,0,4,133,46,2
3,820,9665,NT,B,G22,702,2269,1040,204,1283,...,0,0,49,3,0,9,25,102,112,78
4,822,3642,NT,J,J35,52,0,0,80,48,...,574,135,69,7,49,76,81,78,0,0


In [128]:
postcode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2607 entries, 0 to 2606
Data columns (total 71 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   POSTCODE               2607 non-null   int64 
 1   TOTAL_HOUSEHOLD_COUNT  2607 non-null   int64 
 2   STATE                  2607 non-null   object
 3   DOMINANT_MOSAIC_GROUP  2607 non-null   object
 4   DOMINANT_MOSAIC_TYPE   2607 non-null   object
 5   MOSAIC_GROUP_A         2607 non-null   int64 
 6   MOSAIC_GROUP_B         2607 non-null   int64 
 7   MOSAIC_GROUP_C         2607 non-null   int64 
 8   MOSAIC_GROUP_D         2607 non-null   int64 
 9   MOSAIC_GROUP_E         2607 non-null   int64 
 10  MOSAIC_GROUP_F         2607 non-null   int64 
 11  MOSAIC_GROUP_G         2607 non-null   int64 
 12  MOSAIC_GROUP_H         2607 non-null   int64 
 13  MOSAIC_GROUP_I         2607 non-null   int64 
 14  MOSAIC_GROUP_J         2607 non-null   int64 
 15  MOSAIC_GROUP_K       

In [129]:
train_df["PostCode"] = train_df["PostCode"].astype(str)

In [130]:
postcode_df["PostCode"] = postcode_df["POSTCODE"].astype(str)

In [131]:
train_df_merged = train_df.merge(postcode_df,on="PostCode", how="left")

In [132]:
train_df_merged

Unnamed: 0,SupporterID,Age_Bucket,Gender,State,PostCode,Have_Phone,Have_Email,Gift_ID,GiftDate,IsEmergencyGift,...,MOSAIC_TYPE_L43,MOSAIC_TYPE_L44,MOSAIC_TYPE_M45,MOSAIC_TYPE_M46,MOSAIC_TYPE_M47,MOSAIC_TYPE_M48,MOSAIC_TYPE_N49,MOSAIC_TYPE_N50,MOSAIC_TYPE_N51,MOSAIC_TYPE_N52
0,A-990430482,71+,,NSW,2166,No,No,0062u00000AFwB5AAL,2020-11-04,,...,0.0,0.0,0.0,1.0,0.0,0.0,53.0,0.0,48.0,548.0
1,A-990434539,41-50,,NSW,2027,No,No,0062u00000AZ70jAAD,2020-12-03,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A-990435870,61-70,,Other,,No,No,0062u00000Aa42AAAR,2020-12-16,,...,,,,,,,,,,
3,A-990435901,71+,,Other,,No,No,0062u00000Aa997AAB,2020-12-17,,...,,,,,,,,,,
4,C-990018085,31-40,,VIC,3144,No,No,0062u000009YQXHAA4,2015-05-11,Yes,...,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,126.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1780135,N_990621275,71+,,VIC,3008,Yes,Yes,0062u00000AEnGoAAL,2020-10-16,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1780136,N_990621277,41-50,,VIC,3174,Yes,Yes,0062u00000ADfutAAD,2020-10-03,,...,0.0,0.0,85.0,0.0,0.0,14.0,48.0,2.0,36.0,370.0
1780137,N_990621277,41-50,,VIC,3174,Yes,Yes,0062u00000DCmPSAA1,2021-10-05,,...,0.0,0.0,85.0,0.0,0.0,14.0,48.0,2.0,36.0,370.0
1780138,N_990621277,31-40,,VIC,3174,Yes,Yes,0068v000002X5VjAAK,2023-01-16,Yes,...,0.0,0.0,85.0,0.0,0.0,14.0,48.0,2.0,36.0,370.0


In [133]:
unicef_df = train_df

In [134]:
unicef_df['PostCode'] = unicef_df['PostCode'].astype(str)

In [135]:
postcode_df['POSTCODE'] = postcode_df['POSTCODE'].astype(str)

In [136]:
mosaic_cols = [c for c in postcode_df.columns if c.startswith("MOSAIC_GROUP")]
postcode_df[mosaic_cols] = postcode_df[mosaic_cols].apply(pd.to_numeric, errors='coerce')
postcode_df["MOSAIC_GROUP"] = postcode_df[mosaic_cols].idxmax(axis=1)

In [137]:
mosaic_cols = [c for c in postcode_df.columns if c.startswith("MOSAIC_TYPE")]
postcode_df[mosaic_cols] = postcode_df[mosaic_cols].apply(pd.to_numeric, errors='coerce')
postcode_df["MOSAIC_TYPE"] = postcode_df[mosaic_cols].idxmax(axis=1)

In [138]:
postcode_mosaic = postcode_df[["POSTCODE", "MOSAIC_TYPE", "MOSAIC_GROUP"]]

In [139]:
unicef_df = unicef_df.merge(postcode_mosaic,
                            left_on="PostCode", 
                            right_on="POSTCODE", 
                            how="left")

In [140]:
unicef_df["MOSAIC_GROUP"].value_counts()

MOSAIC_GROUP
MOSAIC_GROUP_A    238443
MOSAIC_GROUP_B    228428
MOSAIC_GROUP_G    177086
MOSAIC_GROUP_C    174762
MOSAIC_GROUP_F    152905
MOSAIC_GROUP_E    129230
MOSAIC_GROUP_K    102880
MOSAIC_GROUP_J     99176
MOSAIC_GROUP_M     97746
MOSAIC_GROUP_H     81298
MOSAIC_GROUP_I     81239
MOSAIC_GROUP_D     53727
MOSAIC_GROUP_L     45937
MOSAIC_GROUP_N      2515
Name: count, dtype: int64

In [141]:
unicef_df.head()

Unnamed: 0,SupporterID,Age_Bucket,Gender,State,PostCode,Have_Phone,Have_Email,Gift_ID,GiftDate,IsEmergencyGift,Is_First_Gift,ProductType_Group,GiftSolicitationChannel,CampaignSubtype_Group,AppealSeason,GiftAmount,ConvertedTo_RG_Within_6M,POSTCODE,MOSAIC_TYPE,MOSAIC_GROUP
0,A-990430482,71+,,NSW,2166.0,No,No,0062u00000AFwB5AAL,2020-11-04,,Is_First_Gift,Cash - One off,Web,Appeal,Non Seasonal,56.0,0,2166.0,MOSAIC_TYPE_F20,MOSAIC_GROUP_F
1,A-990434539,41-50,,NSW,2027.0,No,No,0062u00000AZ70jAAD,2020-12-03,,Is_First_Gift,Cash - One off,DM,Appeal,Christmas,140.0,0,2027.0,MOSAIC_TYPE_B06,MOSAIC_GROUP_B
2,A-990435870,61-70,,Other,,No,No,0062u00000Aa42AAAR,2020-12-16,,Is_First_Gift,Cash - Inspired Gifts,DM,Appeal,Non Seasonal,70.0,0,,,
3,A-990435901,71+,,Other,,No,No,0062u00000Aa997AAB,2020-12-17,,Is_First_Gift,Cash - One off,General,Appeal,Non Seasonal,35.0,0,,,
4,C-990018085,31-40,,VIC,3144.0,No,No,0062u000009YQXHAA4,2015-05-11,Yes,Is_First_Gift,Cash - One off,Inserts,Appeal,Non Seasonal,175.0,0,3144.0,MOSAIC_TYPE_A01,MOSAIC_GROUP_A


In [142]:
train_df = unicef_df
train_df

Unnamed: 0,SupporterID,Age_Bucket,Gender,State,PostCode,Have_Phone,Have_Email,Gift_ID,GiftDate,IsEmergencyGift,Is_First_Gift,ProductType_Group,GiftSolicitationChannel,CampaignSubtype_Group,AppealSeason,GiftAmount,ConvertedTo_RG_Within_6M,POSTCODE,MOSAIC_TYPE,MOSAIC_GROUP
0,A-990430482,71+,,NSW,2166,No,No,0062u00000AFwB5AAL,2020-11-04,,Is_First_Gift,Cash - One off,Web,Appeal,Non Seasonal,56.0,0,2166,MOSAIC_TYPE_F20,MOSAIC_GROUP_F
1,A-990434539,41-50,,NSW,2027,No,No,0062u00000AZ70jAAD,2020-12-03,,Is_First_Gift,Cash - One off,DM,Appeal,Christmas,140.0,0,2027,MOSAIC_TYPE_B06,MOSAIC_GROUP_B
2,A-990435870,61-70,,Other,,No,No,0062u00000Aa42AAAR,2020-12-16,,Is_First_Gift,Cash - Inspired Gifts,DM,Appeal,Non Seasonal,70.0,0,,,
3,A-990435901,71+,,Other,,No,No,0062u00000Aa997AAB,2020-12-17,,Is_First_Gift,Cash - One off,General,Appeal,Non Seasonal,35.0,0,,,
4,C-990018085,31-40,,VIC,3144,No,No,0062u000009YQXHAA4,2015-05-11,Yes,Is_First_Gift,Cash - One off,Inserts,Appeal,Non Seasonal,175.0,0,3144,MOSAIC_TYPE_A01,MOSAIC_GROUP_A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1780135,N_990621275,71+,,VIC,3008,Yes,Yes,0062u00000AEnGoAAL,2020-10-16,,Is_First_Gift,Community Fundraising,,Community Fundraising,Non Seasonal,70.0,0,3008,MOSAIC_TYPE_B07,MOSAIC_GROUP_G
1780136,N_990621277,41-50,,VIC,3174,Yes,Yes,0062u00000ADfutAAD,2020-10-03,,Is_First_Gift,Cash - One off,Web,Appeal,Non Seasonal,7.0,0,3174,MOSAIC_TYPE_H27,MOSAIC_GROUP_H
1780137,N_990621277,41-50,,VIC,3174,Yes,Yes,0062u00000DCmPSAA1,2021-10-05,,Not_First_Gift,Cash - One off,Web,Appeal,Non Seasonal,7.0,0,3174,MOSAIC_TYPE_H27,MOSAIC_GROUP_H
1780138,N_990621277,31-40,,VIC,3174,Yes,Yes,0068v000002X5VjAAK,2023-01-16,Yes,Not_First_Gift,Cash - One off,Web,Appeal,Non Seasonal,7.0,0,3174,MOSAIC_TYPE_H27,MOSAIC_GROUP_H


In [143]:
train_df.isna().sum()

SupporterID                       0
Age_Bucket                        0
Gender                       473180
State                             0
PostCode                          0
Have_Phone                        0
Have_Email                        0
Gift_ID                           0
GiftDate                          0
IsEmergencyGift             1630762
Is_First_Gift                     0
ProductType_Group                 0
GiftSolicitationChannel       21053
CampaignSubtype_Group             0
AppealSeason                      0
GiftAmount                        0
ConvertedTo_RG_Within_6M          0
POSTCODE                     114768
MOSAIC_TYPE                  114768
MOSAIC_GROUP                 114768
dtype: int64

In [144]:
train_df['GiftSolicitationChannel'] = train_df['GiftSolicitationChannel'].fillna('No or unknown')

In [145]:
train_df['IsEmergencyGift'] = train_df['IsEmergencyGift'].fillna('No or unknown')

In [146]:
train_df['MOSAIC_GROUP'] = train_df['MOSAIC_GROUP'].fillna('Unknown')

In [147]:
train_df['MOSAIC_TYPE'] = train_df['MOSAIC_TYPE'].fillna('Unknown')

In [148]:
train_df['Gender'] = train_df['Gender'].fillna('Unknown')

In [149]:
train_df.Gender.value_counts()

Gender
Female               700209
Male                 606640
Unknown              473180
Non-Binary               97
Prefer Not To Say        14
Name: count, dtype: int64

In [150]:
col = 'Gender'
idx_unk = train_df.index[train_df[col] == 'Unknown']
counts  = train_df.loc[train_df[col] != 'Unknown', col].value_counts()
cats    = counts.index.to_list()
probs   = (counts / counts.sum()).reindex(cats).values

rng = np.random.default_rng(42)  
choices = rng.choice(cats, size=len(idx_unk), p=probs)
train_df.loc[idx_unk, col] = choices
train_df = train_df.loc[train_df[col].notna() & (train_df[col] != 'Unknown')].copy()

In [151]:
train_df.Gender.value_counts()

Gender
Female               954236
Male                 825755
Non-Binary              132
Prefer Not To Say        17
Name: count, dtype: int64

In [152]:
train_df.drop(columns=["POSTCODE"], inplace=True)

In [153]:
train_df.isna().sum()

SupporterID                 0
Age_Bucket                  0
Gender                      0
State                       0
PostCode                    0
Have_Phone                  0
Have_Email                  0
Gift_ID                     0
GiftDate                    0
IsEmergencyGift             0
Is_First_Gift               0
ProductType_Group           0
GiftSolicitationChannel     0
CampaignSubtype_Group       0
AppealSeason                0
GiftAmount                  0
ConvertedTo_RG_Within_6M    0
MOSAIC_TYPE                 0
MOSAIC_GROUP                0
dtype: int64

In [154]:
Is_First_Gift = 'Is_First_Gift'           
df_first_gift = train_df[train_df['Is_First_Gift'].eq(Is_First_Gift)]
df_first_gift

Unnamed: 0,SupporterID,Age_Bucket,Gender,State,PostCode,Have_Phone,Have_Email,Gift_ID,GiftDate,IsEmergencyGift,Is_First_Gift,ProductType_Group,GiftSolicitationChannel,CampaignSubtype_Group,AppealSeason,GiftAmount,ConvertedTo_RG_Within_6M,MOSAIC_TYPE,MOSAIC_GROUP
0,A-990430482,71+,Male,NSW,2166,No,No,0062u00000AFwB5AAL,2020-11-04,No or unknown,Is_First_Gift,Cash - One off,Web,Appeal,Non Seasonal,56.0,0,MOSAIC_TYPE_F20,MOSAIC_GROUP_F
1,A-990434539,41-50,Female,NSW,2027,No,No,0062u00000AZ70jAAD,2020-12-03,No or unknown,Is_First_Gift,Cash - One off,DM,Appeal,Christmas,140.0,0,MOSAIC_TYPE_B06,MOSAIC_GROUP_B
2,A-990435870,61-70,Male,Other,,No,No,0062u00000Aa42AAAR,2020-12-16,No or unknown,Is_First_Gift,Cash - Inspired Gifts,DM,Appeal,Non Seasonal,70.0,0,Unknown,Unknown
3,A-990435901,71+,Male,Other,,No,No,0062u00000Aa997AAB,2020-12-17,No or unknown,Is_First_Gift,Cash - One off,General,Appeal,Non Seasonal,35.0,0,Unknown,Unknown
4,C-990018085,31-40,Female,VIC,3144,No,No,0062u000009YQXHAA4,2015-05-11,Yes,Is_First_Gift,Cash - One off,Inserts,Appeal,Non Seasonal,175.0,0,MOSAIC_TYPE_A01,MOSAIC_GROUP_A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1780132,N_990621270,19 - 30,Female,NSW,2212,Yes,Yes,0062u000009gjx5AAA,2020-08-18,Yes,Is_First_Gift,Cash - One off,Web,Appeal,Spring,52.5,0,MOSAIC_TYPE_C10,MOSAIC_GROUP_C
1780133,N_990621271,41-50,Female,VIC,3194,Yes,Yes,0062u000009hsXnAAI,2020-09-01,Yes,Is_First_Gift,Cash - One off,Web,Appeal,Spring,105.0,0,MOSAIC_TYPE_I30,MOSAIC_GROUP_I
1780134,N_990621272,51-60,Female,VIC,3075,Yes,Yes,0062u000009fhr5AAA,2020-08-06,Yes,Is_First_Gift,Cash - One off,Web,Appeal,Non Seasonal,7.0,0,MOSAIC_TYPE_J31,MOSAIC_GROUP_J
1780135,N_990621275,71+,Female,VIC,3008,Yes,Yes,0062u00000AEnGoAAL,2020-10-16,No or unknown,Is_First_Gift,Community Fundraising,No or unknown,Community Fundraising,Non Seasonal,70.0,0,MOSAIC_TYPE_B07,MOSAIC_GROUP_G


In [155]:
df_first_gift.ConvertedTo_RG_Within_6M.value_counts()

ConvertedTo_RG_Within_6M
0    187181
1      3369
Name: count, dtype: int64

### Building Model

In [156]:
df_first_gift.info()

<class 'pandas.core.frame.DataFrame'>
Index: 190550 entries, 0 to 1780136
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   SupporterID               190550 non-null  object  
 1   Age_Bucket                190550 non-null  object  
 2   Gender                    190550 non-null  object  
 3   State                     190550 non-null  category
 4   PostCode                  190550 non-null  object  
 5   Have_Phone                190550 non-null  object  
 6   Have_Email                190550 non-null  object  
 7   Gift_ID                   190550 non-null  object  
 8   GiftDate                  190550 non-null  object  
 9   IsEmergencyGift           190550 non-null  object  
 10  Is_First_Gift             190550 non-null  object  
 11  ProductType_Group         190550 non-null  object  
 12  GiftSolicitationChannel   190550 non-null  object  
 13  CampaignSubtype_Group     190550 

In [171]:
for label, content in df_first_gift.items():
    if pd.api.types.is_string_dtype(content):
        print(label)

SupporterID
Age_Bucket
Gender
State
PostCode
Have_Phone
Have_Email
Gift_ID
GiftDate
IsEmergencyGift
Is_First_Gift
ProductType_Group
GiftSolicitationChannel
CampaignSubtype_Group
AppealSeason
MOSAIC_TYPE
MOSAIC_GROUP


In [172]:
df_first_gift.head().isna()

Unnamed: 0,SupporterID,Age_Bucket,Gender,State,PostCode,Have_Phone,Have_Email,Gift_ID,GiftDate,IsEmergencyGift,Is_First_Gift,ProductType_Group,GiftSolicitationChannel,CampaignSubtype_Group,AppealSeason,GiftAmount,ConvertedTo_RG_Within_6M,MOSAIC_TYPE,MOSAIC_GROUP
6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
50,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
54,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
67,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
69,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [78]:
for label, content in df_first_gift.items():
    if pd.api.types.is_string_dtype(content):
        # Check datatype of target column
        column_datatype = df[label].dtype.name

        # Get random sample from column values
        example_value = content.sample(1).values

        # Infer random sample datatype
        example_value_dtype = pd.api.types.infer_dtype(example_value)
        print(f"Column name: {label} | Column dtype: {column_datatype} | Example value: {example_value} | Example value dtype: {example_value_dtype}")

Column name: SupporterID | Column dtype: object | Example value: ['C-990497923'] | Example value dtype: string
Column name: Age_Bucket | Column dtype: object | Example value: ['61-70'] | Example value dtype: string
Column name: Gender | Column dtype: object | Example value: ['Female'] | Example value dtype: string
Column name: State | Column dtype: category | Example value: ['QLD']
Categories (10, object): ['NSW' < 'VIC' < 'WA' < 'QLD' ... 'TAS' < 'NT' < 'INT' < 'Other'] | Example value dtype: categorical
Column name: PostCode | Column dtype: object | Example value: ['2145'] | Example value dtype: string
Column name: Have_Phone | Column dtype: object | Example value: ['Yes'] | Example value dtype: string
Column name: Have_Email | Column dtype: object | Example value: ['No'] | Example value dtype: string
Column name: Gift_ID | Column dtype: object | Example value: ['0062u000009PlFrAAK'] | Example value dtype: string
Column name: IsEmergencyGift | Column dtype: object | Example value: ['

In [79]:
# Start a count of how many object type columns there are
number_of_object_type_columns = 0

for label, content in df_first_gift.items():
    # Check to see if column is of object type (this will include the string columns)
    if pd.api.types.is_object_dtype(content): 
        # Check datatype of target column
        column_datatype = df[label].dtype.name

        # Get random sample from column values
        example_value = content.sample(1).values

        # Infer random sample datatype
        example_value_dtype = pd.api.types.infer_dtype(example_value)
        print(f"Column name: {label} | Column dtype: {column_datatype} | Example value: {example_value} | Example value dtype: {example_value_dtype}")

        number_of_object_type_columns += 1

print(f"\n[INFO] Total number of object type columns: {number_of_object_type_columns}")

Column name: SupporterID | Column dtype: object | Example value: ['C-990458166'] | Example value dtype: string
Column name: Age_Bucket | Column dtype: object | Example value: ['31-40'] | Example value dtype: string
Column name: Gender | Column dtype: object | Example value: ['Female'] | Example value dtype: string
Column name: PostCode | Column dtype: object | Example value: ['7277'] | Example value dtype: string
Column name: Have_Phone | Column dtype: object | Example value: ['No'] | Example value dtype: string
Column name: Have_Email | Column dtype: object | Example value: ['Yes'] | Example value dtype: string
Column name: Gift_ID | Column dtype: object | Example value: ['0062u000009We80AAC'] | Example value dtype: string
Column name: IsEmergencyGift | Column dtype: object | Example value: ['Yes'] | Example value dtype: string
Column name: Is_First_Gift | Column dtype: object | Example value: ['Is_First_Gift'] | Example value dtype: string
Column name: ProductType_Group | Column dtyp

In [80]:
# This will turn all of the object columns into category values
for label, content in df_first_gift.items(): 
    if pd.api.types.is_object_dtype(content):
        df_first_gift[label] = df_first_gift[label].astype("category")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_first_gift[label] = df_first_gift[label].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_first_gift[label] = df_first_gift[label].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_first_gift[label] = df_first_gift[label].astype("category")
A value is tryin

In [81]:
df_first_gift.info()

<class 'pandas.core.frame.DataFrame'>
Index: 190550 entries, 0 to 1780136
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   SupporterID               190550 non-null  category      
 1   Age_Bucket                190550 non-null  category      
 2   Gender                    190550 non-null  category      
 3   State                     190550 non-null  category      
 4   PostCode                  190550 non-null  category      
 5   Have_Phone                190550 non-null  category      
 6   Have_Email                190550 non-null  category      
 7   Gift_ID                   190550 non-null  category      
 8   GiftDate                  190550 non-null  datetime64[ns]
 9   IsEmergencyGift           190550 non-null  category      
 10  Is_First_Gift             190550 non-null  category      
 11  ProductType_Group         190550 non-null  category      
 12  GiftSo

In [85]:
df_first_gift.State.cat.categories

Index(['NSW', 'VIC', 'WA', 'QLD', 'ACT', 'SA', 'TAS', 'NT', 'INT', 'Other'], dtype='object')

In [86]:
# Inspect the category codes
df_first_gift.State.cat.codes

0          0
1          0
2          9
3          9
4          1
          ..
1780132    0
1780133    1
1780134    1
1780135    1
1780136    1
Length: 190550, dtype: int8

In [87]:
target_state_cat_number = 9
target_state_cat_value = df_first_gift.State.cat.categories[target_state_cat_number] 
print(f"[INFO] Target state category number {target_state_cat_number} maps to: {target_state_cat_value}")

[INFO] Target state category number 9 maps to: Other


In [88]:
for label, content in df_first_gift.items():
    if pd.api.types.is_object_dtype(content):
        # Turn object columns into category datatype
        df_first_gift[label] = df_first_gift[label].astype("category")

In [89]:
# Check missing values
df_first_gift.isna().sum().sort_values(ascending=False)[:20]

SupporterID                 0
Is_First_Gift               0
MOSAIC_TYPE                 0
ConvertedTo_RG_Within_6M    0
GiftAmount                  0
AppealSeason                0
CampaignSubtype_Group       0
GiftSolicitationChannel     0
ProductType_Group           0
IsEmergencyGift             0
Age_Bucket                  0
GiftDate                    0
Gift_ID                     0
Have_Email                  0
Have_Phone                  0
PostCode                    0
State                       0
Gender                      0
MOSAIC_GROUP                0
dtype: int64

In [90]:
# Find numeric columns 
for label, content in df_first_gift.items():
    if pd.api.types.is_numeric_dtype(content):
        # Check datatype of target column
        column_datatype = df_first_gift[label].dtype.name

        # Get random sample from column values
        example_value = content.sample(1).values

        # Infer random sample datatype
        example_value_dtype = pd.api.types.infer_dtype(example_value)
        print(f"Column name: {label} | Column dtype: {column_datatype} | Example value: {example_value} | Example value dtype: {example_value_dtype}")

Column name: GiftAmount | Column dtype: float64 | Example value: [137.2] | Example value dtype: floating
Column name: ConvertedTo_RG_Within_6M | Column dtype: int64 | Example value: [0] | Example value dtype: integer


In [91]:
# Check for which numeric columns have null values
for label, content in df_first_gift.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            print(f"Column name: {label} | Has missing values: {True}")
        else:
            print(f"Column name: {label} | Has missing values: {False}")

Column name: GiftAmount | Has missing values: False
Column name: ConvertedTo_RG_Within_6M | Has missing values: False


In [92]:
print(f"[INFO] Columns which are not numeric:")
for label, content in df_first_gift.items():
    if not pd.api.types.is_numeric_dtype(content):
        print(f"Column name: {label} | Column dtype: {df_first_gift[label].dtype.name}")

[INFO] Columns which are not numeric:
Column name: SupporterID | Column dtype: category
Column name: Age_Bucket | Column dtype: category
Column name: Gender | Column dtype: category
Column name: State | Column dtype: category
Column name: PostCode | Column dtype: category
Column name: Have_Phone | Column dtype: category
Column name: Have_Email | Column dtype: category
Column name: Gift_ID | Column dtype: category
Column name: GiftDate | Column dtype: datetime64[ns]
Column name: IsEmergencyGift | Column dtype: category
Column name: Is_First_Gift | Column dtype: category
Column name: ProductType_Group | Column dtype: category
Column name: GiftSolicitationChannel | Column dtype: category
Column name: CampaignSubtype_Group | Column dtype: category
Column name: AppealSeason | Column dtype: category
Column name: MOSAIC_TYPE | Column dtype: category
Column name: MOSAIC_GROUP | Column dtype: category


In [93]:
# 1. Create a dictionary to store column to category values (e.g. we turn our category types into numbers but we keep a record so we can go back)
column_to_category_dict = {} 

# 2. Turn categorical variables into numbers
for label, content in df_first_gift.items():

    # 3. Check columns which *aren't* numeric
    if not pd.api.types.is_numeric_dtype(content):

        # 4. Add binary column to inidicate whether sample had missing value
        df_first_gift[label+"_is_missing"] = pd.isnull(content).astype(int)

        # 5. Ensure content is categorical and get its category codes
        content_categories = pd.Categorical(content)
        content_category_codes = content_categories.codes + 1 # prevents -1 (the default for NaN values) from being used for missing values (we'll treat missing values as 0)

        # 6. Add column key to dictionary with code: category mapping per column
        column_to_category_dict[label] = dict(zip(content_category_codes, content_categories))
        
        # 7. Set the column to the numerical values (the category code value) 
        df_first_gift[label] = content_category_codes  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_first_gift[label+"_is_missing"] = pd.isnull(content).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_first_gift[label] = content_category_codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_first_gift[label+"_is_missing"] = pd.isnull(content).astype(int)
A value is trying to be

In [94]:
df_first_gift.info()

<class 'pandas.core.frame.DataFrame'>
Index: 190550 entries, 0 to 1780136
Data columns (total 36 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   SupporterID                         190550 non-null  int32  
 1   Age_Bucket                          190550 non-null  int8   
 2   Gender                              190550 non-null  int8   
 3   State                               190550 non-null  int8   
 4   PostCode                            190550 non-null  int16  
 5   Have_Phone                          190550 non-null  int8   
 6   Have_Email                          190550 non-null  int8   
 7   Gift_ID                             190550 non-null  int32  
 8   GiftDate                            190550 non-null  int16  
 9   IsEmergencyGift                     190550 non-null  int8   
 10  Is_First_Gift                       190550 non-null  int8   
 11  ProductType_Group             

In [95]:
for key, value in sorted(column_to_category_dict["GiftDate"].items()): # note: calling sorted() on dictionary.items() sorts the dictionary by keys 
    print(f"{key} -> {value}")

1 -> 2014-01-01 00:00:00
2 -> 2014-01-02 00:00:00
3 -> 2014-01-03 00:00:00
4 -> 2014-01-04 00:00:00
5 -> 2014-01-05 00:00:00
6 -> 2014-01-06 00:00:00
7 -> 2014-01-07 00:00:00
8 -> 2014-01-08 00:00:00
9 -> 2014-01-09 00:00:00
10 -> 2014-01-10 00:00:00
11 -> 2014-01-11 00:00:00
12 -> 2014-01-12 00:00:00
13 -> 2014-01-13 00:00:00
14 -> 2014-01-14 00:00:00
15 -> 2014-01-15 00:00:00
16 -> 2014-01-16 00:00:00
17 -> 2014-01-17 00:00:00
18 -> 2014-01-18 00:00:00
19 -> 2014-01-19 00:00:00
20 -> 2014-01-20 00:00:00
21 -> 2014-01-21 00:00:00
22 -> 2014-01-22 00:00:00
23 -> 2014-01-23 00:00:00
24 -> 2014-01-24 00:00:00
25 -> 2014-01-25 00:00:00
26 -> 2014-01-26 00:00:00
27 -> 2014-01-27 00:00:00
28 -> 2014-01-28 00:00:00
29 -> 2014-01-29 00:00:00
30 -> 2014-01-30 00:00:00
31 -> 2014-01-31 00:00:00
32 -> 2014-02-01 00:00:00
33 -> 2014-02-02 00:00:00
34 -> 2014-02-03 00:00:00
35 -> 2014-02-04 00:00:00
36 -> 2014-02-05 00:00:00
37 -> 2014-02-06 00:00:00
38 -> 2014-02-07 00:00:00
39 -> 2014-02-08 00:0

In [96]:
for key, value in sorted(column_to_category_dict['Age_Bucket'].items()): # note: calling sorted() on dictionary.items() sorts the dictionary by keys 
    print(f"{key} -> {value}")

1 -> 19 - 30
2 -> 31-40
3 -> 41-50
4 -> 51-60
5 -> 61-70
6 -> 71+


In [97]:
total_missing_values = df_first_gift.isna().sum().sum()

if total_missing_values == 0:
    print(f"[INFO] Total missing values: {total_missing_values} - Woohoo! Let's build a model!")
else:
    print(f"[INFO] Uh ohh... total missing values: {total_missing_values} - Perhaps we might have to retrace our steps to fill the values?")

[INFO] Total missing values: 0 - Woohoo! Let's build a model!


In [98]:
df_first_gift.describe()

Unnamed: 0,SupporterID,Age_Bucket,Gender,State,PostCode,Have_Phone,Have_Email,Gift_ID,GiftDate,IsEmergencyGift,...,Gift_ID_is_missing,GiftDate_is_missing,IsEmergencyGift_is_missing,Is_First_Gift_is_missing,ProductType_Group_is_missing,GiftSolicitationChannel_is_missing,CampaignSubtype_Group_is_missing,AppealSeason_is_missing,MOSAIC_TYPE_is_missing,MOSAIC_GROUP_is_missing
count,190550.0,190550.0,190550.0,190550.0,190550.0,190550.0,190550.0,190550.0,190550.0,190550.0,...,190550.0,190550.0,190550.0,190550.0,190550.0,190550.0,190550.0,190550.0,190550.0,190550.0
mean,95275.5,3.372139,1.438615,3.519543,692.946864,1.604529,1.643369,95275.5,1756.543448,1.431582,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,55007.191234,1.539438,0.496557,2.943501,596.61307,0.488953,0.479006,55007.191234,968.640535,0.495298,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47638.25,2.0,1.0,1.0,154.0,1.0,1.0,47638.25,916.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,95275.5,3.0,1.0,2.0,621.0,2.0,2.0,95275.5,1749.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,142912.75,5.0,2.0,4.0,1172.0,2.0,2.0,142912.75,2692.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,190550.0,6.0,4.0,10.0,2072.0,2.0,2.0,190550.0,3198.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Fitting a machine learning model to our preprocessed data

In [99]:
X = df_first_gift.drop('ConvertedTo_RG_Within_6M',axis = 1)

y = df_first_gift['ConvertedTo_RG_Within_6M']


In [100]:
X

Unnamed: 0,SupporterID,Age_Bucket,Gender,State,PostCode,Have_Phone,Have_Email,Gift_ID,GiftDate,IsEmergencyGift,...,Gift_ID_is_missing,GiftDate_is_missing,IsEmergencyGift_is_missing,Is_First_Gift_is_missing,ProductType_Group_is_missing,GiftSolicitationChannel_is_missing,CampaignSubtype_Group_is_missing,AppealSeason_is_missing,MOSAIC_TYPE_is_missing,MOSAIC_GROUP_is_missing
0,1,6,2,1,213,1,1,129392,2500,1,...,0,0,0,0,0,0,0,0,0,0
1,2,3,1,1,86,1,1,130854,2529,1,...,0,0,0,0,0,0,0,0,0,0
2,3,5,2,10,1,1,1,131654,2542,1,...,0,0,0,0,0,0,0,0,0,0
3,4,6,2,10,1,1,1,131737,2543,1,...,0,0,0,0,0,0,0,0,0,0
4,5,2,1,2,745,1,1,117317,496,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1780132,190546,1,1,1,246,2,2,125368,2422,2,...,0,0,0,0,0,0,0,0,0,0
1780133,190547,3,1,2,793,2,2,125880,2436,2,...,0,0,0,0,0,0,0,0,0,0
1780134,190548,4,1,2,687,2,2,123901,2410,2,...,0,0,0,0,0,0,0,0,0,0
1780135,190549,6,1,2,627,2,2,128499,2481,1,...,0,0,0,0,0,0,0,0,0,0


In [104]:
y

0          0
1          0
2          0
3          0
4          0
          ..
1780132    0
1780133    0
1780134    0
1780135    0
1780136    0
Name: ConvertedTo_RG_Within_6M, Length: 190550, dtype: int64

In [105]:
# Split data into train and test sets
np.random.seed(42)

# Split into train & test set
X_train,X_test,y_train,y_test = train_test_split(X,
                                                 y,
                                                 test_size=0.2)

In [106]:
X_train

Unnamed: 0,SupporterID,Age_Bucket,Gender,State,PostCode,Have_Phone,Have_Email,Gift_ID,GiftDate,IsEmergencyGift,...,Gift_ID_is_missing,GiftDate_is_missing,IsEmergencyGift_is_missing,Is_First_Gift_is_missing,ProductType_Group_is_missing,GiftSolicitationChannel_is_missing,CampaignSubtype_Group_is_missing,AppealSeason_is_missing,MOSAIC_TYPE_is_missing,MOSAIC_GROUP_is_missing
1545283,133200,2,2,5,613,2,1,132244,2563,2,...,0,0,0,0,0,0,0,0,0,0
172611,11260,2,1,3,1683,1,1,111125,224,1,...,0,0,0,0,0,0,0,0,0,0
1592389,140954,2,1,2,624,2,2,140243,2684,2,...,0,0,0,0,0,0,0,0,0,0
890715,70113,6,1,1,487,2,1,69439,1226,1,...,0,0,0,0,0,0,0,0,0,0
217453,13555,2,2,2,1059,1,1,17186,319,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453351,119880,2,1,2,675,2,2,38451,2362,2,...,0,0,0,0,0,0,0,0,0,0
1291396,103695,6,1,2,795,2,2,105891,1912,1,...,0,0,0,0,0,0,0,0,0,0
1532181,131933,3,2,10,1,2,1,130959,2531,1,...,0,0,0,0,0,0,0,0,0,0
1616384,146868,5,1,6,1516,2,2,146239,2732,2,...,0,0,0,0,0,0,0,0,0,0


In [107]:
y_train

1545283    0
172611     0
1592389    0
890715     0
217453     0
          ..
1453351    0
1291396    0
1532181    0
1616384    0
1458603    0
Name: ConvertedTo_RG_Within_6M, Length: 152440, dtype: int64

In [109]:
test_set = pd.read_csv('Test_p2.csv')
test_set

Unnamed: 0,SupporterID,Age_Bucket,Gender,State,PostCode,Have_Phone,Have_Email,Gift_ID,GiftDate,IsEmergencyGift,Is_First_Gift,ProductType_Group,GiftSolicitationChannel,CampaignSubtype_Group,AppealSeason,GiftAmount
0,C-990078102,Unknown,,NSW,2088.0,No,No,0062u000009SyPiAAK,8/01/2014,Yes,Is_First_Gift,Cash - One off,Web,Appeal,Non Seasonal,70.0
1,C-990116708,Unknown,,SA,5006.0,No,No,0062u000009FmpmAAC,4/27/2015,Yes,Is_First_Gift,Cash - One off,Web,Appeal,Non Seasonal,35.0
2,C-990132428,Unknown,Female,QLD,4007.0,Yes,Yes,0062u000009SLdHAAW,9/02/2015,,Is_First_Gift,Cash - Inspired Gifts,Web,Appeal,Non Seasonal,28.0
3,C-990137490,Unknown,Female,SA,,Yes,Yes,0062u00000C4UWoAAN,4/29/2021,Yes,Is_First_Gift,Cash - One off,EDM,Appeal,Non Seasonal,576.1
4,C-990189756,Unknown,,NSW,2777.0,Yes,Yes,0062u00000DC1KSAA1,9/04/2021,,Is_First_Gift,Cash - Inspired Gifts,EDM,Appeal,Non Seasonal,42.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15219,N_990621183,Unknown,,NSW,2300.0,Yes,Yes,0062u00000C5RabAAF,5/11/2021,Yes,Is_First_Gift,Cash - One off,Web,Appeal,Non Seasonal,35.0
15220,N_990621184,Unknown,,,,Yes,Yes,0062u000009gBTKAA2,8/07/2020,Yes,Is_First_Gift,Cash - One off,Web,Appeal,Non Seasonal,61.6
15221,N_990621235,Unknown,,NSW,2060.0,Yes,Yes,0062u00000Bble2AAB,3/15/2021,Yes,Is_First_Gift,Cash - One off,General,Appeal,Non Seasonal,147.0
15222,N_990621240,Unknown,,NSW,2077.0,Yes,Yes,0062u00000BbzCZAAZ,3/17/2021,,Is_First_Gift,Cash - One off,Web,Appeal,Non Seasonal,107.1
