In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [2]:
# Read the three lab CSVs: numerical features, categorical features and targets.
csv_paths = [
    'files_for_lab/numerical.csv',
    'files_for_lab/categorical.csv',
    'files_for_lab/target.csv',
]
numericals_df, categoricals_df, targets_df = map(pd.read_csv, csv_paths)

In [3]:
# Place the three frames side by side as one table and preview the first rows.
concat_df = pd.concat(
    [numericals_df, categoricals_df, targets_df],
    axis='columns',
)
concat_df.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
0,0,60.0,5,9,0,0,39,34,18,10,...,92,8,94,2,95,12,89,11,0,0.0
1,1,46.0,6,9,16,0,15,55,11,6,...,93,10,95,12,95,12,93,10,0,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,91,11,92,7,95,12,90,1,0,0.0
3,0,70.0,1,4,2,0,23,14,31,3,...,87,11,94,11,95,12,87,2,0,0.0
4,0,78.0,3,2,60,1,28,9,53,26,...,93,10,96,1,96,1,79,3,0,0.0


In [4]:
# Class balance of the binary target: number of rows per TARGET_B value
targets_df.loc[:, 'TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [5]:
# (rows, columns) of the combined frame
concat_df.shape

(95412, 339)

In [6]:
# Drop every row that contains a NaN. The result is only displayed here, not
# assigned, so concat_df itself is left unchanged by this cell.
concat_df.dropna(axis = 0)

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
0,0,60.000000,5,9,0,0,39,34,18,10,...,92,8,94,2,95,12,89,11,0,0.0
1,1,46.000000,6,9,16,0,15,55,11,6,...,93,10,95,12,95,12,93,10,0,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,91,11,92,7,95,12,90,1,0,0.0
3,0,70.000000,1,4,2,0,23,14,31,3,...,87,11,94,11,95,12,87,2,0,0.0
4,0,78.000000,3,2,60,1,28,9,53,26,...,93,10,96,1,96,1,79,3,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1,61.611649,5,9,0,14,36,47,11,7,...,96,2,96,2,96,2,96,2,0,0.0
95408,1,48.000000,7,9,1,0,31,43,19,4,...,96,3,96,3,96,3,96,3,0,0.0
95409,1,60.000000,5,9,0,0,18,46,20,7,...,96,3,95,1,96,10,94,10,0,0.0
95410,0,58.000000,7,9,0,0,28,35,20,9,...,90,11,96,8,97,1,86,12,1,18.0


No NaNs found.

In [7]:
# Separate the classification target from the features, then split the
# features by dtype so the categorical columns can be encoded separately.

y = concat_df['TARGET_B']
X = concat_df.drop(columns=['TARGET_B'])

numerical_X = X.select_dtypes(include=np.number)
categorical_X = X.select_dtypes(include=object)

categorical_X.head()

Unnamed: 0,STATE,HOMEOWNR,GENDER,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A
0,IL,H,F,L,E,C,T
1,CA,H,M,L,G,A,S
2,NC,U,M,L,E,C,R
3,CA,U,F,L,E,C,R
4,FL,H,F,L,F,A,S


In [8]:
# Preview the first rows of the numeric feature columns
numerical_X.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_D
0,0,60.0,5,9,0,0,39,34,18,10,...,12,92,8,94,2,95,12,89,11,0.0
1,1,46.0,6,9,16,0,15,55,11,6,...,2,93,10,95,12,95,12,93,10,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,2,91,11,92,7,95,12,90,1,0.0
3,0,70.0,1,4,2,0,23,14,31,3,...,1,87,11,94,11,95,12,87,2,0.0
4,0,78.0,3,2,60,1,28,9,53,26,...,1,93,10,96,1,96,1,79,3,0.0


In [10]:
# One-hot encode the categorical columns; drop='first' removes the first
# category of every feature from the encoded output.
encoder = OneHotEncoder(drop='first').fit(categorical_X)

enc_categorical = encoder.transform(categorical_X).toarray()

# Name every encoded column "<feature>_<category>". With the default integer
# labels (0, 1, ...) the combined frame would mix int and str column names,
# which recent scikit-learn versions refuse when fitting an estimator.
encoded_names = [
    f"{feature}_{category}"
    for feature, categories in zip(categorical_X.columns, encoder.categories_)
    for category in categories[1:]  # index 0 is the category dropped by drop='first'
]
assert len(encoded_names) == enc_categorical.shape[1], "encoded column names do not match encoder output"

# Reuse the source index so the column-wise concat lines up row for row
# instead of relying on both frames happening to share a default RangeIndex.
enc_categorical = pd.DataFrame(enc_categorical, columns=encoded_names, index=categorical_X.index)
X = pd.concat([numerical_X, enc_categorical], axis = 1)

# Hold out 20% of the rows as the test set (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

In [11]:
# Re-check the TARGET_B class counts to decide whether resampling is needed
concat_df.loc[:, 'TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

Since the target column is very unbalanced, the minority class is oversampled in the training set:

In [12]:
# Put the training features and labels back into one frame so that rows are
# resampled together with their label.
train_set = pd.concat([X_train, y_train], axis=1)

# All rows with TARGET_B == 0 are kept unchanged.
target_0 = train_set[train_set['TARGET_B'] == 0]

# Oversample the TARGET_B == 1 rows with replacement until there are as many
# as TARGET_B == 0 rows. random_state pins the draw so that re-running the
# notebook produces the same training set (and the same downstream scores).
target_1 = train_set[train_set['TARGET_B'] == 1].sample(
    len(target_0), replace=True, random_state=12
)
print(target_1.shape)

trainset_new = pd.concat([target_0, target_1], axis = 0)

# Split the balanced frame back into features and label.
X_train = trainset_new.drop(['TARGET_B'], axis=1)
y_train = trainset_new['TARGET_B']
print(X_train.shape)

(72442, 356)
(144884, 355)


Dropping the second target (TARGET_D)

In [13]:
# Pull TARGET_D out of the feature frames: keep it as the regression target
# and remove it from the classifier's inputs.
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)

y_train_regression, y_test_regression = X_train['TARGET_D'], X_test['TARGET_D']

X_train = X_train.drop(columns=['TARGET_D'])
X_test = X_test.drop(columns=['TARGET_D'])

Applying the Random Forest Classifier

In [14]:
# Depth-limited forest with minimum split/leaf sizes; random_state is fixed
# for repeatability.
forest_params = {
    'max_depth': 5,
    'min_samples_split': 20,
    'min_samples_leaf': 20,
    'max_samples': 0.9,
    'random_state': 12,
}
rfc = RandomForestClassifier(**forest_params).fit(X_train, y_train)

# Score on the training data first, then on the held-out test data.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(rfc.score(features, labels))

# Test-set predictions, reused by the confusion matrix below.
y_pred = rfc.predict(X_test)

display(confusion_matrix(y_test, y_pred))



0.6219458325280914




0.6120106901430593




array([[11172,  6955],
       [  449,   507]])

Checking the cross-validation scores

In [15]:
# 5-fold cross-validation of the classifier on the training data.
# NOTE(review): X_train was oversampled with replacement in an earlier cell, so
# copies of the same row can land in both the fitting and the validation part
# of a fold; these scores may be optimistic. Confirm before relying on them.
cross_val_scores = cross_val_score(estimator=rfc, X=X_train, y=y_train, cv=5)
cross_val_scores



array([0.61786934, 0.62332194, 0.61579874, 0.62014701, 0.62147985])

In [16]:
# Average score across the cross-validation folds
print(cross_val_scores.mean())

0.6197233770220891


In [17]:
# Confusion matrix of the test-set predictions (rows: true class, columns: predicted class)
display(confusion_matrix(y_true=y_test, y_pred=y_pred))

array([[11172,  6955],
       [  449,   507]])

A false positive might be more costly for the company, since it will spend money trying to engage a customer who does not end up donating. On the other hand, a false negative could turn out positively, because receiving a donation from a customer who was not expected to donate would boost earnings.