#### Instructions
+ Apply the Random Forests algorithm but this time only by upscaling the data.
+ Use Feature Selections that you have learned in class to decide if you want to use all of the features (PCA, etc)
+ Discuss the output and its impact in the bussiness scenario. Is the cost of a false positive equals to the cost of the false negative? How would you change your algorithm or data in order to maximize the return of the bussiness?


#### Importing libraries

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings("ignore")


In [31]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
target = pd.read_csv('target.csv')

In [32]:
categorical.dtypes

STATE           object
CLUSTER          int64
HOMEOWNR        object
GENDER          object
DATASRCE         int64
RFA_2R          object
RFA_2A          object
GEOCODE2        object
DOMAIN_A        object
DOMAIN_B         int64
ODATEW_YR        int64
ODATEW_MM        int64
DOB_YR           int64
DOB_MM           int64
MINRDATE_YR      int64
MINRDATE_MM      int64
MAXRDATE_YR      int64
MAXRDATE_MM      int64
LASTDATE_YR      int64
LASTDATE_MM      int64
FIRSTDATE_YR     int64
FIRSTDATE_MM     int64
dtype: object

In [33]:
categorical = categorical.astype(object)

In [34]:
categorical.dtypes

STATE           object
CLUSTER         object
HOMEOWNR        object
GENDER          object
DATASRCE        object
RFA_2R          object
RFA_2A          object
GEOCODE2        object
DOMAIN_A        object
DOMAIN_B        object
ODATEW_YR       object
ODATEW_MM       object
DOB_YR          object
DOB_MM          object
MINRDATE_YR     object
MINRDATE_MM     object
MAXRDATE_YR     object
MAXRDATE_MM     object
LASTDATE_YR     object
LASTDATE_MM     object
FIRSTDATE_YR    object
FIRSTDATE_MM    object
dtype: object

In [35]:
numerical.shape, categorical.shape, target.shape

((95412, 315), (95412, 22), (95412, 2))

#### Building the dataframe

In [36]:
X = pd.concat([numerical,categorical], axis = 1)
Y = target

In [37]:
X.shape

(95412, 337)

#### Train test split

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, Y['TARGET_B'], test_size = 0.2, random_state = 1)

In [39]:
X_train_num = X_train.select_dtypes(np.number)
X_train_cat = X_train.select_dtypes(object)

X_test_num = X_test.select_dtypes(np.number)
X_test_cat = X_test.select_dtypes(object)

MinMax Scaler

In [40]:
from sklearn.preprocessing import MinMaxScaler
def EscaladorMinMax(data, MinMaxtransformer):
    X_normalized = MinMaxtransformer.transform(data)
    print(X_normalized.shape)
    X_normalized = pd.DataFrame(X_normalized,columns=data.columns)
    return X_normalized

In [41]:
MinMaxtransformer = MinMaxScaler().fit(X_train_num) # Only run once

In [42]:
X_train_num_scaled = EscaladorMinMax(X_train_num, MinMaxtransformer)
X_test_num_scaled = EscaladorMinMax(X_test_num, MinMaxtransformer)

(76329, 315)
(19083, 315)


In [43]:
X_train_num_scaled.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,1.4e-05,0.42268,0.666667,0.888889,0.004149,0.020202,0.181818,0.494949,0.070707,0.030303,...,0.146341,0.006,0.007035,0.011,0.019301,0.01197,0.250609,0.0,0.0,0.262295
1,0.0,0.624862,0.666667,1.0,0.0,0.0,0.232323,0.292929,0.373737,0.050505,...,0.0,0.004,0.005025,0.01,0.005515,0.01296,0.098651,1.0,0.666667,0.704918
2,1.4e-05,0.608247,0.166667,1.0,0.049793,0.0,0.212121,0.585859,0.0,0.070707,...,0.243902,0.006,0.007035,0.012,0.001838,0.010591,0.476022,0.0,0.333333,0.065574
3,1.4e-05,0.319588,0.333333,1.0,0.0,0.040404,0.414141,0.494949,0.252525,0.10101,...,0.02439,0.03,0.01005,0.015,0.008272,0.027145,0.790496,0.0,0.0,0.131148
4,1.4e-05,0.624862,0.166667,0.222222,0.0,0.0,0.272727,0.474747,0.191919,0.020202,...,0.02439,0.05,0.020101,0.025,0.008272,0.046939,0.30999,1.0,0.0,0.704918


One Hot Encoding

In [44]:
from sklearn.preprocessing import OneHotEncoder
def OneHotEncoding(data, OneHotEncoder):
    encoded = OneHotEncoder.transform(data).toarray()
    onehot_encoded = pd.DataFrame(encoded)
    return onehot_encoded

In [45]:
encoder = OneHotEncoder(drop='first', handle_unknown='ignore').fit(X_train_cat) # Only run once

In [46]:
X_train_cat_encoded = OneHotEncoding(X_train_cat, encoder)
X_test_cat_encoded = OneHotEncoding(X_test_cat, encoder)

In [47]:
X_train_cat_encoded.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,304,305,306,307,308,309,310,311,312,313
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
X_train_num_scaled.shape, X_train_cat_encoded.shape

((76329, 315), (76329, 314))

#### Full train and test dataframes

In [49]:
X_train_full = pd.concat([X_train_num_scaled, X_train_cat_encoded], axis = 1)
X_test_full = pd.concat([X_test_num_scaled, X_test_cat_encoded], axis = 1)


In [50]:
X_train_full.shape, X_test_full.shape, Y.shape

((76329, 629), (19083, 629), (95412, 2))

In [51]:
y_train.value_counts()

0    72464
1     3865
Name: TARGET_B, dtype: int64

#### Finding out how many people donated

In [52]:
trainset = pd.concat([X_train, y_train], axis=1)

In [53]:
category_0 = trainset[trainset['TARGET_B']== 0 ]
category_1 = trainset[trainset['TARGET_B']== 1 ]

In [54]:
category_1.shape

(3865, 338)

#### Upscaling the data from 3865 to 72464

## Upsampling (oversampling)

In [55]:
from sklearn.utils import resample

category_1_oversampled = resample(category_1, 
                                replace=True, 
                                n_samples = len(category_0))

In [56]:
print(category_0.shape)
print(category_1_oversampled.shape)

(72464, 338)
(72464, 338)


## Random Forest

In [57]:
category_0 = trainset[trainset['TARGET_B']==0].sample(len(trainset[trainset['TARGET_B']==1]))
category_1 = trainset[trainset['TARGET_B']== 1 ]
trainset_new = pd.concat([category_0, category_1], axis = 0)
trainset_new = trainset_new.sample(frac =1) #randomize the rows
X_train = trainset_new.drop(['TARGET_B'], axis=1)
y_train = trainset_new['TARGET_B']

In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

clf = RandomForestClassifier(max_depth=5,   # max number of questions to ask
                            min_samples_split=20,   # amount of rows still considered at every question
                            min_samples_leaf =20,   # ultimate answer based on at least this many rows
                            max_samples=0.8,    # fraction of X-train to use in each tree
                            random_state = 42)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

y_pred = clf.predict(X_test)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

ValueError: could not convert string to float: 'NC'