#### Instructions
+ Apply the Random Forests algorithm but this time only by upscaling the data.
+ Use Feature Selections that you have learned in class to decide if you want to use all of the features (PCA, etc)
+ Discuss the output and its impact in the bussiness scenario. Is the cost of a false positive equals to the cost of the false negative? How would you change your algorithm or data in order to maximize the return of the bussiness?


#### Importing libraries

In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings("ignore")


In [102]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
target = pd.read_csv('target.csv')

In [103]:
categorical.dtypes

STATE           object
CLUSTER          int64
HOMEOWNR        object
GENDER          object
DATASRCE         int64
RFA_2R          object
RFA_2A          object
GEOCODE2        object
DOMAIN_A        object
DOMAIN_B         int64
ODATEW_YR        int64
ODATEW_MM        int64
DOB_YR           int64
DOB_MM           int64
MINRDATE_YR      int64
MINRDATE_MM      int64
MAXRDATE_YR      int64
MAXRDATE_MM      int64
LASTDATE_YR      int64
LASTDATE_MM      int64
FIRSTDATE_YR     int64
FIRSTDATE_MM     int64
dtype: object

In [104]:
categorical = categorical.astype(object)

In [105]:
categorical.dtypes

STATE           object
CLUSTER         object
HOMEOWNR        object
GENDER          object
DATASRCE        object
RFA_2R          object
RFA_2A          object
GEOCODE2        object
DOMAIN_A        object
DOMAIN_B        object
ODATEW_YR       object
ODATEW_MM       object
DOB_YR          object
DOB_MM          object
MINRDATE_YR     object
MINRDATE_MM     object
MAXRDATE_YR     object
MAXRDATE_MM     object
LASTDATE_YR     object
LASTDATE_MM     object
FIRSTDATE_YR    object
FIRSTDATE_MM    object
dtype: object

In [106]:
numerical.shape, categorical.shape, target.shape

((95412, 315), (95412, 22), (95412, 2))

#### Building the dataframe

In [107]:
X = pd.concat([numerical,categorical], axis = 1)
Y = target

In [108]:
X.shape

(95412, 337)

#### Train test split

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, Y['TARGET_B'], test_size = 0.25, random_state = 1)

In [110]:
X_train_num = X_train.select_dtypes(np.number)
X_train_cat = X_train.select_dtypes(object)

X_test_num = X_test.select_dtypes(np.number)
X_test_cat = X_test.select_dtypes(object)

MinMax Scaler

In [111]:
from sklearn.preprocessing import MinMaxScaler
def EscaladorMinMax(data, MinMaxtransformer):
    X_normalized = MinMaxtransformer.transform(data)
    print(X_normalized.shape)
    X_normalized = pd.DataFrame(X_normalized,columns=data.columns)
    return X_normalized

In [112]:
MinMaxtransformer = MinMaxScaler().fit(X_train_num) # Only run once

In [113]:
X_train_num_scaled = EscaladorMinMax(X_train_num, MinMaxtransformer)
X_test_num_scaled = EscaladorMinMax(X_test_num, MinMaxtransformer)

(71559, 315)
(23853, 315)


In [114]:
X_train_num_scaled.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0.0,0.587629,0.166667,1.0,0.0,0.0,0.373737,0.363636,0.20202,0.050505,...,0.121951,0.006,0.001005,0.005,0.004596,0.007104,0.945197,0.0,1.0,0.606557
1,1.4e-05,0.814433,0.5,1.0,0.0,0.0,0.424242,0.060606,0.79798,0.060606,...,0.04878,0.006,0.005025,0.01,0.000919,0.0082,0.204055,0.0,0.666667,0.196721
2,0.0,0.624862,0.666667,1.0,0.0,0.020202,0.515152,0.464646,0.333333,0.040404,...,0.487805,0.002,0.001005,0.006,0.01011,0.003237,0.104976,1.0,0.666667,0.016393
3,1.4e-05,0.793814,0.666667,1.0,0.0,0.070707,0.323232,0.464646,0.313131,0.0,...,0.121951,0.01,0.020101,0.022,0.006434,0.029973,0.214238,0.0,0.0,0.786885
4,1.4e-05,0.793814,0.666667,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.146341,0.01,0.01206,0.01,0.013787,0.018733,0.987454,1.0,0.333333,0.213115


One Hot Encoding

In [115]:
from sklearn.preprocessing import OneHotEncoder
def OneHotEncoding(data, OneHotEncoder):
    encoded = OneHotEncoder.transform(data).toarray()
    onehot_encoded = pd.DataFrame(encoded)
    return onehot_encoded

In [116]:
encoder = OneHotEncoder(drop='first', handle_unknown='ignore').fit(X_train_cat) # Only run once

In [117]:
X_train_cat_encoded = OneHotEncoding(X_train_cat, encoder)
X_test_cat_encoded = OneHotEncoding(X_test_cat, encoder)

In [118]:
X_train_cat_encoded.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,304,305,306,307,308,309,310,311,312,313
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [119]:
X_train_num_scaled.shape, X_train_cat_encoded.shape

((71559, 315), (71559, 314))

#### Full train and test dataframes

In [120]:
X_train_full = pd.concat([X_train_num_scaled, X_train_cat_encoded], axis = 1)
X_test_full = pd.concat([X_test_num_scaled, X_test_cat_encoded], axis = 1)


In [121]:
X_train_full.shape, X_test_full.shape, Y.shape

((71559, 629), (23853, 629), (95412, 2))

#### Finding out how many people donated

In [122]:
category_0 = Y[Y['TARGET_B']== 0 ]
category_1 = Y[Y['TARGET_B']== 1 ]

In [123]:
category_1.shape

(4843, 2)

#### Upscaling the data from 4843 to 95412

## Upsampling (oversampling)

In [124]:
from sklearn.utils import resample

category_1_oversampled = resample(category_1, 
                                replace=True, 
                                n_samples = len(category_0))

In [125]:
print(category_0.shape)
print(category_1_oversampled.shape)

(90569, 2)
(90569, 2)
