#### Instructions
+ Apply the Random Forests algorithm but this time only by upscaling the data.
+ Use the feature selection techniques that you have learned in class (PCA, etc.) to decide whether you want to use all of the features.
+ Discuss the output and its impact on the business scenario. Is the cost of a false positive equal to the cost of a false negative? How would you change your algorithm or data in order to maximize the return for the business?


#### Importing libraries

In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import warnings
# Silence every warning for the rest of the session. No category is given, so this
# also hides deprecation and data-quality warnings raised by pandas / scikit-learn.
warnings.filterwarnings("ignore")


In [52]:
# Load the three input tables (numeric features, categorical features, targets)
# from CSV files in the working directory.
numerical, categorical, target = (
    pd.read_csv(csv_path)
    for csv_path in ('numerical.csv', 'categorical.csv', 'target.csv')
)

In [53]:
# Inspect the dtypes as loaded, before any casting is applied.
categorical.dtypes

STATE           object
CLUSTER          int64
HOMEOWNR        object
GENDER          object
DATASRCE         int64
RFA_2R          object
RFA_2A          object
GEOCODE2        object
DOMAIN_A        object
DOMAIN_B         int64
ODATEW_YR        int64
ODATEW_MM        int64
DOB_YR           int64
DOB_MM           int64
MINRDATE_YR      int64
MINRDATE_MM      int64
MAXRDATE_YR      int64
MAXRDATE_MM      int64
LASTDATE_YR      int64
LASTDATE_MM      int64
FIRSTDATE_YR     int64
FIRSTDATE_MM     int64
dtype: object

In [54]:
# Cast every column to object so that the integer-coded fields (CLUSTER, DATASRCE,
# the *_YR / *_MM parts, ...) are picked up by the later select_dtypes(object)
# split and one-hot encoded instead of being scaled as numbers.
categorical = categorical.astype(object)

In [55]:
# Confirm that every column is now of dtype object.
categorical.dtypes

STATE           object
CLUSTER         object
HOMEOWNR        object
GENDER          object
DATASRCE        object
RFA_2R          object
RFA_2A          object
GEOCODE2        object
DOMAIN_A        object
DOMAIN_B        object
ODATEW_YR       object
ODATEW_MM       object
DOB_YR          object
DOB_MM          object
MINRDATE_YR     object
MINRDATE_MM     object
MAXRDATE_YR     object
MAXRDATE_MM     object
LASTDATE_YR     object
LASTDATE_MM     object
FIRSTDATE_YR    object
FIRSTDATE_MM    object
dtype: object

In [56]:
# The three tables must have the same number of rows before they are combined.
numerical.shape, categorical.shape, target.shape

((95412, 315), (95412, 22), (95412, 2))

#### Building the dataframe

In [57]:
# Feature matrix: numeric and categorical tables placed side by side.
X = pd.concat([numerical, categorical], axis='columns')

# The target table is kept whole here; the column to predict is picked at split time.
Y = target

In [58]:
# Column count should equal the sum of the numeric and categorical column counts.
X.shape

(95412, 337)

#### Train test split

In [59]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
# Only the TARGET_B column of the target table is used as the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y['TARGET_B'],
    test_size=0.25,
    random_state=1,
)

In [60]:
# Partition each split by dtype: numeric columns go to the scaler,
# object columns go to the one-hot encoder.
X_train_num, X_train_cat = (
    X_train.select_dtypes(include=kind) for kind in (np.number, object)
)
X_test_num, X_test_cat = (
    X_test.select_dtypes(include=kind) for kind in (np.number, object)
)

#### MinMax Scaler

In [61]:
from sklearn.preprocessing import MinMaxScaler
def EscaladorMinMax(data, MinMaxtransformer):
    """Apply an already-fitted scaler to ``data`` and return a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are passed to ``MinMaxtransformer.transform``.
    MinMaxtransformer : object
        Fitted transformer exposing ``transform(data)``.

    Returns
    -------
    pandas.DataFrame
        Transformed values under the column names of ``data``. The frame gets
        a fresh default index; the index of ``data`` is not carried over.

    Notes
    -----
    Prints the shape of the transformed array as a quick sanity check.
    """
    scaled_values = MinMaxtransformer.transform(data)
    print(scaled_values.shape)
    return pd.DataFrame(scaled_values, columns=data.columns)

In [62]:
# Fit the scaler on the training split only; the same fitted object is then
# applied to both the training and the test split.
MinMaxtransformer = MinMaxScaler().fit(X_train_num) # Only run once

In [63]:
# Scale both splits with the scaler fitted on the training data
# (each call prints the shape of the scaled array).
X_train_num_scaled = EscaladorMinMax(data=X_train_num, MinMaxtransformer=MinMaxtransformer)
X_test_num_scaled = EscaladorMinMax(data=X_test_num, MinMaxtransformer=MinMaxtransformer)

(71559, 315)
(23853, 315)


#### One Hot Encoding

In [64]:
from sklearn.preprocessing import OneHotEncoder
def OneHotEncoding(data, OneHotEncoder):
    """Encode ``data`` with an already-fitted encoder and return a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Categorical columns to encode.
    OneHotEncoder : object
        Fitted encoder instance exposing ``transform(data)``. Inside this
        function the parameter name shadows the imported class of the same
        name; it is kept so existing keyword calls continue to work.

    Returns
    -------
    pandas.DataFrame
        Dense encoded matrix. Columns are labelled with integer positions
        (0..n-1) and the frame has a fresh default index.
    """
    encoded = OneHotEncoder.transform(data)
    # Encoders configured for dense output return an ndarray, which has no
    # ``toarray``; only densify when the result actually provides it.
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()
    return pd.DataFrame(encoded)

In [65]:
# Fit the encoder on the training categoricals only; the same fitted object is
# reused for the test split. drop='first' removes one dummy column per feature.
# NOTE(review): with handle_unknown='ignore', a category not seen during fit
# appears to be encoded as all zeros, i.e. indistinguishable from the dropped
# first category. Confirm this is acceptable; the global warnings filter set at
# import time would hide any warning scikit-learn emits about it.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore').fit(X_train_cat) # Only run once

In [66]:
# Encode both splits with the encoder fitted on the training data.
X_train_cat_encoded = OneHotEncoding(data=X_train_cat, OneHotEncoder=encoder)
X_test_cat_encoded = OneHotEncoding(data=X_test_cat, OneHotEncoder=encoder)

In [67]:
# Preview the encoded training frame. Column labels are positional integers
# because the helper builds the DataFrame without column names.
X_train_cat_encoded.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,304,305,306,307,308,309,310,311,312,313
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [68]:
# Both processed training frames must have the same number of rows.
X_train_num_scaled.shape, X_train_cat_encoded.shape

((71559, 315), (71559, 314))