![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

# Lab | Random Forests

For this lab, you will be using the CSV files provided in the `files_for_lab` folder.

### Instructions

- Apply the Random Forests algorithm but this time only by upscaling the data using `SMOTE`.
- Note that since `SMOTE` works on numerical data only, we will first encode the categorical variables in this case.


### Importing Library

In [1]:
import pandas as pd
import numpy as np

### Loading Datasets

In [2]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
targets = pd.read_csv('target.csv')

In [3]:
numerical.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0,60.0,5,9,0,0,39,34,18,10,...,14,5.0,12.0,10.0,4,7.741935,95515,0,4,39
1,1,46.0,6,9,16,0,15,55,11,6,...,1,10.0,25.0,25.0,18,15.666667,148535,0,2,1
2,1,61.611649,3,1,2,0,20,29,33,6,...,14,2.0,16.0,5.0,12,7.481481,15078,1,4,60
3,0,70.0,1,4,2,0,23,14,31,3,...,7,2.0,11.0,10.0,9,6.8125,172556,1,4,41
4,0,78.0,3,2,60,1,28,9,53,26,...,8,3.0,15.0,15.0,14,6.864865,7112,1,2,26


In [4]:
categorical.head()

Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,IL,36,H,F,3,L,E,C,T,2,...,37,12,92,8,94,2,95,12,89,11
1,CA,14,H,M,3,L,G,A,S,1,...,52,2,93,10,95,12,95,12,93,10
2,NC,43,U,M,3,L,E,C,R,2,...,0,2,91,11,92,7,95,12,90,1
3,CA,44,U,F,3,L,E,C,R,2,...,28,1,87,11,94,11,95,12,87,2
4,FL,16,H,F,3,L,F,A,S,2,...,20,1,93,10,96,1,96,1,79,3


In [5]:
targets.head()

Unnamed: 0,TARGET_B,TARGET_D
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0


### One Hot Encoder for catergorical datasets

In [6]:
categorical.shape

(95412, 22)

In [7]:
cat = categorical.columns

x = [j for i in cat for j in categorical[i].unique()[1:]]

In [8]:
### ONE HOT ENCODING for categorical data
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(handle_unknown='error', drop='first').fit(categorical)
encoded = encoder.transform(categorical).toarray()
cat_data = pd.DataFrame(encoded , columns = x)
cat_data.head()

Unnamed: 0,CA,NC,FL,other,IN,MI,MO,TX,WA,WI,...,1,2,3,9,5,4,12,6,7,8
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Concating the above data sets

In [9]:
data = pd.concat([numerical,cat_data, targets], axis = 1)
data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

### Seperating the target variable

In [11]:
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis = 1)

### Smote (over sampling)

In [13]:
from imblearn.over_sampling import SMOTE

# SMOTE only can handle numerical features. Therefore, if we want to use categoricals they 
# will need to be dummyfied. However, for sake of simplicity we will not include the categoricals.

data_new = pd.concat([numerical,targets],axis=1)
smote = SMOTE()
y = data_new['TARGET_B']
X = data_new.drop(['TARGET_B'], axis=1)
X_sm, y_sm = smote.fit_sample(X, y)

unique, counts = np.unique(y_sm, return_counts=True)
print(np.asarray((unique, counts)).T)

[[    0 90569]
 [    1 90569]]


### Random Forest Classifier (using oversampling data)

In [15]:
## importing Library
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [16]:
## Taking the X,Y values from smote
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.2, random_state=100)

#   svc = SVC(random_state=42)    svc.fit(X_train, y_train)
clf = RandomForestClassifier(random_state=56).fit(X_train, y_train.values.ravel())

In [17]:
y_pred = clf.predict(X_test)
print("The accuracy of the logistic_regression model is: %4.2f "% (clf.score(X_test, y_test)))

from sklearn.metrics import cohen_kappa_score
print("The kappa of the logistic regression model is: %4.2f " %(cohen_kappa_score(y_pred,y_test)) )

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, labels=["Yes","No"]))

plot_confusion_matrix(clf, X_test, y_test)  # doctest: +SKIP
plt.show()

The accuracy of the logistic_regression model is: 1.00 
The kappa of the logistic regression model is: 1.00 


  mask &= (ar1 != a)


ValueError: y contains previously unseen labels: [0, 1]