# Lab | Random Forests

* Apply the Random Forests algorithm but this time only by upscaling the data using SMOTE.
* Note that since SMOTE works on numerical data only, we will first encode the categorical variables in this case.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from imblearn.over_sampling import SMOTE



In [2]:
numerical = pd.read_csv("files_for_lab/numerical.csv")
categorical = pd.read_csv("files_for_lab/categorical.csv")
target = pd.read_csv("files_for_lab/target.csv")

In [3]:
numerical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95412 entries, 0 to 95411
Columns: 315 entries, TCODE to CLUSTER2
dtypes: float64(9), int64(306)
memory usage: 229.3 MB


In [4]:
categorical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95412 entries, 0 to 95411
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   STATE         95412 non-null  object
 1   CLUSTER       95412 non-null  int64 
 2   HOMEOWNR      95412 non-null  object
 3   GENDER        95412 non-null  object
 4   DATASRCE      95412 non-null  int64 
 5   RFA_2R        95412 non-null  object
 6   RFA_2A        95412 non-null  object
 7   GEOCODE2      95412 non-null  object
 8   DOMAIN_A      95412 non-null  object
 9   DOMAIN_B      95412 non-null  int64 
 10  ODATEW_YR     95412 non-null  int64 
 11  ODATEW_MM     95412 non-null  int64 
 12  DOB_YR        95412 non-null  int64 
 13  DOB_MM        95412 non-null  int64 
 14  MINRDATE_YR   95412 non-null  int64 
 15  MINRDATE_MM   95412 non-null  int64 
 16  MAXRDATE_YR   95412 non-null  int64 
 17  MAXRDATE_MM   95412 non-null  int64 
 18  LASTDATE_YR   95412 non-null  int64 
 19  LAST

In [5]:
target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95412 entries, 0 to 95411
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   TARGET_B  95412 non-null  int64  
 1   TARGET_D  95412 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.5 MB


In [6]:
obj_df = categorical.select_dtypes(include=['object']).copy()

In [7]:
for col in obj_df:
    freq = obj_df.groupby(col).size() / len(obj_df)
    obj_df[col] = obj_df[col].apply(lambda x : freq[x])

In [8]:
for col in obj_df:
    categorical[col] = obj_df[col]

In [9]:
df = pd.concat([numerical, categorical], axis=1)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95412 entries, 0 to 95411
Columns: 337 entries, TCODE to FIRSTDATE_MM
dtypes: float64(16), int64(321)
memory usage: 245.3 MB


## Classification

In [13]:
target_B = target.iloc[:,0]

In [11]:
sc = StandardScaler()

In [12]:
df = sc.fit_transform(df)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df, target_B, random_state=42, test_size=0.30)

In [15]:
sm = SMOTE(k_neighbors = 3, random_state = 42)

X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)

In [16]:
clf = RandomForestClassifier(random_state=0)

In [17]:
clf.fit(X_train_SMOTE, y_train_SMOTE)

In [18]:
print("train:", clf.score(X_train_SMOTE, y_train_SMOTE))
print("test:", clf.score(X_test, y_test))

train: 1.0
test: 0.9499021799888205


Really good accuracy for the classification with SMOTE.

## Regression

In [19]:
target_D = target.iloc[:,1]

In [20]:
reg = RandomForestRegressor(random_state=0, max_depth=4)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(df, target_D, random_state=42, test_size=0.30)

In [22]:
reg.fit(X_train, y_train)

In [23]:
print("train:", reg.score(X_train, y_train))
print("test:", reg.score(X_test, y_test))

train: 0.04875613649783517
test: 0.004507287953963046


In [28]:
target_D.value_counts()

0.00     90569
10.00      941
15.00      591
20.00      577
5.00       503
         ...  
18.25        1
10.70        1
2.50         1
16.87        1
44.21        1
Name: TARGET_D, Length: 71, dtype: int64

The regression scores are very low, assuming because of the huge amount of 0s.