#  Random Forest

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Loading Data
data = pd.read_csv("TrainData.csv")
data = data.drop(["Unnamed: 0"],axis=1)

X = data.drop(["salary_label"],axis=1)
y = data["salary_label"]

# Training and validating using GridsearchCV

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import accuracy_score, mean_squared_error


descision_tree = Pipeline([('scaler',  MinMaxScaler()),
            ('forest', RandomForestClassifier())])



parameters = {

    'forest__max_leaf_nodes': [x for x in [400,600,800,1000,1200,1400]],
    'forest__max_depth': [x for x in [30]],
    'forest__min_samples_split' : [2],
    'forest__min_samples_leaf' : [10],
    'forest__n_estimators' : [10,20,40,60,80,100,150,200]
}

grid_search = GridSearchCV(descision_tree, param_grid = parameters, scoring = 'accuracy', cv=5, return_train_score = True)
grid_search.fit(X, y)

print(grid_search.best_params_)

{'forest__max_depth': 30, 'forest__max_leaf_nodes': 800, 'forest__min_samples_leaf': 10, 'forest__min_samples_split': 2, 'forest__n_estimators': 100}


# General performance on train/validation dataset

In [6]:
print("=========Train Acurracy=========")
print(np.mean(grid_search.cv_results_["split0_train_score"]))
print(np.mean(grid_search.cv_results_["split1_train_score"]))
print(np.mean(grid_search.cv_results_["split2_train_score"]))
print(np.mean(grid_search.cv_results_["split3_train_score"]))
print("mean:")
print(np.mean(
    [
        np.mean(grid_search.cv_results_["split0_train_score"]),
        np.mean(grid_search.cv_results_["split1_train_score"]),
        np.mean(grid_search.cv_results_["split2_train_score"]),
        np.mean(grid_search.cv_results_["split3_train_score"])
    ]
))


print("=========Test Acurracy=========")
print(np.mean(grid_search.cv_results_["split0_test_score"]))
print(np.mean(grid_search.cv_results_["split1_test_score"]))
print(np.mean(grid_search.cv_results_["split2_test_score"]))
print(np.mean(grid_search.cv_results_["split3_test_score"]))
print("mean:")
print(np.mean(
    [
        np.mean(grid_search.cv_results_["split0_test_score"]),
        np.mean(grid_search.cv_results_["split1_test_score"]),
        np.mean(grid_search.cv_results_["split2_test_score"]),
        np.mean(grid_search.cv_results_["split3_test_score"])
    ]
))

0.48687547772913625
0.48044784935028834
0.4845042040163991
0.4885554513563266
mean:
0.4850957456130376
0.3842013888888889
0.41878472222222224
0.42548611111111106
0.41451119454874147
mean:
0.4107458541927409


# Final test on test data set

In [5]:
from sklearn.metrics import accuracy_score
df = pd.read_csv("TestData.csv")

X_train = X
y_train = y

X_test = df.drop(["salary_label","Unnamed: 0"],axis=1)
y_test = df["salary_label"]


scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

params = {'max_depth': 30, 'max_leaf_nodes': 800, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 100}

model = RandomForestClassifier(**params)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

test_acc = accuracy_score(y_test,y_pred)

print("=======Accuracy on test data set=======")
print(test_acc)

0.438
