In [2]:
import os.path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import scipy.stats as stats
from sklearn import metrics
warnings.simplefilter('ignore')

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
ds = pd.read_csv("/content/drive/MyDrive/ІТ Смарт-систем/ПР4/variant3_Processed_Version.csv")
# ds = ds.drop(['Name','RescuerID','Description','PetID'], axis=1)

In [5]:
# ds.to_csv("/content/drive/MyDrive/ІТ Смарт-систем/ПР4/variant3_Processed_Version.csv")


### Define feature and target columns

In [6]:
from sklearn.model_selection import train_test_split

X = ds.drop(['AdoptionSpeed','Unnamed: 0'], axis=1)
Y = ds['AdoptionSpeed']

X_train, X_test, y_train, y_test = train_test_split(X,Y, train_size=0.8)

#### Building a Baseline Random Forest Model
Here, we will first start by building a baseline random forest model that will serve as a baseline for comparative purpose with the model using the optimal set of hyperparameters.

In [7]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

In [9]:
rf_classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', max_depth = 20,verbose = 3)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10
              precision    recall  f1-score   support

           0       0.20      0.11      0.14         9
           1       0.29      0.32      0.30        53
           2       0.15      0.23      0.18        52
           3       0.17      0.10      0.12        61
           4       0.39      0.38      0.38        77

    accuracy                           0.26       252
   macro avg       0.24      0.23      0.23       252
weighted avg       0.26      0.26      0.25       252



In [10]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [11]:
print('test set metrics: ', metrics.classification_report(y_test, y_pred))

test set metrics:                precision    recall  f1-score   support

           0       0.20      0.11      0.14         9
           1       0.35      0.25      0.29        53
           2       0.19      0.33      0.24        52
           3       0.34      0.23      0.27        61
           4       0.50      0.51      0.50        77

    accuracy                           0.33       252
   macro avg       0.32      0.28      0.29       252
weighted avg       0.36      0.33      0.34       252



### Show all posibble parameters for this model

In [12]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [13]:
y_train

1150    4
1007    1
703     4
920     4
649     4
       ..
997     0
297     4
1190    4
279     4
915     3
Name: AdoptionSpeed, Length: 1008, dtype: int64

#### Hyperparameter Tuning
Now we will be performing the tuning of hyperparameters of the random forest model. 
n_estimators = number of trees in the foreset
max_features = max number of features considered for splitting a node
max_depth = max number of levels in each decision tree
min_samples_split = min number of data points placed in a node before the node is split
min_samples_leaf = min number of data points allowed in a leaf node
bootstrap = method for sampling data points (with or without replacement)


In [14]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': np.arange(25,55,10),
               'max_features': [0.5, 0.6, 0.8],
               'min_samples_split': [10,15],
               'min_samples_leaf': [3,4],
               'bootstrap': [False]
}

rf = RandomForestClassifier(criterion='entropy')

grid = GridSearchCV(estimator=rf,
                    param_grid=param_grid,
                    scoring='accuracy',
                    cv=5,
                    verbose = 3,
                    return_train_score=True)

grid.fit(X_train, y_train)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=25;, score=(train=0.923, test=0.342) total time=   0.2s
[CV 2/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=25;, score=(train=0.898, test=0.302) total time=   0.2s
[CV 3/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=25;, score=(train=0.892, test=0.302) total time=   0.3s
[CV 4/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=25;, score=(train=0.900, test=0.328) total time=   0.3s
[CV 5/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=25;, score=(train=0.881, test=0.299) total time=   0.3s
[CV 1/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=35;, score=(train=0.921, test=0.322) tot

### Training models with modified hyperparameters

In [34]:
Y_pred1 = model.predict(X_test)
print('Original model f1-score: ', metrics.accuracy_score(y_test, Y_pred1))

better_model = RandomForestClassifier(bootstrap= False, max_features = 0.5, min_samples_leaf= 3, min_samples_split= 15, n_estimators= 45)
better_model.fit(X_train, y_train)
Y_pred2 = better_model.predict(X_test)
print('Better model f1-score: ', metrics.accuracy_score(y_test, Y_pred2))

Original model f1-score:  0.3333333333333333
Better model f1-score:  0.30158730158730157


In [35]:
X_train

Unnamed: 0,Age,Quantity,Fee,State,VideoAmt,PhotoAmt,MaturitySize_1.0,MaturitySize_2.0,MaturitySize_3.0,MaturitySize_4.0,...,Vaccinated_3,Dewormed_1,Dewormed_2,Dewormed_3,Sterilized_1,Sterilized_2,Sterilized_3,Health_1,Health_2,Health_3
1150,0.09375,0.052632,0.000000,41336,0,2.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1007,0.06250,0.000000,0.000000,41401,0,5.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
703,0.06250,0.000000,0.000000,41326,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
920,0.09375,0.052632,0.000000,41401,0,8.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
649,0.03125,0.263158,0.000000,41326,0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,0.06250,0.000000,0.000000,41401,0,2.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
297,0.03125,0.263158,0.000000,41326,0,3.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1190,0.03125,0.263158,0.000000,41330,0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
279,0.06250,0.000000,0.133333,41326,0,2.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [36]:
Y.value_counts()

4    335
2    334
3    269
1    268
0     54
Name: AdoptionSpeed, dtype: int64

# Random Over-sampling

Random over-sampling consists in extracting at random samples from the minority class, until they reach a certain proportion compared to the majority class, typically 50:50, or in other words, a balancing ratio of 1.

In Random over-sampling, we extract samples from the minority class at random, with replacement.

In [37]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(
    sampling_strategy='auto', # samples only the minority class
    random_state=0,  # for reproducibility
)

X_res, Y_res = ros.fit_resample(X, Y)

In [38]:
Y_res.value_counts()

1    335
4    335
3    335
2    335
0    335
Name: AdoptionSpeed, dtype: int64

In [40]:
rf_classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', max_depth = 20,verbose = 3)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10
              precision    recall  f1-score   support

           0       0.17      0.11      0.13         9
           1       0.32      0.28      0.30        53
           2       0.20      0.37      0.26        52
           3       0.35      0.25      0.29        61
           4       0.44      0.35      0.39        77

    accuracy                           0.31       252
   macro avg       0.30      0.27      0.27       252
weighted avg       0.33      0.31      0.31       252

