In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 

/kaggle/input/titactoe/tic-tac-toe.csv
/kaggle/input/the-ultimate-halloween-candy-power-ranking/candy-data.csv


## Basic Modeling in scikit-learn

### Regression models

In the candy dataset, the outcome is a continuous variable describing how often the candy was chosen over another candy in a series of 1-on-1 match-ups. To predict this value (the win-percentage), you will use a regression model.

In [3]:
# import data
candies = pd.read_csv('../input/the-ultimate-halloween-candy-power-ranking/candy-data.csv')
candies.head()

Unnamed: 0,competitorname,chocolate,fruity,caramel,peanutyalmondy,nougat,crispedricewafer,hard,bar,pluribus,sugarpercent,pricepercent,winpercent
0,100 Grand,1,0,1,0,0,1,0,1,0,0.732,0.86,66.971725
1,3 Musketeers,1,0,0,0,1,0,0,1,0,0.604,0.511,67.602936
2,One dime,0,0,0,0,0,0,0,0,0,0.011,0.116,32.261086
3,One quarter,0,0,0,0,0,0,0,0,0,0.011,0.511,46.116505
4,Air Heads,0,1,0,0,0,0,0,0,0,0.906,0.511,52.341465


In [4]:
# create training data, labels and split into train and test sets
from sklearn.model_selection import train_test_split

candies_X = candies.drop(['winpercent','competitorname'], axis=1)
candies_y = candies['winpercent']

X_train, X_test, y_train, y_test = train_test_split(candies_X, candies_y, test_size=0.33, random_state=1111)

#### Set parameters and fit a model

In [5]:
from sklearn.ensemble import RandomForestRegressor

# instantiate model and set parameters
rfr = RandomForestRegressor()
rfr.n_estimators = 50
rfr.max_depth = 10
rfr.random_state = 1111

# fit the model
rfr.fit(X_train,y_train)

RandomForestRegressor(max_depth=10, n_estimators=50, random_state=1111)

Updating parameters after the model is initialized is helpful when you need to update parameters later

#### Feature importances
Which variables had the biggest impact? You can check how important each variable was in the model by looping over the feature importance array using enumerate().

In [6]:
# Print how important each column is to the model
for i, item in enumerate(rfr.feature_importances_):
      # Use i and item to print out the feature importance of each column
    print(f"{X_train.columns[i]}: {item.round(2)}")

chocolate: 0.46
fruity: 0.03
caramel: 0.01
peanutyalmondy: 0.04
nougat: 0.0
crispedricewafer: 0.01
hard: 0.01
bar: 0.02
pluribus: 0.02
sugarpercent: 0.21
pricepercent: 0.17


In [7]:
y_pred = rfr.predict(X_test)
y_pred

array([51.7994252 , 74.24199692, 47.17086823, 40.77008747, 40.398765  ,
       63.63481599, 69.686656  , 62.66550588, 39.58195998, 46.59214783,
       54.18810166, 50.53322786, 65.74910948, 68.78238519, 45.73179682,
       34.53633357, 46.04933601, 66.06938137, 46.39510062, 57.22623108,
       38.99535844, 68.78238519, 75.87820292, 32.27741222, 54.05780172,
       60.54649553, 55.1304664 , 55.22942462, 68.26581298])

#### Accuracy metrics: regression models
Communicating modeling results can be difficult. However, most clients understand that on average, a predictive model was off by some number. This makes explaining the mean absolute error easy. For example, when predicting the number of wins for a basketball team, if you predict 42, and they end up with 40, you can easily explain that the error was two wins.

In [8]:
from sklearn.metrics import mean_absolute_error

# Manually calculate the MAE
n = len(y_pred)
mae_one = sum(abs(y_test - y_pred)) / n
print('With a manual calculation, the error is {}'.format(mae_one))

# Use scikit-learn to calculate the MAE
mae_two = mean_absolute_error(y_test, y_pred)
print('Using scikit-learn, the error is {}'.format(mae_two))

With a manual calculation, the error is 10.114301059658457
Using scikit-learn, the error is 10.11430105965846


If you use the MAE, this accuracy metric does not reflect the bad predictions as much as if you use the MSE. Squaring the large errors from bad predictions will make the accuracy look worse.

In [9]:
from sklearn.metrics import mean_squared_error

n = len(y_pred)
# Finish the manual calculation of the MSE
mse_one = sum((y_test - y_pred)**2) / n
print('With a manual calculation, the error is {}'.format(mse_one))

# Use the scikit-learn function to calculate MSE
mse_two = mean_squared_error(y_test, y_pred)
print('Using scikit-learn, the error is {}'.format(mse_two))

With a manual calculation, the error is 152.34455653729066
Using scikit-learn, the error is 152.34455653729066


### Classification models
Methods .predict() and .predict_proba() using the tic_tac_toe dataset. The first method will give a prediction of whether Player One will win the game, and the second method will provide the probability of Player One winning. Use rfc as the random forest classification model.

In [10]:
tic_tac_toe = pd.read_csv('../input/titactoe/tic-tac-toe.csv')
tic_tac_toe.head()

Unnamed: 0,Top-Left,Top-Middle,Top-Right,Middle-Left,Middle-Middle,Middle-Right,Bottom-Left,Bottom-Middle,Bottom-Right,Class
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


In [11]:
tic_tac_toe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Top-Left       958 non-null    object
 1   Top-Middle     958 non-null    object
 2   Top-Right      958 non-null    object
 3   Middle-Left    958 non-null    object
 4   Middle-Middle  958 non-null    object
 5   Middle-Right   958 non-null    object
 6   Bottom-Left    958 non-null    object
 7   Bottom-Middle  958 non-null    object
 8   Bottom-Right   958 non-null    object
 9   Class          958 non-null    object
dtypes: object(10)
memory usage: 75.0+ KB


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# create training data, labels
tic_tac_toe_X = tic_tac_toe.drop('Class', axis=1)
tic_tac_toe_y = tic_tac_toe['Class']

# encode categorical training data and labels
# another method to perform same processing is pd.get_dummies (shown later)
ohe = OneHotEncoder()
le = LabelEncoder()

tic_tac_toe_enc_X = ohe.fit_transform(tic_tac_toe_X)
tic_tac_toe_enc_y = le.fit_transform(tic_tac_toe_y)

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(tic_tac_toe_enc_X, tic_tac_toe_enc_y, test_size=0.2, random_state=1111)

In [13]:
from sklearn.ensemble import RandomForestClassifier

# instantiate model and set parameters
rfc = RandomForestClassifier()
rfc.n_estimators = 50
rfc.max_depth = 10
rfc.random_state = 1111

# fit the model
rfc.fit(X_train,y_train)

RandomForestClassifier(max_depth=10, n_estimators=50, random_state=1111)

#### Classification predictions

In [14]:
# Create arrays of predictions
classification_predictions = rfc.predict(X_test)
probability_predictions = rfc.predict_proba(X_test)

# Print out count of binary predictions
print(pd.Series(classification_predictions).value_counts())

# Print the first value from probability_predictions
print('The first predicted probabilities are: {}'.format(probability_predictions[0]))

1    135
0     57
dtype: int64
The first predicted probabilities are: [0.16383333 0.83616667]


In [15]:
# Create predictions on X_test
predictions = rfc.predict(X_test)
print(predictions[0:5])

# Print model accuracy using score() and the testing data
print(rfc.score(X_test, y_test))

[1 1 1 0 1]
0.984375


#### Reusing model parameters

In [16]:
# Print the classification model
print(rfc)

# Print the classification model's random state parameter
print('The random state is: {}'.format(rfc.random_state))

# Print all parameters
print('Printing the parameters dictionary: {}'.format(rfc.get_params()))

RandomForestClassifier(max_depth=10, n_estimators=50, random_state=1111)
The random state is: 1111
Printing the parameters dictionary: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': 1111, 'verbose': 0, 'warm_start': False}


#### Classification metrics

In [17]:
from sklearn.metrics import confusion_matrix

# Create predictions
test_predictions = rfc.predict(X_test)

# Create and print the confusion matrix
cm = confusion_matrix(y_test, test_predictions)
print(cm)

# Print the true positives (actual 1s that were predicted 1s)
print("The number of true positives is: {}".format(cm[1, 1]))

[[ 57   3]
 [  0 132]]
The number of true positives is: 132


In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
print(f"Accuracy  : {accuracy_score(y_test, test_predictions )}")
print(f"Precision : {precision_score(y_test, test_predictions )}")
print(f"Recall    : {recall_score(y_test, test_predictions )}")

Accuracy  : 0.984375
Precision : 0.9777777777777777
Recall    : 1.0


## Validation Basics

### Creating train, test, and validation datasets

#### Create one holdout set 

In [19]:
from sklearn.model_selection import train_test_split

# Create dummy variables using pandas
tic_tac_toe_X = pd.get_dummies(tic_tac_toe.iloc[:,0:9])
tic_tac_toe_y = tic_tac_toe.iloc[:, 9]

# Create training and testing datasets. Use 10% for the test set
X_train, X_test, y_train, y_test = train_test_split(tic_tac_toe_X, tic_tac_toe_y, test_size=0.1, random_state=1111)

#### Create two holdout sets

In [20]:
# Create temporary training and final testing datasets
X_temp, X_test, y_temp, y_test  = train_test_split(tic_tac_toe_X, tic_tac_toe_y, test_size=0.2, random_state=1111)

# Create the final training and validation datasets
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=1111)

### Accuracy metrics: regression models

In [21]:
# See above regression model metrics

### Classification metrics

In [22]:
# See above classification model metrics

## Bias-Variance trade-off

#### Error due to under/over-fitting
Using too many features (columns) in a random forest model can lead to overfitting. A feature represents which columns of the data are used in a decision tree. The parameter max_features limits the number of features available.

In [23]:
# Refresh data
X_train, X_test, y_train, y_test = train_test_split(candies_X, candies_y, test_size=0.2, random_state=1111)

In [24]:
num_features = [2,4,11]

for num in num_features:
    rfr = RandomForestRegressor(n_estimators=25,random_state=1111,max_features=num)
    rfr.fit(X_train, y_train)
    
    train_mae = mean_absolute_error(y_train, rfr.predict(X_train))
    test_mae = mean_absolute_error(y_test, rfr.predict(X_test))
    
    print(f'max_features={num}')
    print(f'The training error is {train_mae.round(2)}')
    print(f'The testing error is {test_mae.round(2)}\n')

max_features=2
The training error is 3.9
The testing error is 9.15

max_features=4
The training error is 3.6
The testing error is 8.79

max_features=11
The training error is 3.59
The testing error is 10.0



#### Am I underfitting?
The more trees you use, the longer your random forest model will take to run. However, if you don't use enough trees, you risk underfitting

In [25]:
# Refresh data
X_train, X_test, y_train, y_test = train_test_split(tic_tac_toe_enc_X, tic_tac_toe_enc_y, test_size=0.2, random_state=1111)

In [26]:
from sklearn.metrics import accuracy_score

trees = [1, 2, 3, 4, 5, 10, 20, 50]
test_scores, train_scores = [], []

for i in trees:
    rfc = RandomForestClassifier(n_estimators=i, random_state=1111)
    rfc.fit(X_train, y_train)
    
    train_predictions = rfc.predict(X_train)
    test_predictions = rfc.predict(X_test)
    
    train_scores.append(round(accuracy_score(y_train, train_predictions), 2))
    test_scores.append(round(accuracy_score(y_test, test_predictions), 2))
    
# Print the train and test scores.
print(f"The training scores were: {train_scores}")
print(f"The testing scores were: {test_scores}")

The training scores were: [0.94, 0.93, 0.98, 0.97, 0.99, 1.0, 1.0, 1.0]
The testing scores were: [0.83, 0.79, 0.89, 0.91, 0.91, 0.93, 0.97, 0.98]


Notice that with only one tree, both the train and test scores are low. As you add more trees, both errors improve. Even at 50 trees, this still might not be enough. Every time you use more trees, you achieve higher accuracy. At some point though, more trees increase training time, but do not decrease testing error.

## Cross Validation
Holdout sets are a great start to model validation. However, using a single train and test set if often not enough. Cross-validation is considered the gold standard when it comes to validating model performance and is almost always used when tuning model hyper-parameters

### The problems with holdout sets

In [27]:
# Create two different samples of 200 observations 
sample1 = tic_tac_toe.sample(200, random_state=1111)
sample2 = tic_tac_toe.sample(200, random_state=1171)

# Print the number of common observations 
print(len([index for index in sample1.index if index in sample2.index]))

# Print the number of observations in the Class column for both samples 
print(sample1['Class'].value_counts())
print(sample2['Class'].value_counts())

40
positive    134
negative     66
Name: Class, dtype: int64
positive    123
negative     77
Name: Class, dtype: int64


Notice that there are a varying number of positive observations for both sample test sets. Sometimes creating a single test holdout sample is not enough to achieve the high levels of model validation you want. You need to use something more robust.

### Cross Validation

#### KFold

In [28]:
# Refresh data
X_train, X_test, y_train, y_test = train_test_split(candies_X, candies_y, test_size=0.2, random_state=1111)

In [29]:
from sklearn.model_selection import KFold

# Use KFold
kf = KFold(n_splits=5, shuffle=True, random_state=1111)

# Create splits
splits = kf.split(candies_X)

# Print the number of indices
for train_index, val_index in splits:
    print("Number of training indices: %s" % len(train_index))
    print("Number of validation indices: %s" % len(val_index))

Number of training indices: 68
Number of validation indices: 17
Number of training indices: 68
Number of validation indices: 17
Number of training indices: 68
Number of validation indices: 17
Number of training indices: 68
Number of validation indices: 17
Number of training indices: 68
Number of validation indices: 17


KFold() is a great method for accessing individual indices when completing cross-validation. One drawback is needing a for loop to work through the indices though. In the next lesson, you will look at an automated method for cross-validation using sklearn.

#### sklearn's cross_val_score()

In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

rfr = RandomForestRegressor(n_estimators=25, random_state=1111)
mse = make_scorer(mean_squared_error)

# Set up cross_val_score
cv = cross_val_score(estimator=rfr, X=candies_X, y=candies_y, cv=10, scoring=mse)

# Print the mean error
print(cv.mean())

155.4061992697056


You now have a baseline score to build on. If you decide to build additional models or try new techniques, you should try to get an error lower than 155.4. Lower errors indicate that your popularity predictions are improving.

### Leave-one-out-cross-validation (LOOCV)
Using 5-fold cross-validation will train on only 80% of the data at a time. The candy dataset only has 85 rows though, and leaving out 20% of the data could hinder our model. However, using leave-one-out-cross-validation allows us to make the most out of our limited dataset and will give you the best estimate for your favorite candy's popularity!

In [31]:
from sklearn.metrics import mean_absolute_error, make_scorer

# Create scorer
mae_scorer = make_scorer(mean_absolute_error)

rfr = RandomForestRegressor(n_estimators=15, random_state=1111)

# Implement LOOCV
scores = cross_val_score(rfr, X=candies_X, y=candies_y, cv=candies_X.shape[0], scoring=mae_scorer)

# Print the mean and standard deviation
print("The mean of the errors is: %s." % np.mean(scores))
print("The standard deviation of the errors is: %s." % np.std(scores))

The mean of the errors is: 9.52044832324183.
The standard deviation of the errors is: 7.349020637882744.


## Selecting the best model with Hyperparameter tuning

#### Creating Hyperparameters

In [32]:
# Review the parameters of rfr
print(rfr.get_params())

# Maximum Depth
max_depth = [4, 8, 12]

# Minimum samples for a split
min_samples_split = [2, 5, 10]

# Max features 
max_features = [4, 6, 8, 10]

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 15, 'n_jobs': None, 'oob_score': False, 'random_state': 1111, 'verbose': 0, 'warm_start': False}


#### Running a model using ranges

In [34]:
import random

# Fill in rfr using your variables
rfr = RandomForestRegressor(n_estimators=100, max_depth=random.choice(max_depth), min_samples_split=random.choice(min_samples_split), max_features=random.choice(max_features))

# Print out the parameters
print(rfr.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 12, 'max_features': 6, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


### RandomizedSearchCV

#### Preparing for RandomizedSearch

In [35]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error

# Finish the dictionary by adding the max_depth parameter
param_dist = {"max_depth": [2, 4, 6, 8],
              "max_features": [2, 4, 6, 8, 10],
              "min_samples_split": [2, 4, 8, 16]}

# Create a random forest regression model
rfr = RandomForestRegressor(n_estimators=10, random_state=1111)

# Create a scorer to use (use the mean squared error)
scorer = make_scorer(mean_squared_error)

To use RandomizedSearchCV(), you need a distribution dictionary, an estimator, and a scorer—once you've got these, you can run a random search to find the best parameters for your model.

In [44]:
# Import the method for random search
from sklearn.model_selection import RandomizedSearchCV

# Build a random search using param_dist, rfr, and scorer
random_search = RandomizedSearchCV(estimator=rfr, param_distributions=param_dist, n_iter=10, cv=5, scoring=scorer)

random_search.fit(candies_X, candies_y)


print(random_search.cv_results_)

print(random_search.cv_results_['mean_test_score'])

print(random_search.best_score_)

{'mean_fit_time': array([0.01615925, 0.01546402, 0.01534762, 0.0155334 , 0.01499014,
       0.01498542, 0.01497331, 0.01540751, 0.01467724, 0.01463027]), 'std_fit_time': array([1.84429040e-03, 1.68079234e-03, 6.91780250e-04, 8.12662342e-04,
       9.60586532e-05, 3.30557901e-04, 7.16297672e-04, 5.32460515e-04,
       4.65305493e-04, 5.75068013e-04]), 'mean_score_time': array([0.0033257 , 0.00321732, 0.00327063, 0.00338383, 0.00340376,
       0.00360098, 0.00339427, 0.00320673, 0.00320635, 0.0032094 ]), 'std_score_time': array([2.84736193e-04, 6.23535650e-05, 5.37715037e-05, 2.09387574e-04,
       4.14366020e-04, 5.62553447e-04, 3.33057310e-04, 4.14250480e-05,
       5.14362543e-05, 1.75948388e-05]), 'param_min_samples_split': masked_array(data=[2, 4, 8, 8, 8, 4, 16, 4, 4, 2],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object), 'param_max_features': masked_array(data=[10, 4, 8, 6, 