<a href="https://colab.research.google.com/github/StavroK/MtySaturdayAI2020/blob/master/Gradient_Boosting_Machine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gradient Boosting Machine

References: 

https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/

In [0]:
import pickle
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

First, we load the data:

In [6]:
data = []
train_df=[]
trainbodies_df = []
trainstances_df = []

trainbodies_url = 'https://raw.githubusercontent.com/StavroK/MtySaturdayAI2020/master/train_bodies.csv'
trainbodies_df = pd.read_csv(trainbodies_url)
trainstances_url = 'https://raw.githubusercontent.com/StavroK/MtySaturdayAI2020/master/train_stances.random.csv'
trainstances_df = pd.read_csv(trainstances_url)

trainbodies_df.sort_values(by='Body ID')
trainstances_df.sort_values(by='Body ID')


train_df = pd.merge(trainbodies_df,trainstances_df, on="Body ID")

train_df['articleBody_Parsed_1'] = train_df['articleBody'].str.replace("\r", " ")
train_df['articleBody_Parsed_1'] = train_df['articleBody_Parsed_1'].str.replace("\n", " ")
train_df['articleBody_Parsed_1'] = train_df['articleBody_Parsed_1'].str.replace("    ", " ")
train_df['articleBody_Parsed_1'] = train_df['articleBody_Parsed_1'].str.replace('"', '')
train_df['articleBody_Parsed_2'] = train_df['articleBody_Parsed_1'].str.lower()
punctuation_signs = list("?:!.,;")
train_df['articleBody_Parsed_3'] = train_df['articleBody_Parsed_2']

for punct_sign in punctuation_signs:
    train_df['articleBody_Parsed_3'] = train_df['articleBody_Parsed_3'].str.replace(punct_sign, '')

train_df['articleBody_Parsed_4'] = train_df['articleBody_Parsed_3'].str.replace("'s", "")
nltk.download('punkt')
wordnet_lemmatizer = WordNetLemmatizer()
nrows = len(train_df)
lemmatized_text_list = []

for row in range(0, nrows):
    
    lemmatized_list = []
    text = train_df.loc[row]['articleBody_Parsed_4']
    text_words = text.split(" ")

    for word in text_words:
      lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
    lemmatized_text = " ".join(lemmatized_list)
    lemmatized_text_list.append(lemmatized_text)
train_df['articleBody_Parsed_5'] = lemmatized_text_list
nltk.download('stopwords')
stop_words = list(stopwords.words('english'))
train_df['articleBody_Parsed_6'] = train_df['articleBody_Parsed_5']

for stop_word in stop_words:

    regex_stopword = r"\b" + stop_word + r"\b"
    train_df['articleBody_Parsed_6'] = train_df['articleBody_Parsed_6'].str.replace(regex_stopword, '')
category_codes = {
    'discuss': 2,
    'disagree': 1,
    'agree': 0,
    'unrelated': 3
}
train_df['Content_Parsed'] = train_df['articleBody_Parsed_6']
train_df['Category_Code'] = train_df['Stance']
train_df = train_df.replace({'Category_Code':category_codes})

X_train = train_df['Content_Parsed']
y_train = train_df['Category_Code']



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
data = []
train_df=[]
trainbodies_df = []
trainstances_df = []

testbodies_url = 'https://raw.githubusercontent.com/StavroK/MtySaturdayAI2020/master/test_bodies.csv'
testbodies_df = pd.read_csv(testbodies_url)
teststances_url = 'https://raw.githubusercontent.com/StavroK/MtySaturdayAI2020/master/test_stances_unlabeled.csv'
teststances_df = pd.read_csv(teststances_url)

testbodies_df.sort_values(by='Body ID')
teststances_df.sort_values(by='Body ID')


test_df = pd.merge(testbodies_df,teststances_df, on="Body ID")

test_df['articleBody_Parsed_1'] = test_df['articleBody'].str.replace("\r", " ")
test_df['articleBody_Parsed_1'] = test_df['articleBody_Parsed_1'].str.replace("\n", " ")
test_df['articleBody_Parsed_1'] = test_df['articleBody_Parsed_1'].str.replace("    ", " ")
test_df['articleBody_Parsed_1'] = test_df['articleBody_Parsed_1'].str.replace('"', '')
test_df['articleBody_Parsed_2'] = test_df['articleBody_Parsed_1'].str.lower()
punctuation_signs = list("?:!.,;")
test_df['articleBody_Parsed_3'] = test_df['articleBody_Parsed_2']

for punct_sign in punctuation_signs:
    test_df['articleBody_Parsed_3'] = test_df['articleBody_Parsed_3'].str.replace(punct_sign, '')

test_df['articleBody_Parsed_4'] = test_df['articleBody_Parsed_3'].str.replace("'s", "")
nltk.download('punkt')
wordnet_lemmatizer = WordNetLemmatizer()
nrows = len(train_df)
lemmatized_text_list = []

for row in range(0, nrows):
    
    lemmatized_list = []
    text = test_df.loc[row]['articleBody_Parsed_4']
    text_words = text.split(" ")

    for word in text_words:
      lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
    lemmatized_text = " ".join(lemmatized_list)
    lemmatized_text_list.append(lemmatized_text)
test_df['articleBody_Parsed_5'] = lemmatized_text_list
nltk.download('stopwords')
stop_words = list(stopwords.words('english'))
test_df['articleBody_Parsed_6'] = test_df['articleBody_Parsed_5']

for stop_word in stop_words:

    regex_stopword = r"\b" + stop_word + r"\b"
    test_df['articleBody_Parsed_6'] = test_df['articleBody_Parsed_6'].str.replace(regex_stopword, '')
category_codes = {
    'discuss': 2,
    'disagree': 1,
    'agree': 0,
    'unrelated': 3
}
test_df['Content_Parsed'] = test_df['articleBody_Parsed_6']
test_df['Category_Code'] = test_df['Stance']
test_df = train_df.replace({'Category_Code':category_codes})

X_test = train_df['Content_Parsed']
y_test = train_df['Category_Code']

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


ValueError: ignored

In [0]:
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 300

tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
print(features_train.shape)

features_test = tfidf.transform(X_test).toarray()
labels_test = y_test
print(features_test.shape)

import numpy as np

for Product, category_id in sorted(category_codes.items()):
    features_chi2 = chi2(features_train, labels_train == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}' category:".format(Product))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-5:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-2:])))
    print("")

    

In [0]:
# X_train
X_train.to_csv("/content/Standarized/X_train.csv")
    
# X_test    
X_test.to_csv("/content/Standarized/X_test.csv")
    
# y_train
y_train.to_csv("/content/Standarized/y_train.csv")
    
# y_test
y_test.to_csv("/content/Standarized/y_test.csv")
    
# train_df
train_df.to_csv("/content/Standarized/train_df.csv")
    
# features_train
np.savetxt('/content/Standarized/features_train.csv',features_train,delimiter=',')

# labels_train
np.savetxt('/content/Standarized/labels_train.csv',labels_train,delimiter=',')

# features_test
np.savetxt('/content/Standarized/features_test.csv',features_train,delimiter=',')

# labels_test
np.savetxt('/content/Standarized/labels_test.csv',labels_test,delimiter=',')

Let's check the dimension of our feature vectors:

In [0]:
print(features_train.shape)
print(features_test.shape)

## Cross-Validation for Hyperparameter tuning

First, we can see what hyperparameters the model has:

In [0]:
gb_0 = GradientBoostingClassifier(random_state = 8)

print('Parameters currently in use:\n')
pprint(gb_0.get_params())

We'll tune the following ones:

Tree-related hyperparameters:
* `n_estimators` = number of trees in the forest.
* `max_features` = max number of features considered for splitting a node
* `max_depth` = max number of levels in each decision tree
* `min_samples_split` = min number of data points placed in a node before the node is split
* `min_samples_leaf` = min number of data points allowed in a leaf node

Boosting-related hyperparameters:
* `learning_rate`= learning rate shrinks the contribution of each tree by learning_rate.
* `subsample`= the fraction of samples to be used for fitting the individual base learners.

### Randomized Search Cross Validation

We first need to define the grid. Since we have a huge amount of hyperparameters, we'll try few values for each one.

In [0]:
# n_estimators
n_estimators = [200, 800]

# max_features
max_features = ['auto', 'sqrt']

# max_depth
max_depth = [10, 40]
max_depth.append(None)

# min_samples_split
min_samples_split = [10, 30, 50]

# min_samples_leaf
min_samples_leaf = [1, 2, 4]

# learning rate
learning_rate = [.1, .5]

# subsample
subsample = [.5, 1.]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'learning_rate': learning_rate,
               'subsample': subsample}

pprint(random_grid)

Then, we'll perform the Random Search:

In [0]:
# First create the base model to tune
gbc = GradientBoostingClassifier(random_state=8)

# Definition of the random search
random_search = RandomizedSearchCV(estimator=gbc,
                                   param_distributions=random_grid,
                                   n_iter=50,
                                   scoring='accuracy',
                                   cv=3, 
                                   verbose=1, 
                                   random_state=8)

# Fit the random search model
random_search.fit(features_train, labels_train)

We can see the best hyperparameters resulting from the Random Search:

In [0]:
print("The best hyperparameters from Random Search are:")
print(random_search.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(random_search.best_score_)

After that, we can do a more exhaustive search centered in those values:

### Grid Search Cross Validation

In [0]:
# Create the parameter grid based on the results of random search 
max_depth = [5, 10, 15]
max_features = ['sqrt']
min_samples_leaf = [2]
min_samples_split = [50, 100]
n_estimators = [800]
learning_rate = [.1, .5]
subsample = [1.]

param_grid = {
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'n_estimators': n_estimators,
    'learning_rate': learning_rate,
    'subsample': subsample

}

# Create a base model
gbc = GradientBoostingClassifier(random_state=8)

# Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
cv_sets = ShuffleSplit(n_splits = 3, test_size = .33, random_state = 8)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=gbc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv_sets,
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(features_train, labels_train)

The best hyperparameters turn out to be:

In [0]:
print("The best hyperparameters from Grid Search are:")
print(grid_search.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(grid_search.best_score_)

Let's save the model in `best_gbc`:

In [0]:
best_gbc = grid_search.best_estimator_

In [0]:
best_gbc

We now know the best gradient boosting model. Let's fit it and see how it performs:

## Model fit and performance

Now, we can fit the model to our training data:

In [0]:
best_gbc.fit(features_train, labels_train)

And get the predictions:

In [0]:
gbc_pred = best_gbc.predict(features_test)

The conditional class probabilities can be obtained by typing:

`gbc_pred = best_gbc.predict_proba(features_test)`

For performance analysis, we will use the confusion matrix, the classification report and the accuracy on both training and test data:

#### Training accuracy

In [0]:
# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(labels_train, best_gbc.predict(features_train)))

#### Test accuracy

In [0]:
# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(labels_test, gbc_pred))

#### Classification report

In [0]:
# Classification report
print("Classification report")
print(classification_report(labels_test,gbc_pred))

#### Confusion matrix

In [0]:
aux_df = df[['Category', 'Category_Code']].drop_duplicates().sort_values('Category_Code')
conf_matrix = confusion_matrix(labels_test, gbc_pred)
plt.figure(figsize=(12.8,6))
sns.heatmap(conf_matrix, 
            annot=True,
            xticklabels=aux_df['Category'].values, 
            yticklabels=aux_df['Category'].values,
            cmap="Blues")
plt.ylabel('Predicted')
plt.xlabel('Actual')
plt.title('Confusion matrix')
plt.show()

At this point, we could get the average time the model takes to get predictions. We want the algorithm to be fast since we are creating an app which will gather data from the internet and get the predicted categories. However, since the difference when predicting 10-20 observations will be very little, we won't take this into account.

However, the code below could do this task:

```python
features_time = features_train
elapsed_list = []
for i in range(0,10):
    
    start = time.time()
    predictions = best_lrc.predict(features_time)
    end = time.time()
    elapsed = end - start
    elapsed_list.append(elapsed)

mean_time_elapsed = np.mean(elapsed_list)
```

Let's see if the hyperparameter tuning process has returned a better model:

In [0]:
base_model = GradientBoostingClassifier(random_state = 8)
base_model.fit(features_train, labels_train)
accuracy_score(labels_test, base_model.predict(features_test))

In [0]:
best_gbc.fit(features_train, labels_train)
accuracy_score(labels_test, best_gbc.predict(features_test))

We'll create a dataset with a model summary to compare models:

In [0]:
d = {
     'Model': 'Gradient Boosting',
     'Training Set Accuracy': accuracy_score(labels_train, best_gbc.predict(features_train)),
     'Test Set Accuracy': accuracy_score(labels_test, gbc_pred)
}

df_models_gbc = pd.DataFrame(d, index=[0])

In [0]:
df_models_gbc

Let's save the model and this dataset:

In [0]:
with open('Models/best_gbc.pickle', 'wb') as output:
    pickle.dump(best_gbc, output)
    
with open('Models/df_models_gbc.pickle', 'wb') as output:
    pickle.dump(df_models_gbc, output)