# Gradient Boosting
- Ensemble learning method that takes an iterative appraoch to combining weak learners to create a strong learner by focusing on mistakes of prior iterations 

In [1]:
import nltk
import string 
import re 
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
pd.set_option('display.max_colwidth', 100)

In [2]:
import os
cwd = os.getcwd()
print(cwd)

C:\Users\sbhati\OneDrive - George Weston Limited-6469347-MTCAD\sbhati\Documents\Personal\LinkedIn_NLP\Final_Project


In [3]:
path = r"C:\Users\sbhati\OneDrive - George Weston Limited-6469347-MTCAD\sbhati\Documents\Personal\LinkedIn_NLP"
data = pd.read_csv(path+'\SMSSpamCollection.tsv', sep='\t', )
data.columns = ['label', 'body_text']
data.head()

Unnamed: 0,label,body_text
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
1,ham,"Nah I don't think he goes to usf, he lives around here though"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...


## Clean Data

In [4]:
# Import the SnowballStemmer, get list of punctuations
ss = nltk.SnowballStemmer('english')
punct = string.punctuation

In [5]:
# Custom Functions 
def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = ''.join([word.lower() for word in text if word not in string.punctuation]) # Remove punctuation
    tokens = re.split('\W+',text) # Tokenize: Split on any character that is not alphanumeric
    text = [ss.stem(word) for word in tokens if word not in stopwords] # Remove stopwords & stem
    #text = [ss.stem(word) for word in tokenized_text] # Stemming
    
    return text

### Apply TF-IDF Vectorizer

In [6]:
idf_vect = TfidfVectorizer(analyzer=clean_text)
idf_arr = idf_vect.fit_transform(data['body_text'])
idf_df = pd.DataFrame(idf_arr.toarray())
idf_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8092,8093,8094,8095,8096,8097,8098,8099,8100,8101
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature Engineering

In [7]:
def punct_count(text): # Count the percent of text that are punctuation
    return round((sum([1 for char in text if char in punct])/(len(text) - text.count(" ")))*100,3)

In [8]:
data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct_percent'] = data['body_text'].apply(lambda x: punct_count(x))
data.head()

Unnamed: 0,label,body_text,body_len,punct_percent
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,128,4.688
1,ham,"Nah I don't think he goes to usf, he lives around here though",49,4.082
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,62,3.226
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.143
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,135,4.444


In [9]:
idf_df = pd.concat([data['body_len'], data['punct_percent'], idf_df], axis=1)
idf_df.head()

Unnamed: 0,body_len,punct_percent,0,1,2,3,4,5,6,7,...,8092,8093,8094,8095,8096,8097,8098,8099,8100,8101
0,128,4.688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49,4.082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,3.226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28,7.143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,4.444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Gradient Boosting

In [10]:
print(dir(GradientBoostingClassifier))
print()
print(GradientBoostingClassifier().get_params())

['_SUPPORTED_LOSS', '__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', '_check_initialized', '_check_n_features', '_check_params', '_clear_state', '_compute_partial_dependence_recursion', '_estimator_type', '_fit_stage', '_fit_stages', '_get_param_names', '_get_tags', '_init_state', '_is_initialized', '_make_estimator', '_more_tags', '_raw_predict', '_raw_predict_init', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_resize_state', '_staged_raw_predict', '_validate_data',

### Manual GridSearch

In [11]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [12]:
x_train, x_test, y_train, y_test = train_test_split(idf_df, data['label'], test_size=0.2, random_state=2)

In [43]:
def train_GB(est, max_depth, lr):
    gb = GradientBoostingClassifier(n_estimators=est, max_depth=max_depth, learning_rate=lr)
    gb_model = gb.fit(x_train, y_train)
    y_pred = gb_model.predict(x_test)
    prec, rec, fsc, supp = score(y_test, y_pred, pos_label='spam', average = 'binary')
    print(f'Est:{est} / Depth:{max_depth} / LR: {lr} ---- Precision: {prec} / Recall: {rec} / Accuracy: {round((y_pred==y_test).sum()/len(y_pred),3)}')

In [44]:
for n_est in [50, 100, 150]:
    for max_depth in [3, 7, 11, 15]:
        for lr in [0.01, 0.1, 1]:
            train_GB(n_est, max_depth, lr)

  _warn_prf(average, modifier, msg_start, len(result))


Est:50 / Depth:3 / LR: 0.01 ---- Precision: 0.0 / Recall: 0.0 / Accuracy: 0.859
Est:50 / Depth:3 / LR: 0.1 ---- Precision: 0.9411764705882353 / Recall: 0.7133757961783439 / Accuracy: 0.953
Est:50 / Depth:3 / LR: 1 ---- Precision: 0.8561151079136691 / Recall: 0.7579617834394905 / Accuracy: 0.948


  _warn_prf(average, modifier, msg_start, len(result))


Est:50 / Depth:7 / LR: 0.01 ---- Precision: 0.0 / Recall: 0.0 / Accuracy: 0.859
Est:50 / Depth:7 / LR: 0.1 ---- Precision: 0.9618320610687023 / Recall: 0.802547770700637 / Accuracy: 0.968
Est:50 / Depth:7 / LR: 1 ---- Precision: 0.8857142857142857 / Recall: 0.7898089171974523 / Accuracy: 0.956


  _warn_prf(average, modifier, msg_start, len(result))


Est:50 / Depth:11 / LR: 0.01 ---- Precision: 0.0 / Recall: 0.0 / Accuracy: 0.859
Est:50 / Depth:11 / LR: 0.1 ---- Precision: 0.9280575539568345 / Recall: 0.821656050955414 / Accuracy: 0.966
Est:50 / Depth:11 / LR: 1 ---- Precision: 0.8865248226950354 / Recall: 0.7961783439490446 / Accuracy: 0.957
Est:50 / Depth:15 / LR: 0.01 ---- Precision: 1.0 / Recall: 0.006369426751592357 / Accuracy: 0.86
Est:50 / Depth:15 / LR: 0.1 ---- Precision: 0.9285714285714286 / Recall: 0.8280254777070064 / Accuracy: 0.967
Est:50 / Depth:15 / LR: 1 ---- Precision: 0.9285714285714286 / Recall: 0.8280254777070064 / Accuracy: 0.967
Est:100 / Depth:3 / LR: 0.01 ---- Precision: 0.9534883720930233 / Recall: 0.5222929936305732 / Accuracy: 0.929
Est:100 / Depth:3 / LR: 0.1 ---- Precision: 0.9365079365079365 / Recall: 0.7515923566878981 / Accuracy: 0.958
Est:100 / Depth:3 / LR: 1 ---- Precision: 0.8714285714285714 / Recall: 0.7770700636942676 / Accuracy: 0.952
Est:100 / Depth:7 / LR: 0.01 ---- Precision: 0.95098039215

**Grid Search Results**

After using our manual Grid Search to test some parameters, we noticed that some paramters generally perform better 
- n_estimators: best performers were 100 & 150
- max_depth: best estimators were 7, 11, 15
- learning rate: best lr was 0.1

Using this information, we will run another GridSearch on those parameters to pick out the best of the best parameters

## GridSearchCV
- Use the best performing paramters from previous evaluation to run another GridSearch

In [13]:
from sklearn.model_selection import GridSearchCV

In [15]:
gb = GradientBoostingClassifier()

params = {'n_estimators':[100,150],
         'max_depth':[7, 11, 15]} 
# learning rate is already at 0.1 by default 

gs = GridSearchCV(gb, params, cv=5, n_jobs=-1)
gs_fit = gs.fit(idf_df, data['label'])

In [16]:
gb_performance = pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)

In [17]:
gb_performance.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,601.056833,1.149958,0.543868,0.026416,7,150,"{'max_depth': 7, 'n_estimators': 150}",0.970377,0.979354,0.968553,0.967655,0.967655,0.970719,0.00443,1
3,916.367813,19.772734,0.647842,0.064418,11,150,"{'max_depth': 11, 'n_estimators': 150}",0.968582,0.975763,0.969452,0.968553,0.964061,0.969282,0.00375,2
5,979.433605,13.347298,0.392164,0.101324,15,150,"{'max_depth': 15, 'n_estimators': 150}",0.968582,0.975763,0.969452,0.967655,0.96496,0.969282,0.003574,3
0,398.593339,2.002012,0.912832,0.726069,7,100,"{'max_depth': 7, 'n_estimators': 100}",0.965889,0.974865,0.97035,0.967655,0.96496,0.968744,0.00357,4
4,744.744977,26.687322,0.652401,0.119684,15,100,"{'max_depth': 15, 'n_estimators': 100}",0.966786,0.973968,0.969452,0.96496,0.966757,0.968384,0.003138,5


In [18]:
gs_fit.best_score_

0.9707188264689703

In [19]:
gs_fit.best_estimator_

GradientBoostingClassifier(max_depth=7, n_estimators=150)

In [20]:
gs_fit.best_estimator_.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 7,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 150,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

### Comments 
The functions above help us see the hyperparamters that produce the best score for the model 

Key Notes:
- The 'score' can be changed based on the model being tested and purpose (precision, recall, etc.) 
- The it is also important to take the **time** it took to fit the model into account 