<a href="https://colab.research.google.com/github/VienneseWaltz/ExploringNLP/blob/main/Machine_Learning_Classifiers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Read in and clean text**

In [None]:

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string





[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# `stopwords` rebinds the name of the module imported above to a plain list of
# English stop words; going through nltk.corpus keeps this cell re-runnable.
stopwords = nltk.corpus.stopwords.words('english')
ps = PorterStemmer()

# NOTE(review): read_csv consumes the file's first line as a header before the
# columns are renamed below. If the TSV has no header row, the first message is
# silently dropped -- confirm against the file.
column_names = ['label', 'body_text']
data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = column_names
data.head()


Unnamed: 0,label,body_text
0,spam,Free entry in 2 a wkly comp to win FA Cup fina...
1,ham,"Nah I don't think he goes to usf, he lives aro..."
2,ham,Even my brother is not like to speak with me. ...
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnamin...


In [None]:
# Disabled first draft, kept only as an inert string literal. It compares each
# character to the whole punctuation string with `is`, which never matches, so
# the count is always 0; the live definition below uses `in` instead.
'''
def count_punct(text):
  count = sum([1 for char in text if char is string.punctuation])
  return round(count/(len(text) - text.count(" " )), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))
'''

def count_punct(text):
  """Return the percentage of non-space characters in `text` that are punctuation.

  text -- the message as a string.

  The ratio is rounded to 3 decimals before being scaled by 100. A message with
  no non-space characters (empty or all spaces) yields 0.0 instead of raising
  ZeroDivisionError.
  """
  non_space = len(text) - text.count(" ")
  if non_space == 0:
    # Nothing to measure; the unguarded division below would fail.
    return 0.0
  count = sum(1 for char in text if char in string.punctuation)
  return round(count / non_space, 3) * 100  # multiplying by 100 gives a percentage

# body_len: message length with spaces excluded.
data['body_len'] = [len(message) - message.count(" ") for message in data['body_text']]
# punct%: share of those characters that are punctuation, as computed by count_punct.
data['punct%'] = [count_punct(message) for message in data['body_text']]
data.head()


def clean_text(text):
  """Turn one message into a list of stemmed tokens with stop words removed.

  text -- the raw message string.

  Steps: drop punctuation characters, lower-case the rest so that words such as
  "The" can match the stop-word list (NLTK's English list is lower-case), split
  on runs of non-word characters, discard empty tokens produced by leading or
  trailing separators, and Porter-stem what remains.

  Uses the notebook-level `ps` (stemmer) and `stopwords` (list of words).
  Returns a list of strings, the shape expected by the vectorizers' `analyzer=`.
  """
  no_punct = "".join(char.lower() for char in text if char not in string.punctuation)
  # Raw string: '\W' in a normal string literal is an invalid escape sequence.
  tokens = re.split(r'\W+', no_punct)
  return [ps.stem(word) for word in tokens if word and word not in stopwords]





In [None]:

# TF-IDF

'''
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
X_features = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
X_features.head()
'''

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Learn the TF-IDF vocabulary from every message, tokenizing with clean_text.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

# Hand-built features first, followed by one integer-labelled column per TF-IDF term.
X_features = pd.concat(
    [data.loc[:, ['body_len', 'punct%']], pd.DataFrame(X_tfidf.toarray())],
    axis='columns',
)
X_features.head()

# To ensure that all feature names are of the same data type, either
# str or int. Here, I am converting all feature names to strings.

# X_features.columns = X_features.columns.astype(str)

In [None]:
# CountVectorizer
# Raw term counts instead of TF-IDF weights, tokenized with the same clean_text.
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['body_text'])

# Same layout as the TF-IDF frame: body_len, punct%, then one column per term.
X_count_feat = pd.concat(
    [data[['body_len', 'punct%']], pd.DataFrame(X_count.toarray())],
    axis='columns',
)
X_count_feat.head()

# Here, we are testing to see which of these vectorizing frameworks works better

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8178,8179,8180,8181,8182,8183,8184,8185,8186,8187
0,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,62,3.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,28,7.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,135,4.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Explore RandomForestClassifier Attributes & Hyperparameters**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# First line: every attribute name on the class. Second line: the repr of a
# default-constructed estimator.
print(dir(RandomForestClassifier), RandomForestClassifier(), sep='\n')

['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_feature_names', '_check_n_features', '_compute_oob_predictions', '_estimator_type', '_get_oob_predictions', '_get_param_names', '_get_tags', '_make_estimator', '_more_tags', '_parameter_constraints', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_set_oob_score_and_attributes', '_validate_X_predict', '_validate_data', '_validate_estimator', '_validate_params', '_validate_y_class_weight', 'apply', 'base_estimator_', 'decision_path', 'feature_importances_', 'fit', 'get_params', 'predic

**RandomForestClassifier through Cross-Validation**

In [None]:
from sklearn.model_selection import KFold, cross_val_score


# To ensure that all feature names are of the same data type, either
# str or int. Here, I am converting all feature names to strings.
X_features.columns = X_features.columns.astype(str)

# random_state pins the forest's randomness so the fold scores can be reproduced
# after a kernel restart; n_jobs=-1 lets the trees be built in parallel.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
k_fold = KFold(n_splits=5)  # 5 consecutive, unshuffled folds
# One accuracy value per fold.
cross_val_score(rf, X_features, data['label'], cv=k_fold, scoring='accuracy', n_jobs=-1)

array([0.97486535, 0.97845601, 0.97663971, 0.96226415, 0.97574124])

In [None]:
# The scores change slightly between runs: on the first run the model correctly
# classified about 97.3% of the samples, and on the second run about 97.8%.

**Explore RandomForestClassifier through Holdout Set**

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. A fixed random_state makes the split,
# and so the metrics computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, data['label'], test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)

# Pair every importance with its column name and sort descending (sorted() is
# ascending by default, hence reverse=True); show the ten largest.
ranked_importances = sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)
ranked_importances[:10]


[(0.05730679096410286, 'body_len'),
 (0.04118788480103458, '7419'),
 (0.0404850078298777, '1818'),
 (0.025916268830160582, '3158'),
 (0.0223755555211939, '2047'),
 (0.02139248365997517, '4835'),
 (0.019483763461564357, '5776'),
 (0.019402090395541534, '7532'),
 (0.01845844653177266, '1371'),
 (0.017389663113463648, '2315')]

In [None]:
# Notice the actual words don't become the column names, they are assigned a number such as '1818', '3158', '7419' etc.

# Let's now jump into the predicting phase.
# Predict the held-out rows, then compute the metrics with 'spam' as the positive class.
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(
    y_true=y_test, y_pred=y_pred, pos_label='spam', average='binary')

In [None]:
# Accuracy is the fraction of held-out rows whose predicted label equals the true label.
accuracy = (y_pred == y_test).sum() / len(y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))

Precision: 1.0 / Recall: 0.59 / Accuracy: 0.949


In [None]:
# In an earlier run, precision was 97.6%, recall 62.1% and accuracy 95.3% (the figures change from run to run, so the output above differs).
# Precision: 97.6% of the messages flagged as spam really were spam. Recall: 62.1% of the spam that came into your email was correctly
# placed in the spam folder, which means that 37.9% of spam went into your inbox. Accuracy: 95.3% of the emails that came in were
# correctly identified as spam or ham.

**Random Forest with Grid Search**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

# Fresh 80/20 split for the manual grid search. A fixed random_state keeps the
# split identical between runs so the parameter combinations stay comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, data['label'], test_size=0.2, random_state=42)

In [None]:
def train_RF(n_est, depth):
  """Fit a random forest on the notebook's current train split and print test metrics.

  n_est -- number of trees, passed as n_estimators.
  depth -- passed as max_depth; None is forwarded unchanged.

  Reads X_train, y_train, X_test and y_test from the notebook namespace. Prints
  precision and recall for the 'spam' label plus overall accuracy, each rounded
  to 3 decimals. Returns nothing.
  """
  forest = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
  predictions = forest.fit(X_train, y_train).predict(X_test)
  precision, recall, _, _ = score(y_test, predictions, pos_label='spam', average='binary')
  accuracy = (predictions == y_test).sum() / len(predictions)
  print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
      n_est, depth, round(precision, 3), round(recall, 3), round(accuracy, 3)))

In [None]:
# Try every (number of trees, depth limit) pair; train_RF prints one line per pair.
estimator_counts = [10, 50, 100]
depth_limits = [10, 20, 30, None]
for n_est in estimator_counts:
  for depth in depth_limits:
    train_RF(n_est, depth)

Est: 10 / Depth: 10 ---- Precision: 1.0 / Recall: 0.185 / Accuracy: 0.89
Est: 10 / Depth: 20 ---- Precision: 0.989 / Recall: 0.596 / Accuracy: 0.944
Est: 10 / Depth: 30 ---- Precision: 0.991 / Recall: 0.722 / Accuracy: 0.961
Est: 10 / Depth: None ---- Precision: 1.0 / Recall: 0.808 / Accuracy: 0.974
Est: 50 / Depth: 10 ---- Precision: 1.0 / Recall: 0.305 / Accuracy: 0.906
Est: 50 / Depth: 20 ---- Precision: 1.0 / Recall: 0.57 / Accuracy: 0.942
Est: 50 / Depth: 30 ---- Precision: 1.0 / Recall: 0.702 / Accuracy: 0.96
Est: 50 / Depth: None ---- Precision: 1.0 / Recall: 0.828 / Accuracy: 0.977
Est: 100 / Depth: 10 ---- Precision: 1.0 / Recall: 0.311 / Accuracy: 0.907
Est: 100 / Depth: 20 ---- Precision: 1.0 / Recall: 0.629 / Accuracy: 0.95
Est: 100 / Depth: 30 ---- Precision: 1.0 / Recall: 0.702 / Accuracy: 0.96
Est: 100 / Depth: None ---- Precision: 1.0 / Recall: 0.828 / Accuracy: 0.977


In [None]:
# Certain parameters impact the aggressiveness of the model. As the depth increases from 10 to 20 to 30, and eventually to None (no limit),
# the recall increases quite drastically, while the precision doesn't drop. So the model gets much better and more aggressive as the depth
# increases.

**Evaluate Random Forest with GridSearchCV**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Keep every feature name the same type (str) before fitting.
X_features.columns = X_features.columns.astype(str)

rf = RandomForestClassifier()
param = {
    'n_estimators': [10, 150, 300],
    'max_depth': [30, 60, 90, None],
}
gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_features, data['label'])

# Five best parameter combinations, ranked by mean_test_score.
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head(5)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,17.330249,0.392,0.266503,0.009143,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.977558,0.976661,0.977538,0.967655,0.973944,0.974671,0.003749,1
3,1.521066,0.226418,0.192206,0.065126,60.0,10,"{'max_depth': 60, 'n_estimators': 10}",0.977558,0.974865,0.974843,0.971249,0.973944,0.974492,0.002024,2
8,35.265826,0.559558,0.41384,0.066691,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.977558,0.976661,0.975741,0.965858,0.974843,0.974132,0.004235,3
6,1.647044,0.228303,0.158706,0.001565,90.0,10,"{'max_depth': 90, 'n_estimators': 10}",0.981149,0.975763,0.974843,0.973944,0.96496,0.974132,0.005226,4
11,34.960615,4.128613,0.468007,0.156556,,300,"{'max_depth': None, 'n_estimators': 300}",0.975763,0.975763,0.973944,0.966757,0.974843,0.973414,0.003396,5


In [None]:
# Show the dtype of every column in the count-vectorized feature frame.
print(X_count_feat.dtypes)

body_len      int64
punct%      float64
0             int64
1             int64
2             int64
             ...   
8183          int64
8184          int64
8185          int64
8186          int64
8187          int64
Length: 8190, dtype: object


In [None]:
# Disabled experiment, kept only as an inert string literal: it would cast the
# frame's values to str, not its column names.
'''
if not all(X_count_feat.dtypes == str):
  X_count_feat = X_count_feat.astype(str)
'''

In [None]:
# The dtypes listing showed integer-named term columns next to the two
# string-named ones, so convert every feature name to a string.
X_count_feat.columns = X_count_feat.columns.astype(str)

rf = RandomForestClassifier()
param = {
    'n_estimators': [10, 150, 300],
    'max_depth': [30, 60, 90, None],
}
gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_count_feat, data['label'])

# Five best parameter combinations, ranked by mean_test_score.
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,19.095801,0.473892,0.292366,0.004307,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.977558,0.97307,0.974843,0.967655,0.975741,0.973773,0.003384,1
10,20.696857,0.794341,0.358448,0.076872,,150,"{'max_depth': None, 'n_estimators': 150}",0.975763,0.976661,0.974843,0.965858,0.972147,0.973054,0.003903,2
8,36.747871,0.781645,0.567322,0.190615,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.975763,0.975763,0.973944,0.96496,0.973944,0.972875,0.00404,3
3,1.968018,0.301557,0.197402,0.050243,60.0,10,"{'max_depth': 60, 'n_estimators': 10}",0.974865,0.978456,0.968553,0.972147,0.97035,0.972875,0.003484,4
11,37.573212,4.502389,0.529506,0.178334,,300,"{'max_depth': None, 'n_estimators': 300}",0.975763,0.974865,0.972147,0.96496,0.972147,0.971977,0.003794,5


**Gradient Boosting Grid Search**

**Explore GradientBoostingClassifier Attributes & Hyperparameters**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# First line: every attribute name on the class. Second line: the repr of a
# default-constructed estimator.
print(dir(GradientBoostingClassifier), GradientBoostingClassifier(), sep='\n')


['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_feature_names', '_check_initialized', '_check_n_features', '_check_params', '_clear_state', '_compute_partial_dependence_recursion', '_estimator_type', '_fit_stage', '_fit_stages', '_get_param_names', '_get_tags', '_init_state', '_is_initialized', '_make_estimator', '_more_tags', '_parameter_constraints', '_raw_predict', '_raw_predict_init', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_resize_state', '_staged_raw_predict', '_validate_data', '_validate_estimator', '_validate_params

In [None]:
# There is no n_jobs parameter because each iteration builds on the prior iteration. The learning rate determines how quickly an algorithm
# optimizes, but it also has performance implications: it could cause the model to optimize too quickly, without truly finding the best model.



**Build Our Own Grid-search**

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

# 80/20 split for the manual gradient-boosting grid search. A fixed random_state
# keeps the split identical between runs so the combinations stay comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, data['label'], test_size=0.2, random_state=42)

In [None]:
def train_GB(est, max_depth, lr):
  """Fit a gradient-boosting model on the notebook's current train split and print test metrics.

  est       -- number of boosting stages, passed as n_estimators.
  max_depth -- depth limit of each individual tree.
  lr        -- learning rate.

  Reads X_train, y_train, X_test and y_test from the notebook namespace. Prints
  precision and recall for the 'spam' label plus overall accuracy, each rounded
  to 3 decimals. Returns nothing.
  """
  gb = GradientBoostingClassifier(n_estimators=est, max_depth=max_depth, learning_rate=lr)
  gb_model = gb.fit(X_train, y_train)
  y_pred = gb_model.predict(X_test)
  # Some settings (e.g. a very small learning rate) predict no spam at all, which
  # leaves precision undefined. zero_division=0 reports 0.0 for that case, as
  # before, but without emitting an UndefinedMetricWarning into the results.
  precision, recall, fscore, support = score(
      y_test, y_pred, pos_label='spam', average='binary', zero_division=0)
  accuracy = (y_pred == y_test).sum() / len(y_pred)
  print('Est: {} / Depth: {} / LR: {} ---- Precision: {} /Recall: {} / Accuracy: {}'. format(
      est, max_depth, lr, round(precision, 3), round(recall, 3), round(accuracy, 3)))


In [None]:
# Try every (stages, depth, learning rate) triple; train_GB prints one line per triple.
stage_counts = [50, 100, 150]
depth_limits = [3, 7, 11, 15]
learning_rates = [0.01, 0.1, 1]
for n_est in stage_counts:
  for max_depth in depth_limits:
    for lr in learning_rates:
      train_GB(n_est, max_depth, lr)

  _warn_prf(average, modifier, msg_start, len(result))


Est: 50 / Depth: 3 / LR: 0.01 ---- Precision: 0.0 /Recall: 0.0 / Accuracy: 0.871
Est: 50 / Depth: 3 / LR: 0.1 ---- Precision: 0.939 /Recall: 0.75 / Accuracy: 0.961
Est: 50 / Depth: 3 / LR: 1 ---- Precision: 0.931 /Recall: 0.84 / Accuracy: 0.971
Est: 50 / Depth: 7 / LR: 0.01 ---- Precision: 1.0 /Recall: 0.007 / Accuracy: 0.872
Est: 50 / Depth: 7 / LR: 0.1 ---- Precision: 0.94 /Recall: 0.868 / Accuracy: 0.976
Est: 50 / Depth: 7 / LR: 1 ---- Precision: 0.912 /Recall: 0.861 / Accuracy: 0.971
Est: 50 / Depth: 11 / LR: 0.01 ---- Precision: 1.0 /Recall: 0.007 / Accuracy: 0.872
Est: 50 / Depth: 11 / LR: 0.1 ---- Precision: 0.933 /Recall: 0.868 / Accuracy: 0.975
Est: 50 / Depth: 11 / LR: 1 ---- Precision: 0.919 /Recall: 0.861 / Accuracy: 0.972


  _warn_prf(average, modifier, msg_start, len(result))


Est: 50 / Depth: 15 / LR: 0.01 ---- Precision: 0.0 /Recall: 0.0 / Accuracy: 0.871
Est: 50 / Depth: 15 / LR: 0.1 ---- Precision: 0.874 /Recall: 0.868 / Accuracy: 0.967
Est: 50 / Depth: 15 / LR: 1 ---- Precision: 0.914 /Recall: 0.882 / Accuracy: 0.974
Est: 100 / Depth: 3 / LR: 0.01 ---- Precision: 0.938 /Recall: 0.521 / Accuracy: 0.934
Est: 100 / Depth: 3 / LR: 0.1 ---- Precision: 0.968 /Recall: 0.833 / Accuracy: 0.975
Est: 100 / Depth: 3 / LR: 1 ---- Precision: 0.901 /Recall: 0.819 / Accuracy: 0.965
Est: 100 / Depth: 7 / LR: 0.01 ---- Precision: 0.972 /Recall: 0.736 / Accuracy: 0.963
Est: 100 / Depth: 7 / LR: 0.1 ---- Precision: 0.947 /Recall: 0.875 / Accuracy: 0.978
Est: 100 / Depth: 7 / LR: 1 ---- Precision: 0.939 /Recall: 0.854 / Accuracy: 0.974
Est: 100 / Depth: 11 / LR: 0.01 ---- Precision: 0.951 /Recall: 0.812 / Accuracy: 0.97
Est: 100 / Depth: 11 / LR: 0.1 ---- Precision: 0.933 /Recall: 0.868 / Accuracy: 0.975
Est: 100 / Depth: 11 / LR: 1 ---- Precision: 0.941 /Recall: 0.882 / Ac

In [None]:
# Because gradient boosting cannot be parallelized, it would take an hour or two to finish all the parameter combinations in the grid above.

**Evaluate Gradient Boosting with GridSearchCV**

**Grid search**: Exhaustively search all parameter combinations in a given grid to determine the best model.

**Cross-validation**: Divide a dataset into k subsets and repeat the holdout method k times where a different subset is used in the holdout set in each iteration.

**Exploring parameter settings using GridSearchCV**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
gb = GradientBoostingClassifier()
# 2 x 3 x 1 = 6 parameter combinations, each cross-validated on 5 folds.
param = {'n_estimators': [100, 150], 'max_depth': [7, 11, 15], 'learning_rate': [0.1]}

gs = GridSearchCV(gb, param, cv=5, n_jobs=1)
cv_fit = gs.fit(X_features, data['label'])

# Five best parameter combinations, ranked by mean_test_score.
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False).head(5)




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,127.936339,1.116285,0.196386,0.041194,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.965889,0.979354,0.969452,0.972147,0.965858,0.97054,0.005,1
3,192.780389,1.892565,0.213277,0.03992,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.964093,0.980251,0.97035,0.971249,0.965858,0.97036,0.005624,2
4,158.272504,1.450394,0.205853,0.028655,0.1,15,100,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.967684,0.976661,0.968553,0.97035,0.968553,0.97036,0.003268,2
5,250.992996,2.798971,0.195352,0.009268,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.967684,0.976661,0.971249,0.97035,0.96496,0.970181,0.003915,4
0,82.743301,0.635808,0.158593,0.032344,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.965889,0.978456,0.968553,0.969452,0.965858,0.969642,0.004633,5


**Run on CountVectorizer and see how it performs**

In [None]:
gb = GradientBoostingClassifier()
# Same grid as the TF-IDF search, so the two vectorizers are compared like for like.
param = {
    'n_estimators': [100, 150],
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1]
}

gs_2 = GridSearchCV(gb, param, cv=5, n_jobs=1)
cv_fit_2 = gs_2.fit(X_count_feat, data['label'])
# Five best parameter combinations, ranked by mean_test_score. The keyword was
# misspelled `asending`, which makes sort_values raise TypeError.
pd.DataFrame(cv_fit_2.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]



Vectorizers are like models. They need to be fit on a training set, and in the context of a vectorizer, that means it stores all of the words in the training set. Any word that appears in the test set but not in the training set will not appear in the vectorized version of the test set. The vectorizer will only recognize the words that are in the training set.

In [None]:
# Split into train and test
from sklearn.model_selection import train_test_split
# Split the raw text together with the two hand-built features, so that the
# vectorizer can later be fit on the training rows only. The column selection
# needs a list inside the indexer (double brackets); the original line had
# unbalanced brackets and did not parse. random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data[['body_text', 'body_len', 'punct%']], data['label'], test_size=0.2, random_state=42)

**Vectorize text**

In [None]:
# Fit the vectorizer on the training messages only, so its vocabulary contains
# no words that occur solely in the test set.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])

tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

# reset_index(drop=True) so the hand-built features line up row for row with the
# freshly built TF-IDF frames, which are indexed 0..n-1.
X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True),
                          pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True),
                         pd.DataFrame(tfidf_test.toarray())], axis=1)

# The term columns are named with integers and the other two with strings;
# recent scikit-learn versions refuse frames with mixed-type feature names, so
# make every name a string.
X_train_vect.columns = X_train_vect.columns.astype(str)
X_test_vect.columns = X_test_vect.columns.astype(str)

X_train_vect.head()


**Final evaluation of models**

In [None]:
# `import` was misspelled `imprt`, a SyntaxError that stopped the whole cell and
# left `time` unimported for the cells below.
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

SyntaxError: invalid syntax (<ipython-input-22-326a5dca6b89>, line 1)

In [None]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

# Wall-clock time for the random forest to fit the vectorized training set.
fit_started = time.time()
rf_model = rf.fit(X_train_vect, y_train)
fit_time = time.time() - fit_started

# Wall-clock time for the random forest to predict the held-out set.
pred_started = time.time()
y_pred = rf_model.predict(X_test_vect)
pred_time = time.time() - pred_started

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
accuracy = (y_pred == y_test).sum() / len(y_pred)
print(' Fit time : {} / Predict time: {} ----Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3),
    round(precision, 3), round(recall, 3), round(accuracy, 3)))

In [None]:
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

# Wall-clock time for gradient boosting to fit the vectorized training set.
fit_started = time.time()
gb_model = gb.fit(X_train_vect, y_train)
fit_time = time.time() - fit_started

# Wall-clock time for gradient boosting to predict the held-out set.
pred_started = time.time()
y_pred = gb_model.predict(X_test_vect)
pred_time = time.time() - pred_started

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
accuracy = (y_pred == y_test).sum() / len(y_pred)
print(' Fit time : {} / Predict time: {} ----Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3),
    round(precision, 3), round(recall, 3), round(accuracy, 3)))