# Building Machine Learning Classifiers: Building a basic Random Forest model

### Read in & clean text

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

X_features = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
X_features.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8094,8095,8096,8097,8098,8099,8100,8101,8102,8103
0,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,4.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Explore RandomForestClassifier Attributes & Hyperparameters

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
print(dir(RandomForestClassifier))
print(RandomForestClassifier())

['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_feature_names', '_check_n_features', '_compute_oob_predictions', '_estimator_type', '_get_oob_predictions', '_get_param_names', '_get_tags', '_make_estimator', '_more_tags', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_set_oob_score_and_attributes', '_validate_X_predict', '_validate_data', '_validate_estimator', '_validate_y_class_weight', 'apply', 'decision_path', 'feature_importances_', 'fit', 'get_params', 'n_features_', 'predict', 'predict_log_proba', 'predict_proba', 'score',

### Explore RandomForestClassifier through Cross-Validation

In [4]:
from sklearn.model_selection import KFold, cross_val_score

In [5]:
rf = RandomForestClassifier(n_jobs=-1)
k_fold = KFold(n_splits=5)
cross_val_score(rf, X_features, data['label'], cv=k_fold, scoring='accuracy', n_jobs=-1)

array([0.97486535, 0.97755835, 0.97663971, 0.96495957, 0.97124888])

# Importing metrics

In [8]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test= train_test_split(X_features,data['label'],test_size=0.2)

In [22]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)

rf_model = rf.fit(X_train, y_train)



In [23]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)[0:10]

[(0.052425218900979884, 'body_len'),
 (0.046834712612190244, 7350),
 (0.033463672325162806, 3134),
 (0.025876933071620364, 5724),
 (0.021316861989015976, 1803),
 (0.02126603581464077, 392),
 (0.02101717199421741, 2031),
 (0.02061961313728777, 7027),
 (0.020234333956927378, 4796),
 (0.019660838152883258, 7461)]

In [27]:
y_pred =rf_model.predict(X_test)
precision, recall, fscore, support= score(y_test,y_pred, pos_label='spam',average='binary')



In [29]:
print('precision=', precision, 'recall=',recall, 'fscore=',fscore, 'support=',support)
print(round((y_test==y_pred).sum()/len(y_pred),3))

precision= 1.0 recall= 0.5620437956204379 fscore= 0.719626168224299 support= None
0.946


# Hyper-Parameter Tuning Using [GRID SEARCH]

In [30]:
def train_rf(n_est,depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred =rf_model.predict(X_test)
    precision, recall, fscore, support= score(y_test,y_pred, pos_label='spam',average='binary')
    print('N_est=',n_est,'Depth',depth,'------>','precision=', precision, 'recall=',recall, 'fscore=',fscore, 'support=',support)
    print('N_est=',n_est,'Depth',depth,'------>',round((y_test==y_pred).sum()/len(y_pred),3))
    

In [31]:
for n_est in [10, 20, 30]:
    for depth in [20, 50, None]:
        train_rf(n_est,depth)



N_est= 10 Depth 20 ------> precision= 0.9875 recall= 0.5766423357664233 fscore= 0.7281105990783409 support= None
N_est= 10 Depth 20 ------> 0.947




N_est= 10 Depth 50 ------> precision= 0.9747899159663865 recall= 0.8467153284671532 fscore= 0.90625 support= None
N_est= 10 Depth 50 ------> 0.978




N_est= 10 Depth None ------> precision= 0.990990990990991 recall= 0.8029197080291971 fscore= 0.8870967741935484 support= None
N_est= 10 Depth None ------> 0.975




N_est= 20 Depth 20 ------> precision= 1.0 recall= 0.5766423357664233 fscore= 0.7314814814814815 support= None
N_est= 20 Depth 20 ------> 0.948




N_est= 20 Depth 50 ------> precision= 0.9914529914529915 recall= 0.8467153284671532 fscore= 0.9133858267716537 support= None
N_est= 20 Depth 50 ------> 0.98




N_est= 20 Depth None ------> precision= 1.0 recall= 0.8248175182481752 fscore= 0.904 support= None
N_est= 20 Depth None ------> 0.978




N_est= 30 Depth 20 ------> precision= 1.0 recall= 0.583941605839416 fscore= 0.7373271889400921 support= None
N_est= 30 Depth 20 ------> 0.949




N_est= 30 Depth 50 ------> precision= 1.0 recall= 0.8540145985401459 fscore= 0.9212598425196851 support= None
N_est= 30 Depth 50 ------> 0.982
N_est= 30 Depth None ------> precision= 0.9915254237288136 recall= 0.8540145985401459 fscore= 0.9176470588235295 support= None
N_est= 30 Depth None ------> 0.981


