## First Step:
### Preparing our data as previously seen
We have:
. character count
. punctuation percentage(transformed into a balanced distribution)
. coleman-liau index
. tf-idf values for each token.

In [1]:
import string
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

body_len = data['body_text'].str.replace(' ', '').str.len()
data['body_len'] = body_len

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return count / (len(text) - text.count(' ')) * 100
data['punct%'] = data['body_text'].apply(count_punct)

def coleman_liau_index(text):
    n_sent = len(sent_tokenize(text))
    n_words = len(word_tokenize(text))
    n_letters = len(text) - text.count(' ')
    return 5.88 * (n_letters/n_words) - 29.6 * (n_sent/n_words) - 15.8
data['cl_index'] = data['body_text'].apply(coleman_liau_index)

In [2]:
data

Unnamed: 0,label,body_text,body_len,punct%,cl_index
0,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.687500,2.941622
1,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.081633,1.434667
2,ham,Even my brother is not like to speak with me. ...,62,3.225806,1.164444
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.142857,-5.256000
4,ham,As per your request 'Melle Melle (Oru Minnamin...,135,4.444444,7.896774
...,...,...,...,...,...
5562,spam,This is the 2nd time we have tried 2 contact u...,131,6.106870,2.825143
5563,ham,Will ü b going to esplanade fr home?,29,3.448276,-0.142222
5564,ham,"Pity, * was in mood for that. So...any other s...",48,14.583333,-0.930667
5565,ham,The guy did some bitching but I acted like i'd...,100,1.000000,4.881481


In [9]:
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

stopwords = nltk.corpus.stopwords.words('english')
wl = nltk.WordNetLemmatizer()
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [wl.lemmatize(word) for word in tokens if word not in stopwords]
    return text

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

X = pd.DataFrame(X_tfidf.toarray())
X.columns = tfidf_vect.get_feature_names_out()

In [11]:
X['body_len'] = data['body_len']
X['punct%'] = data['punct%']**(1/5) # our transformation seen on feature_engineering
X['cl_index'] = data['cl_index']
y = data['label']
X

Unnamed: 0,Unnamed: 1,0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,...,zoom,zouk,zyada,é,ü,üll,〨ud,body_len,punct%,cl_index
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,128,1.362035,2.941622
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,49,1.324850,1.434667
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,62,1.263944,1.164444
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,28,1.481748,-5.256000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,135,1.347608,7.896774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,131,1.436031,2.825143
5563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.32906,0.0,0.0,29,1.280915,-0.142222
5564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,48,1.709115,-0.930667
5565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,100,1.000000,4.881481


## Second Step:
### Training and cross-validating

Models to use:
. Random forest
. XGBoost
. Neural Networks(?)

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import inspect

inspect.signature(RandomForestClassifier)

<Signature (n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)>

In [20]:
rf = RandomForestClassifier(n_jobs=-1)
k_fold = KFold(n_splits=5)
cross_val_score(rf, X, y, cv=k_fold, scoring='accuracy', n_jobs=-1)

array([0.97755835, 0.97755835, 0.97394429, 0.96675651, 0.97214735])

## Third Step
### Exploring the model

In [23]:
rf.fit(X, y)
feature_importances = rf.feature_importances_
feature_names = X.columns
pd.DataFrame(
    {
    'Feature': feature_names,
    'Importance': feature_importances
    }
).sort_values(by='Importance', ascending=False)

Unnamed: 0,Feature,Importance
8911,body_len,0.050803
8913,cl_index,0.032452
1899,call,0.032016
2154,claim,0.020344
8100,txt,0.020127
...,...,...
5791,outage,0.000000
5793,outdoors,0.000000
5794,outfit,0.000000
5795,outfor,0.000000


In [29]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split as tts

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)

In [30]:
rf = RandomForestClassifier(n_estimators=50,
                            max_depth=20,
                            n_jobs=-1)
rf_model = rf.fit(X_train, y_train)

In [32]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)[:10]

[(0.05807321618366297, 'body_len'),
 (0.049522480809207584, 'cl_index'),
 (0.04357468031941846, 'txt'),
 (0.02364131295826045, 'text'),
 (0.02215134385074408, 'call'),
 (0.022131508081918813, 'free'),
 (0.021330942481623004, 'prize'),
 (0.020887526581213893, 'mobile'),
 (0.020541441848161524, 'win'),
 (0.020222884267934126, 'claim')]

In [33]:
y_pred = rf_model.predict(X_test)

precision, recall, fscore, support = score(y_test,
                                           y_pred,
                                           pos_label='spam',
                                           average='binary')


In [34]:
precision, recall

(1.0, 0.6089743589743589)

### Some grid searching

In [35]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}

# Create a RandomForestClassifier instance
rf = RandomForestClassifier()

# Create GridSearchCV instance
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)

# Fit the grid search to your data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

405 fits failed out of a total of 810.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
217 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/leo_rsnd/miniconda3/envs/nlp/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/leo_rsnd/miniconda3/envs/nlp/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/leo_rsnd/miniconda3/envs/nlp/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/leo_rsnd/miniconda3/envs/nlp/lib/python3.10/site-packages/sklearn/utils/_

Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Best score: 0.9757483373417793


## Last step:
### Check out "holdout learning/training"
It's the concept that real word data will have tokens not available in our model. So we split the data into train and test BEFORE applying the transformations. Meaning our vocabulary will probably be smaller.