In [8]:
import pandas as pd

# define function
def load_save_data(load_file_path, save_file_path):     
    # read csv    
    df = pd.read_csv(load_file_path, encoding="latin-1")    
    # select target columns
    df = df[['message','label']]    
    # dropping ham rows    
    idxs = df[df.label=='ham'].index[:1825]
    df.drop(idxs, inplace=True)    
    # shuffle the DataFrame rows
    df = df.sample(frac = 1)
    df.reset_index(inplace=True, drop=True)    
    # save csv
    df.to_csv(save_file_path, index=False)
    print(f"File saved as: {save_file_path}")


In [9]:
# load the csv file & save it as data.csv
load_save_data(load_file_path="spam.csv", save_file_path="data.csv")

File saved as: data.csv


In [10]:
# load saved csv & look at class balance
df = pd.read_csv("spam.csv",encoding="latin-1")
df.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [11]:
df.head()

Unnamed: 0,label,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [13]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [14]:
# import deps
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# initialize objects
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# A bsic text processing function with variations in preprocessing like stemming / lemmatization
def preprocess_text(text):
    words = word_tokenize(text.lower())
    filtered_words = [word for word in words if word.isalpha()]
    # filtered_words = [word for word in words if word.isalpha() and word not in stop_words]
    # filtered_words = [stemmer.stem(word) for word in words if word.isalpha() and word not in stop_words]
    # filtered_words = [lemmatizer.lemmatize(word) for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(filtered_words)

# basic trainin model func with variations for vectorizing
def train_model(x_train, y_train, n, c, d):    
    # Create a Vectorizer to convert text data to numerical features
    # vectorizer = CountVectorizer()
    vectorizer = TfidfVectorizer()
    x_train_vectorized = vectorizer.fit_transform(x_train)
    # Initialize the Random Forest classifier
    rf_classifier = RandomForestClassifier(n_estimators=n, criterion=c, max_depth=d)
    # Train the classifier on the training data
    rf_classifier.fit(x_train_vectorized, y_train)
    pred = rf_classifier.predict(x_train_vectorized)
    acc = accuracy_score(pred, y_train)
    return vectorizer, rf_classifier, acc

# evaluation function
def eval_met(actual, pred):
    acc = accuracy_score(actual, pred)
    prc = precision_score(actual, pred, pos_label='spam')
    rec = recall_score(actual, pred, pos_label='spam')
    f1 = f1_score(actual, pred, pos_label='spam')
    return acc, prc, rec, f1

In [16]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [17]:
# Apply text preprocessing on the message column
df['processed_message'] = df.message.apply(preprocess_text)

# Split the data into features (x) and labels (y)
x = df['processed_message']
y = df['label']

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Train the model
vectorizer, model , train_acc = train_model(x_train, y_train, n=10, c='entropy', d=2)

# Evaluate the model
y_pred = model.predict(vectorizer.transform(x_test))
acc, prc, rec, f1 = eval_met(y_pred, y_test)

# Print the Results
print(f"Training Accuracy: {train_acc*100:.3f} %")
print(f"Validation Accuracy: {acc*100:.3f} %")
print(f"Precision Score: {prc*100:.3f} %")
print(f"Recall Score: {rec*100:.3f} %")
print(f"F1 Score: {f1*100:.3f} %")

Training Accuracy: 86.605 %
Validation Accuracy: 86.547 %
Precision Score: 0.000 %
Recall Score: 0.000 %
F1 Score: 0.000 %


  _warn_prf(average, modifier, msg_start, len(result))
