# Dataset

In [71]:
#To ignore warnings.
import warnings
warnings.filterwarnings("ignore")

In [49]:
#Load the dataset.
import pandas as pd
df=pd.read_csv("Dataset/spam.csv", encoding='latin-1')
#Remove unwanted columns.
df = df.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
#Name the columns.
df.columns = ["label", "text"]
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Data preprocessing

In [4]:
import nltk,string

In [50]:
#Getting the english stopwords.
stopwords = nltk.corpus.stopwords.words('english')

In [51]:
#Function to clean the text.
def clean_text(text):
    #remove punctuation
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    #separate into tokens
    tokens = text.split()
    #remove stopwords
    text = [word for word in tokens if word not in stopwords]
    text=" ".join(text)
    return text


In [52]:
#Apply the clean_text function 
df["cleaned_text"]=df["text"].apply(clean_text)

In [53]:
df.head()

Unnamed: 0,label,text,cleaned_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


# TF-IDF

In [54]:
from sklearn.model_selection import train_test_split

In [55]:
#Split the data (train, test).
X_train, X_test, y_train, y_test = train_test_split(df,df['label'],test_size=0.2)

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [57]:
#Tf-idf(Term frequency- inverse document frequency) Vectorizer
tfidf_vectorize = TfidfVectorizer()

In [58]:
#Build tfidf vectors with our dataset.
tfidf_vectorize.fit(X_train["cleaned_text"])
train_vec = tfidf_vectorize.transform(X_train["cleaned_text"])
test_vec = tfidf_vectorize.transform(X_test["cleaned_text"])

In [59]:
#To check the vocabulary.
#tfidf_vectorize.vocabulary_

# Lets build the model !

In [64]:
from sklearn.ensemble import RandomForestClassifier

In [65]:
#Build the Random Forest Classifier.
rf=RandomForestClassifier()
#Fit the model with training data.
rf_model=rf.fit(train_vec,y_train)

In [66]:
#Predict the output for testing data.
y_pred=rf_model.predict(test_vec)

# Evaluation

In [67]:
#Calculate precision score and recall score.
from sklearn.metrics import precision_score, recall_score,accuracy_score
#Precision - ability of the classifier to find all the positive samples.
precision = precision_score(y_test, y_pred, pos_label='spam')
#Recall - ability of the classifier to find true positive samples.
recall = recall_score(y_test, y_pred, pos_label='spam')
print("Precision score: ",precision)
print("Recall score: ",recall)


Precision score:  1.0
Recall score:  0.7605633802816901


In [68]:
import numpy as np
y_pred=np.where(y_pred == "spam", 1, 0)

In [69]:
y_test=np.where(np.array(y_test)== "spam", 1, 0)

In [70]:
#Test datapoints accuracy.
accuracy=accuracy_score(y_test, y_pred)
print("Test Accuracy: ",accuracy)


Test Accuracy:  0.9695067264573991
