Collecting Data set

In [2]:
# Step 1: Collect Data Set
import pandas as pd

# Load the dataset
data = pd.read_csv('mail_data.csv')

# Display the first few rows of the dataset to understand its structure
print(data.head())
print(data.shape)



  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
(5572, 2)


Pre-Processing

In [25]:
# Step 2: Pre-processing
# Removing missing Values
data.dropna(inplace=True)
data.fillna('', inplace=True)
print(data.shape)

(5572, 2)


In [28]:
# label spam mail as 0 and ham as 1
data.loc[data['Category'] == 'spam', 'Category',] = 0
data.loc[data['Category'] == 'ham', 'Category',] = 1

data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


Feature Extraction

In [57]:
# using TF-IDF 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif

X = data['Message']
y = data['Category'].astype(int) # convert the y array into (int)

# Convert text data into numerical features using TF-IDF
tfidf_vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(X)


Feature Selection

In [30]:
# Step 3: Feature Selection
# Select the best features using SelectKBest
num_features_to_select = 10  
selector = SelectKBest(score_func=f_classif, k=num_features_to_select)
X_new = selector.fit_transform(X_tfidf, y)
selected_features_indices = selector.get_support()

In [31]:
# Display selected features
print("Selected Features:", selected_features_indices)

Selected Features: [False False False ... False False False]


Apply Spam Filter Algorithms

In [32]:
# Step 4: Apply Spam Filter Algorithms
# Split data into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)

Navive Bayes

In [33]:
#First i am trying  Naive Bayes algorithm 
from sklearn.naive_bayes import MultinomialNB

# Initialize the model
model = MultinomialNB()

In [34]:
# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

Accuracy

In [35]:
# Step 7: Accuracy
# Calculate accuracy of the model
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8660287081339713


DecisionTree

In [36]:
# Now i am  trying another algorithm DecisionTree.
from sklearn.tree import DecisionTreeClassifier


In [37]:
# Initialize the model
tree_model = DecisionTreeClassifier()

# Train the model
tree_model.fit(X_train, y_train)

# Predict using the tree model
tree_pred = tree_model.predict(X_test)

Decision Tree Accuracy

In [58]:
accuracy = accuracy_score(y_test, y_pred)
print("Decision Tree Accuracy:", accuracy)

Decision Tree Accuracy: 0.8660287081339713


Naive Bayes Multinomial

In [45]:
# Now consider Naive Bayes Multinomial algorithm
from sklearn.naive_bayes import MultinomialNB

# Initialize the Multinomial Naive Bayes model
nb_model = MultinomialNB()

In [46]:
# Train the model
nb_model.fit(X_train, y_train)


In [59]:
# Predict on the test set
y_pred_nb = nb_model.predict(X_test)

Naive Bayes Multinomial Accuracy

In [60]:
# Calculate accuracy of the model
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes Multinomial Accuracy:", accuracy_nb)

Naive Bayes Multinomial Accuracy: 0.8660287081339713


 J48 (C4.5 Decision Tree)

In [51]:
# Now lets consider J48 (C4.5 Decision Tree) algorithm
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree model
tree_model = DecisionTreeClassifier()

In [52]:
# Train the model
tree_model.fit(X_train, y_train)

In [53]:
# Predict on the test set
y_pred_tree = tree_model.predict(X_test)

 J48 (C4.5 Decision Tree) Accuracy

In [55]:
# Calculate accuracy of the model
accuracy_tree = accuracy_score(y_test, y_pred_tree)
print("J48 (C4.5 Decision Tree) Accuracy:", accuracy_tree)

J48 (C4.5 Decision Tree) Accuracy: 0.9485645933014354


Table to Display the Result of Different Algorithams

In [56]:
from prettytable import PrettyTable

# Create a table
table = PrettyTable()
table.field_names = ["Algorithm", "Accuracy"]

# Add rows to the table
table.add_row(["Decision Tree ", tree_accuracy])
table.add_row(["Naive Bayes", accuracy])
table.add_row(["Naive Bayes Multinomial", accuracy_nb])
table.add_row(["J48 (C4.5 Decision Tree)", accuracy_tree])
# Add rows for other algorithms if needed

# Print the table
print(table)


+--------------------------+--------------------+
|        Algorithm         |      Accuracy      |
+--------------------------+--------------------+
|      Decision Tree       | 0.9473684210526315 |
|       Naive Bayes        | 0.8660287081339713 |
| Naive Bayes Multinomial  | 0.8660287081339713 |
| J48 (C4.5 Decision Tree) | 0.9485645933014354 |
+--------------------------+--------------------+
