In [None]:
import pandas as pd 
# Path of the CSV file holding the raw dataset (relative to the notebook).
file_path = "mail_data.csv"

# Read the CSV into a DataFrame; later cells use its 'Message' and
# 'Category' columns.
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as this cell's output.
data.head(n=5)

In [None]:
# Import the libraries used for text pre-processing
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Build these once rather than on every call. stopwords.words('english') is a
# list, so evaluating it inside the per-token filter meant rebuilding it and
# doing a linear membership scan for every single word of every message.
# A frozenset gives constant-time lookups with identical membership results.
_STOP_WORDS = frozenset(stopwords.words('english'))
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# Pre-processing function
def preprocess_text(text):
    """Normalise one raw message into a space-separated string of lemmas.

    Steps, in order: lowercase, strip ASCII punctuation
    (``string.punctuation``), tokenize with ``word_tokenize``, drop English
    stop words, lemmatize each remaining token with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        The raw message. Missing values (``None`` / NaN, e.g. empty CSV
        cells) are treated as an empty message; any other non-string value
        is converted with ``str()`` first.

    Returns
    -------
    str
        The processed tokens joined by single spaces (``''`` if nothing
        is left).

    Notes
    -----
    Requires the NLTK tokenizer, stop-word and WordNet resources to be
    available locally (``nltk.download``).

    NOTE(review): punctuation is removed *before* stop-word filtering, so
    contractions such as "don't" become "dont" and will not match stop-word
    entries that contain an apostrophe - confirm whether that is intended.
    """
    if not isinstance(text, str):
        # Previously a NaN cell crashed on `.lower()`.
        if pd.isna(text):
            return ''
        text = str(text)

    # Lowercase, then remove punctuation characters
    text = text.lower().translate(_PUNCTUATION_TABLE)

    # Tokenization
    words = word_tokenize(text)

    # Remove stop words, then lemmatize what is left
    words = [lemmatizer.lemmatize(word) for word in words if word not in _STOP_WORDS]

    return ' '.join(words)

# Run every raw message through preprocess_text and keep the cleaned text
# in a new 'Processed_Message' column next to the original.
data['Processed_Message'] = data.loc[:, 'Message'].apply(preprocess_text)

# Show raw vs. processed text side by side for the first five rows.
data.loc[:, ['Message', 'Processed_Message']].head(n=5)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# Step 1: Convert text data to TF-IDF features
# NOTE(review): the vectorizer (and the selector below) are fitted on the
# whole dataset, while the train/test split only happens in a later cell.
# Test messages therefore influence the vocabulary, the IDF weights and the
# chosen features. Consider splitting first and fitting on the training
# portion only.
tfidf_vectorizer = TfidfVectorizer(max_features=3000)  # vocabulary capped at 3000 terms
X = tfidf_vectorizer.fit_transform(data['Processed_Message'])

# Step 2: Univariate feature selection - keep the features with the highest
# chi-square score against the label (this is a top-k ranking, not a
# best-first search).
# (Here we assume labels are binary, i.e., spam and non-spam)
y = data['Category'].map({'ham': 0, 'spam': 1})  # Convert labels to binary

# .map() silently turns any label other than 'ham'/'spam' into NaN, which
# would only surface later as an obscure error inside chi2. Fail here with
# a message that names the offending values instead.
if y.isna().any():
    unexpected_labels = sorted(data.loc[y.isna(), 'Category'].astype(str).unique())
    raise ValueError(
        f"Unexpected values in 'Category' (expected 'ham' or 'spam'): {unexpected_labels}"
    )

# Select the top 1000 features, or all of them if the fitted vocabulary is
# smaller than that (asking for more features than exist is rejected or
# warned about, depending on the scikit-learn version).
selector = SelectKBest(chi2, k=min(1000, X.shape[1]))
X_selected = selector.fit_transform(X, y)

# Display selected feature names (optional)
selected_features = tfidf_vectorizer.get_feature_names_out()[selector.get_support()]
print("Top selected features:\n", selected_features)


In [15]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import time

# Hold out 30% of the samples for testing. X_selected and y come from the
# feature-selection cell; the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified although the classes are
# imbalanced - consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Initialize and train the Multinomial Naive Bayes model.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() is wall-clock time and can be too coarse (or
# jump) for a fit that takes only a few milliseconds.
model_mnb = MultinomialNB()
start_time = time.perf_counter()
model_mnb.fit(X_train, y_train)
end_time = time.perf_counter()

# Make predictions on the held-out set and evaluate the model
y_pred_mnb = model_mnb.predict(X_test)
accuracy_mnb = accuracy_score(y_test, y_pred_mnb)
conf_matrix_mnb = confusion_matrix(y_test, y_pred_mnb)  # rows: true label, columns: predicted
classification_rep_mnb = classification_report(y_test, y_pred_mnb)

# Print results
print("Multinomial Naive Bayes")
print(f"Training Time: {end_time - start_time:.4f} seconds")
print(f"Accuracy: {accuracy_mnb:.4f}")
print("Confusion Matrix:")
print(conf_matrix_mnb)
print("Classification Report:")
print(classification_rep_mnb)
print("\n")


Multinomial Naive Bayes
Training Time: 0.0060 seconds
Accuracy: 0.9791
Confusion Matrix:
[[1448    0]
 [  35  189]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1448
           1       1.00      0.84      0.92       224

    accuracy                           0.98      1672
   macro avg       0.99      0.92      0.95      1672
weighted avg       0.98      0.98      0.98      1672





In [14]:
from sklearn.tree import DecisionTreeClassifier

# Initialize and train the Decision Tree model.
# random_state is fixed (same seed as the train/test split) because the tree
# builder breaks ties between equally good splits at random; without it the
# reported metrics change from run to run.
# NOTE(review): the printed label says "J48", but scikit-learn's
# DecisionTreeClassifier is documented as a CART implementation - confirm
# the label is what the report should say.
model_dt = DecisionTreeClassifier(random_state=42)
# perf_counter() is the monotonic, high-resolution clock intended for
# timing short intervals (time.time() is wall-clock time).
start_time = time.perf_counter()
model_dt.fit(X_train, y_train)
end_time = time.perf_counter()

# Make predictions on the held-out set and evaluate the model
y_pred_dt = model_dt.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
conf_matrix_dt = confusion_matrix(y_test, y_pred_dt)  # rows: true label, columns: predicted
classification_rep_dt = classification_report(y_test, y_pred_dt)

# Print results
print("Decision Tree (J48)")
print(f"Training Time: {end_time - start_time:.4f} seconds")
print(f"Accuracy: {accuracy_dt:.4f}")
print("Confusion Matrix:")
print(conf_matrix_dt)
print("Classification Report:")
print(classification_rep_dt)
print("\n")


Decision Tree (J48)
Training Time: 0.2355 seconds
Accuracy: 0.9647
Confusion Matrix:
[[1431   17]
 [  42  182]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1448
           1       0.91      0.81      0.86       224

    accuracy                           0.96      1672
   macro avg       0.94      0.90      0.92      1672
weighted avg       0.96      0.96      0.96      1672





In [13]:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

# Convert the sparse train/test matrices to dense arrays before handing
# them to GaussianNB (presumably because it does not accept sparse input -
# confirm against the scikit-learn docs).
# The conversion is done outside the timed section so only fit() is measured.
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Initialize and train the Gaussian Naive Bayes model.
# perf_counter() is the monotonic, high-resolution clock intended for
# timing short intervals (time.time() is wall-clock time).
model_gnb = GaussianNB()
start_time = time.perf_counter()
model_gnb.fit(X_train_dense, y_train)
end_time = time.perf_counter()

# Make predictions on the held-out set and evaluate the model
y_pred_gnb = model_gnb.predict(X_test_dense)
accuracy_gnb = accuracy_score(y_test, y_pred_gnb)
conf_matrix_gnb = confusion_matrix(y_test, y_pred_gnb)  # rows: true label, columns: predicted
classification_rep_gnb = classification_report(y_test, y_pred_gnb)

# Print results
print("Gaussian Naive Bayes")
print(f"Training Time: {end_time - start_time:.4f} seconds")
print(f"Accuracy: {accuracy_gnb:.4f}")
print("Confusion Matrix:")
print(conf_matrix_gnb)
print("Classification Report:")
print(classification_rep_gnb)
print("\n")


Gaussian Naive Bayes
Training Time: 0.0992 seconds
Accuracy: 0.9719
Confusion Matrix:
[[1419   29]
 [  18  206]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1448
           1       0.88      0.92      0.90       224

    accuracy                           0.97      1672
   macro avg       0.93      0.95      0.94      1672
weighted avg       0.97      0.97      0.97      1672



