In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# Fetch the NLTK 'stopwords' corpus that preprocess_text reads through
# stopwords.words('english'). The call logs its progress to the cell output.
nltk.download('stopwords')

def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(file_path)

def perform_eda(df):
    """Print summary statistics for ``df`` and save three exploratory plots.

    Args:
        df: DataFrame with a 'label' column and a 'text' column of strings.

    Side effects:
        * Prints ``df.info()`` and ``df.describe()`` to stdout.
        * Adds 'message_length' (character count of 'text') and 'word_count'
          (whitespace-separated token count of 'text') columns to ``df``
          in place.
        * Writes 'class_distribution.png', 'message_length_distribution.png'
          and 'word_count_distribution.png' to the current working directory.

    Returns:
        None.
    """
    # Display basic information about the dataset.
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() only appended a stray "None" line to the output.
    df.info()
    print(df.describe())

    # Check class distribution
    plt.figure(figsize=(8, 6))
    sns.countplot(x='label', data=df)
    plt.title('Distribution of Spam and Ham messages')
    plt.savefig('class_distribution.png')
    plt.close()  # release the figure once it has been written to disk

    # Message length distribution
    df['message_length'] = df['text'].apply(len)
    plt.figure(figsize=(10, 6))
    sns.histplot(data=df, x='message_length', hue='label', kde=True)
    plt.title('Distribution of Message Lengths')
    plt.savefig('message_length_distribution.png')
    plt.close()

    # Word count distribution
    df['word_count'] = df['text'].apply(lambda x: len(x.split()))
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='label', y='word_count', data=df)
    plt.title('Word Count Distribution by Label')
    plt.savefig('word_count_distribution.png')
    plt.close()

# Lazily built NLTK helpers shared by every preprocess_text call. They start
# as None so that nothing is loaded until the first message is processed.
_STOP_WORDS = None
_STEMMER = None


def _get_text_tools():
    """Return the cached (stop-word set, stemmer) pair, building it on first use."""
    global _STOP_WORDS, _STEMMER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
        _STEMMER = PorterStemmer()
    return _STOP_WORDS, _STEMMER


def preprocess_text(text):
    """Normalise one message into a space-joined string of stemmed tokens.

    Steps, in order: lowercase the text, drop every character that is not an
    ASCII letter or whitespace, split on whitespace, discard English stop
    words, and stem the remaining tokens.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The cleaned tokens joined by single spaces ('' when nothing is left).
    """
    # This function is applied once per row, so the stop-word list and the
    # stemmer are built a single time and reused instead of per call.
    stop_words, stemmer = _get_text_tools()

    # Convert to lowercase
    text = text.lower()

    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenise, remove stop words, then stem what remains
    return ' '.join(
        stemmer.stem(token)
        for token in text.split()
        if token not in stop_words
    )

def prepare_data(df):
    """Clean the message text and split it into train/test partitions.

    Stores ``preprocess_text`` applied to each 'text' value in a new
    'processed_message' column on ``df`` (in place), then splits that column
    against 'label' with a 20% test share and ``random_state=42``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    df['processed_message'] = df['text'].apply(preprocess_text)

    features = df['processed_message']
    targets = df['label']
    train_text, test_text, train_labels, test_labels = train_test_split(
        features,
        targets,
        test_size=0.2,
        random_state=42,
    )

    return train_text, test_text, train_labels, test_labels

if __name__ == "__main__":
    # Expects 'email_data.csv' to provide at least a 'text' column (the
    # message text) and a 'label' column; perform_eda and prepare_data read both.
    df = load_data('email_data.csv')
    perform_eda(df)
    X_train, X_test, y_train, y_test = prepare_data(df)
    print("Data preprocessing completed.")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB
None
        Unnamed: 0    label_num
count  5171.000000  5171.000000
mean   2585.000000     0.289886
std    1492.883452     0.453753
min       0.000000     0.000000
25%    1292.500000     0.000000
50%    2585.000000     0.000000
75%    3877.500000     1.000000
max    5170.000000     1.000000
Data preprocessing completed.


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import pickle
#from data_preprocessing import load_data, prepare_data

def create_model():
    """Build an unfitted pipeline: CountVectorizer followed by MultinomialNB.

    Returns:
        A ``Pipeline`` whose steps are named 'vectorizer' and 'classifier'.
    """
    steps = [
        ('vectorizer', CountVectorizer()),
        ('classifier', MultinomialNB()),
    ]
    return Pipeline(steps)

def train_model(model, X_train, y_train):
    """Fit ``model`` on the training data and return that same object.

    Args:
        model: Any object exposing ``fit(X, y)``.
        X_train: Training inputs passed to ``fit``.
        y_train: Training labels passed to ``fit``.

    Returns:
        The ``model`` argument itself (not the return value of ``fit``).
    """
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    """Predict on ``X_test`` and print two summaries of the predictions.

    Prints the classification report first, then the confusion matrix, both
    computed from ``y_test`` and the model's predictions. Returns None.
    """
    predictions = model.predict(X_test)

    # Classification report
    report = classification_report(y_test, predictions)
    print(report)

    # Confusion matrix
    matrix = confusion_matrix(y_test, predictions)
    print(matrix)

def save_model(model, file_path):
    """Pickle ``model`` to ``file_path`` and print a confirmation line.

    The file is opened in binary write mode, so any existing file at that
    path is overwritten.
    """
    with open(file_path, 'wb') as out_file:
        pickle.dump(model, out_file)
    print(f"Model saved to {file_path}")

if __name__ == "__main__":
    # Load and prepare data.
    # load_data and prepare_data are not imported in this cell (the
    # data_preprocessing import above is commented out), so they must already
    # be defined in the running kernel before this cell executes.
    df = load_data('email_data.csv')
    X_train, X_test, y_train, y_test = prepare_data(df)

    # Create and train model
    model = create_model()
    # train_model returns the object it was given, so trained_model and model
    # refer to the same fitted pipeline.
    trained_model = train_model(model, X_train, y_train)

    # Evaluate model (prints the classification report and confusion matrix)
    evaluate_model(trained_model, X_test, y_test)

    # Save model (pickled to 'spam_ham_model.pkl' in the working directory)
    save_model(trained_model, 'spam_ham_model.pkl')

              precision    recall  f1-score   support

         ham       0.98      0.98      0.98       742
        spam       0.95      0.95      0.95       293

    accuracy                           0.97      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.97      0.97      0.97      1035

[[728  14]
 [ 14 279]]
Model saved to spam_ham_model.pkl
