In [None]:
pip install pandas joblib scikit-learn

In [4]:
import os
import csv
import json
import re
import chardet
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB

In [11]:
# Resolve every path relative to the directory the notebook is run from
base_dir = os.getcwd()

# Raw CSV input and the JSON file the cleaned records will be written to
input_file_path = os.path.join(base_dir, "Input_Files", "spam.csv")
output_file_path = os.path.join(base_dir, "Input_Json_Files", "spam.json")

# Detect the CSV's text encoding from its raw bytes
with open(input_file_path, 'rb') as raw_file:
    result = chardet.detect(raw_file.read())

# chardet reports an encoding of None when it cannot decide (e.g. an empty file).
# Passing None to open() would silently select the platform's locale encoding,
# so fall back to UTF-8 explicitly to keep the behaviour the same on every machine.
encoding_type = result['encoding'] or 'utf-8'

def clean_text(text):
    """
    Tidy a raw message string.

    Leading/trailing whitespace is stripped, every run of whitespace is
    collapsed to a single space, and each occurrence of two consecutive
    commas is removed. Returns the cleaned string.
    """
    collapsed = re.sub(r'\s+', ' ', text.strip())
    return collapsed.replace(',,', '')

# Accumulates one {"label", "message"} record per usable CSV row
cleaned_data = []

try:
    # Read and clean the raw dataset with the detected encoding.
    # newline='' hands line-ending handling to the csv module, which is required
    # for quoted fields that contain embedded newlines to be parsed correctly.
    with open(input_file_path, 'r', encoding=encoding_type, errors='replace', newline='') as file:
        reader = csv.reader(file)
        # Skip the header row; the default keeps an empty file from raising StopIteration
        next(reader, None)

        for row in reader:
            if len(row) >= 2:  # Ensure row has at least label and message
                label = row[0].strip().lower()  # Convert label to lowercase
                message = clean_text(row[1])
                cleaned_data.append({"label": label, "message": message})

    # Create the output directory if it doesn't exist
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    # Save cleaned data as JSON (ensure_ascii=False keeps non-ASCII text readable)
    with open(output_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(cleaned_data, json_file, indent=4, ensure_ascii=False)

    print(f"Conversion completed! JSON file '{output_file_path}' is ready.")
except FileNotFoundError:
    print(f"Error: The file '{input_file_path}' was not found.")
except Exception as e:
    # Report the failure without aborting the rest of the notebook run
    print(f"An error occurred: {e}")


Conversion completed! JSON file 'd:\Project\AI_and_ML\Supervised_Learning\Spam_Detection\Input_Json_Files\spam.json' is ready.


In [None]:
# Build the path to spam.json inside the Input_Json_Files folder of the
# current working directory, and print it so the location can be checked.
base_dir = os.getcwd()
file_path = os.path.join(base_dir,"Input_Json_Files", "spam.json")
print(file_path)


d:\Project\AI_and_ML\Supervised_Learning\Spam_Detection\Input_Json_Files\spam.json


In [5]:
# Load the JSON file at file_path into a DataFrame
df_train = pd.read_json(file_path)

In [6]:
# Preview the first five rows of the loaded data
df_train.head(5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
def load_data(input_file):
    """
    Read the training records from a UTF-8 encoded JSON file.

    Returns the parsed JSON content on success. If anything goes wrong while
    opening or parsing the file, the error is printed and None is returned.
    """
    try:
        with open(input_file, encoding='utf-8') as handle:
            records = json.load(handle)
    except Exception as err:
        print(f"Error loading training data: {err}")
        return None
    else:
        print("Training data loaded successfully.")
        return records

def train_model(input_file, model_path, test_size=0.2, random_state=42):
    """
    Train, evaluate and save the spam classification model.

    Parameters:
        input_file: path to the JSON training file; it is read with load_data
            and each record must provide "message" and "label" keys.
        model_path: file path the fitted pipeline is written to with joblib.
        test_size: fraction of the data held out for evaluation (default 0.2).
        random_state: seed for the train/test split (default 42).

    Prints the accuracy and classification report for the held-out split.
    Returns None; if the data cannot be loaded (or is empty) nothing is
    trained or saved.
    """
    # Load training data; load_data returns None on failure
    data = load_data(input_file)
    if not data:
        return

    # Extract messages and labels
    messages = [item["message"] for item in data]
    labels = [item["label"] for item in data]

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        messages, labels, test_size=test_size, random_state=random_state
    )

    # TF-IDF features feeding a multinomial Naive Bayes classifier
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', MultinomialNB())
    ])

    # Train the model
    pipeline.fit(X_train, y_train)
    print("Model training completed.")

    # Evaluate the model on the held-out split
    y_pred = pipeline.predict(X_test)
    print("Model Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    # Create the target directory if needed. A bare filename has an empty
    # dirname, and os.makedirs('') raises, so only create when there is one.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # Save the trained model
    joblib.dump(pipeline, model_path)
    print(f"Model saved at '{model_path}'.")

if __name__ == "__main__":
    # Training data is read from Input_Json_Files/spam.json and the fitted model
    # is written to Model_Trained/spam_detection.pkl, both under the current
    # working directory.
    input_file = os.path.join(os.getcwd(), "Input_Json_Files", "spam.json")
    model_path = os.path.join(os.getcwd(), "Model_Trained", "spam_detection.pkl")
    
    # Train, evaluate and save the model
    train_model(input_file, model_path)

Training data loaded successfully.
Model training completed.
Model Accuracy: 0.9623318385650225
Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.72      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115

Model saved at 'd:\Project\AI_and_ML\Supervised_Learning\Spam_Detection\Model_Trained\spam_detection.pkl'.


In [12]:
def evaluate_model(model_path, message=None):
    """
    Classify a single message using the trained model.

    Parameters:
        model_path: path to the model file saved with joblib.
        message: text to classify. When None (the default) the user is
            prompted for a message; passing a value lets the function run
            non-interactively, e.g. during "Restart & Run All".

    Returns the predicted label, or None if loading the model or predicting
    fails (the error is printed).
    """
    try:
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code -- only load model files from a trusted source.
        model = joblib.load(model_path)
        if message is None:
            message = input("Enter a message to classify: ")
        # predict expects a sequence of documents, so wrap the single message
        prediction = model.predict([message])[0]
        print(f"Message: {message}\nPredicted Label: {prediction}")
        return prediction
    except Exception as e:
        print(f"Error loading model or predicting: {e}")
        return None

if __name__ == "__main__":
    # Define both paths in this cell so it does not rely on a variable that
    # only exists if another cell happened to run first.
    input_file = os.path.join(os.getcwd(), "Input_Json_Files", "spam.json")
    model_path = os.path.join(os.getcwd(), "Model_Trained", "spam_detection.pkl")

    # (Re)train so the model file is guaranteed to exist before it is loaded
    train_model(input_file, model_path)

    # Evaluate user input
    evaluate_model(model_path)

Training data loaded successfully.
Model training completed.
Model Accuracy: 0.9623318385650225
Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.72      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115

Model saved at 'd:\Project\AI_and_ML\Supervised_Learning\Spam_Detection\Model_Trained\spam_detection.pkl'.
Message: hello my name is niranjan
Predicted Label: ham
