In [None]:
pip install pandas joblib scikit-learn

In [9]:
import os
import csv
import json
import re
import chardet
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [11]:
# Get the base directory
base_dir = os.getcwd()

# Define input and output file paths
input_file_path = os.path.join(base_dir, "Input_Files", "spam.csv")
output_file_path = os.path.join(base_dir, "Input_Json_Files", "spam.json")

# Detect encoding
with open(input_file_path, 'rb') as raw_file:
    result = chardet.detect(raw_file.read())
    encoding_type = result['encoding']

# Function to clean text
def clean_text(text):
    text = text.strip()  # Remove leading/trailing spaces
    text = re.sub(r'\s+', ' ', text)  # Normalize spaces
    text = text.replace(',,', '')  # Remove unnecessary commas
    return text

# List to store cleaned data
cleaned_data = []

try:
    # Read and clean the raw dataset with detected encoding
    with open(input_file_path, 'r', encoding=encoding_type, errors='replace') as file:
        reader = csv.reader(file)
        next(reader)  # Skip the first row (header)

        for row in reader:
            if len(row) >= 2:  # Ensure row has at least label and message
                label = row[0].strip().lower()  # Convert label to lowercase
                message = clean_text(row[1])
                cleaned_data.append({"label": label, "message": message})

    # Create the necessary directories if they don't exist
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    # Save cleaned data as JSON
    with open(output_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(cleaned_data, json_file, indent=4, ensure_ascii=False)

    print(f"Conversion completed! JSON file '{output_file_path}' is ready.")
except FileNotFoundError:
    print(f"Error: The file '{input_file_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")


Conversion completed! JSON file 'd:\Project\AI_and_ML\Supervised_Learning\Spam_Detection\Input_Json_Files\spam.json' is ready.


In [None]:
base_dir = os.getcwd()
file_path = os.path.join(base_dir,"Input_Json_Files", "spam.json")
print(file_path)


d:\Project\AI_and_ML\Supervised_Learning\Spam_Detection\Input_Json_Files\spam.json


In [5]:
df_train = pd.read_json(file_path)

In [6]:
df_train.head(5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
def load_data(filepath):
    """
    Load data from a JSON file into a Pandas DataFrame.
    """
    df = pd.read_json(filepath)
    return df

def preprocess_data(df):
    """
    Preprocess the data by dropping rows with missing DESCRIPTION or GENRE.
    """
    df = df.dropna(subset=['label', 'message'])
    return df

def train_model(train_file):
    """
    Train a text classification model and save the pipeline.
    """
    # Load and preprocess data
    df = load_data(train_file)
    df = preprocess_data(df)

    # Split data into features (message) and target (label)
    X_train, X_test, y_train, y_test = train_test_split(df['label'], df['message'], test_size=0.2, random_state=42)

    # Create a pipeline with TF-IDF Vectorizer and Logistic Regression
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
        ('classifier', LogisticRegression(max_iter=1000, C=1.0))
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Evaluate the model
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Model Accuracy:", accuracy)

    # Ensure Model_Trained directory exists
    model_dir = os.path.join(os.getcwd(), "Model_Trained")
    os.makedirs(model_dir, exist_ok=True)

    # Save the entire pipeline (both vectorizer and model) in Model_Trained folder
    model_output_path = os.path.join(model_dir, "spam_detection.pkl")
    joblib.dump(pipeline, model_output_path)
    print(f"Pipeline saved to '{model_output_path}'.")

if __name__ == "__main__":
    # Path to the training data
    train_file = os.path.join(os.getcwd(),"Input_Json_Files", "spam.json")
    train_model(train_file)


Model Accuracy: 0.007174887892376682
Pipeline saved to 'd:\Project\AI_and_ML\Supervised_Learning\Spam_Detection\Model_Trained\spam_detection.pkl'.


In [None]:
def load_model(model_path):
    """
    Load the trained pipeline from the saved file.
    """
    try:
        pipeline = joblib.load(model_path)
        print("Model loaded successfully.")
        return pipeline
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

def predict_spam(model, message):
    """
    Predict the genre of a movie based on its DESCRIPTION.
    """
    try:
        # Predict genre using the model
        predicted_genre = model.predict([message])[0]

        # Return the predicted genre
        return predicted_genre
    except Exception as e:
        print(f"Error predicting genre: {e}")
        return None

def get_message():
    """
    Prompt the user to input a movie description.
    """
    print("Please enter the description of the movie:")
    description = input()  # Takes user input as description
    return description

def predict_spam_from_input(model_path):
    """
    Load the model and predict the genre for an individual movie description.
    """
    # Load the trained model
    model = load_model(model_path)
    if not model:
        return

    # Get movie description from user input
    message = get_message()

    # Predict genre
    predicted_genre = predict_spam(model, message)

    # Display the result
    if predicted_genre:
        print(f"The message is: {predicted_genre} message")
    else:
        print("Unable to predict genre.")

if __name__ == "__main__":
    # Path to the trained model
    model_path = os.path.join(os.getcwd(), "Model_Trained", "spam_detection.pkl")
    
    predict_spam_from_input(model_path)


Model loaded successfully.
Please enter the description of the movie:
The predicted genre for the movie is: Sorry, I'll call later
