In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Read the two raw SMS corpora from the current working directory.
SPAM_CSV_PATH = 'spam.csv'
DATASET_CSV_PATH = 'Dataset_5971.csv'

# spam.csv is decoded as latin-1; Dataset_5971.csv is read with the pandas default encoding.
spam_df = pd.read_csv(SPAM_CSV_PATH, encoding='latin-1')
dataset_df = pd.read_csv(DATASET_CSV_PATH)

In [3]:
# Rename v1/v2 to LABEL/TEXT and keep only those two columns of spam_df;
# every other column of the raw file is discarded.
column_aliases = {'v1': 'LABEL', 'v2': 'TEXT'}
spam_df = spam_df.rename(columns=column_aliases).loc[:, ['LABEL', 'TEXT']]

In [4]:
# Stack spam_df on top of the LABEL/TEXT columns of dataset_df and renumber the rows.
# NOTE(review): nothing here removes rows that occur in both files or rows with a
# missing TEXT -- confirm whether the two corpora overlap before trusting the
# held-out score computed later.
frames_to_stack = [spam_df, dataset_df[['LABEL', 'TEXT']]]
combined_df = pd.concat(frames_to_stack, ignore_index=True)


In [5]:
# Label encode the target variable.
# The encoder is always fitted on the untouched string labels kept in LABEL_RAW, which
# makes this cell safe to re-run: fitting on the already-encoded LABEL column would
# teach the encoder integers, and inverse_transform would no longer give back the
# original class names.
if 'LABEL_RAW' not in combined_df.columns:
    # First run on this frame: remember the original labels before overwriting LABEL.
    combined_df['LABEL_RAW'] = combined_df['LABEL']
label_encoder = LabelEncoder()
# LABEL still receives the integer codes, so the cells that read it are unaffected.
combined_df['LABEL'] = label_encoder.fit_transform(combined_df['LABEL_RAW'])

In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
messages = combined_df['TEXT']
targets = combined_df['LABEL']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    targets,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Turn the raw text into TF-IDF features.
# The vectorizer is fitted on the training messages only and then reused, unchanged,
# to transform the test messages.
TFIDF_MAX_FEATURES = 3000  # cap on the number of features, kept small for efficiency
tfidf_vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [8]:
# Build the XGBoost classifier with log-loss as its evaluation metric (no other
# argument is passed) and fit it on the TF-IDF training matrix.
xgb_params = {'eval_metric': 'logloss'}
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train_tfidf, y_train)


In [9]:
# Predict on the test set: run the fitted model on the TF-IDF test matrix.
# The result is compared against y_test when the accuracy is computed.
y_pred = xgb_model.predict(X_test_tfidf)

In [10]:
# Score the predictions against the held-out labels and report the result as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 93.42%


In [11]:
# Classify one raw message string with the fitted pipeline
def predict_message(message):
    """Return the predicted class label for a single message.

    The message is vectorised with the module-level ``tfidf_vectorizer``,
    classified by ``xgb_model``, and the model output is mapped back to its
    original label through ``label_encoder``.

    Args:
        message: Text of the message to classify.

    Returns:
        The first (and only) element of the decoded prediction array.
    """
    # transform() is given a one-element list because a single message is classified
    features = tfidf_vectorizer.transform([message])
    encoded_class = xgb_model.predict(features)
    return label_encoder.inverse_transform(encoded_class)[0]



In [15]:
# Example usage: classify a sample message that reports suspicious account activity
# and asks the reader to follow a link to reset their security details and password.
input_message = "Account Warning. Suspicious Activity has been detected on your account. To reset security details and password, click the following link. https://ebanking.meezanbank.com/AmbitRetailFrontEnd/login?se=y_sHexp"
# predict_message returns the decoded label, which is printed after the "Prediction: " prefix.
print(f"Prediction: {predict_message(input_message)}")

Prediction: Smishing
