<a href="https://colab.research.google.com/github/ShabnaIlmi/SpamSense-AI/blob/main/Merging_the_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing the Relevant Libraries**

In [1]:
# Importing the necessary libraries
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import hstack as sparse_hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# **Mounting the Google Drive**

In [2]:
# Mounting the google drive
# Later cells read and write files under /content/drive/MyDrive/..., so this mount has to
# succeed before anything else in the notebook can run.
# NOTE(review): the saved output of this cell is a MessageError (credential propagation was
# unsuccessful) — re-run the cell and complete the authorization prompt.
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

# **Loading the Datasets**

In [None]:
# Defining the file path
file_path_true = "/content/drive/MyDrive/Multi-Type-Spam-Detection/Dataset/True_Cleaned.csv"

# Loading the dataset as UTF-8. No `sep` is passed, so pandas uses its default comma delimiter.
# NOTE(review): this comment used to say the delimiter is a semicolon — confirm the file's
# actual delimiter and pass sep=';' explicitly if that is the case.
data_true = pd.read_csv(file_path_true, encoding="utf-8")

In [None]:
# Defining the file path
file_path_false = "/content/drive/MyDrive/Multi-Type-Spam-Detection/Dataset/Fake_Cleaned.csv"

# Loading the dataset as UTF-8. No `sep` is passed, so pandas uses its default comma delimiter.
# NOTE(review): this comment used to say the delimiter is a semicolon — confirm the file's
# actual delimiter and pass sep=';' explicitly if that is the case.
data_fake = pd.read_csv(file_path_false, encoding="utf-8")

In [None]:
# Displaying the first rows of the dataset loaded from True_Cleaned.csv
data_true.head()

In [None]:
# Displaying the first rows of the dataset loaded from Fake_Cleaned.csv
data_fake.head()

In [None]:
# Merging both the Datasets (rows of data_true first, then rows of data_fake)
# ignore_index=True gives the merged frame a fresh 0..n-1 index; without it the row labels
# of the two source frames are carried over unchanged and can repeat.
data = pd.concat([data_true, data_fake], axis=0, ignore_index=True)

In [None]:
# Displaying Dataset Information (columns, non-null counts and dtypes) for the merged frame
data.info()

# **Exploratory Data Analysis**

# **Identifying Categorical and Numerical Columns**

In [None]:
# Grouping the column labels by dtype: int64/float64 columns are treated as numerical,
# object columns as categorical
numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = data.select_dtypes(include='object').columns

# **Categorical Features**

In [None]:
# Printing the heading and the categorical column labels, each on its own line
print("\nCategorical Features:\n", categorical_columns, sep="\n")

**Unique Values and Their Counts for Each Categorical Feature**

In [None]:
# Displaying the Unique Values and Their Counts Relevant to Each Categorical Column
print("Unique values and their count relevant to each categorical feature:\n")
for column in categorical_columns:
    # value_counts() already lists every distinct (non-null) value with its frequency,
    # so a separate unique() pass over the column is not needed.
    value_counts = data[column].value_counts()
    print(value_counts)
    print(" ")

# **Numerical Features**

In [None]:
# Printing the heading and the numerical column labels, each on its own line
print("\nNumerical Features:\n", numerical_columns, sep="\n")

# **Visualization of the Target Variable**

In [None]:
# Bar chart of how many rows carry each 'status' value
fig, ax = plt.subplots()
sns.countplot(data=data, x='status', ax=ax)
ax.set_title('Target Variable Distribution')
plt.show()

# **Data Preprocessing**

**Handling the null values in the dataset.**

In [None]:
# Step 1: Dropping the null values in the text column
# Rows are removed from `data` in place; nulls in any other column are left untouched.
data.dropna(subset=['text'], inplace=True)

In [None]:
# Step 2: Dropping the 'subject' column
# Done in place, so re-running this cell once the column is gone raises a KeyError.
data.drop(columns=['subject'], inplace=True)

In [None]:
# Displaying Dataset Information again to confirm the row and column removals above
data.info()

# **Categorical Columns**

# **Data Encoding**

**Applying Label Encoding to the Categorical Columns**

In [None]:
# Encoding the Categorical Column
from sklearn.preprocessing import LabelEncoder
import pickle
import os

# Google Drive Path
save_path = "/content/drive/MyDrive/Multi-Type-Spam-Detection/Encoders/"

# Listing the columns for encoding
encoding_columns = ['status']

# Encoding the categorical features. Each fitted encoder is kept (keyed by column name)
# so the integer codes can be mapped back to the original labels later.
label_encoders = {}

for feature in encoding_columns:
    label_encoder = LabelEncoder()
    data[feature] = label_encoder.fit_transform(data[feature])
    label_encoders[feature] = label_encoder

# Create the target directory when it is missing; exist_ok=True replaces the separate
# existence check and is safe to re-run.
os.makedirs(save_path, exist_ok=True)

# Save the encoders
encoder_file = os.path.join(save_path, 'label_encoders.pkl')
with open(encoder_file, 'wb') as file:
    pickle.dump(label_encoders, file)

print(f"Label Encoding Applied and Encoders Saved Successfully at: {encoder_file} 🎯")

In [None]:
# The merged frame in this notebook is called `data` (there is no `df`).
# Missing values are replaced with empty strings first, because only null 'text' rows were
# dropped earlier and TfidfVectorizer raises on NaN documents.
title_documents = data['title'].fillna('').astype(str)
text_documents = data['text'].fillna('').astype(str)

# TF-IDF for 'title' (unigrams and bigrams, English stop words removed, at most 5000 terms)
tfidf_title = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,2))
df_title_tfidf = tfidf_title.fit_transform(title_documents)

# Save the title vectorizer
with open("/content/drive/MyDrive/Multi-Type-Spam-Detection/Encoders/tfidf_title.pkl", "wb") as f:
    pickle.dump(tfidf_title, f)

# TF-IDF for 'text' (same settings, separate vocabulary)
tfidf_text = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,2))
df_text_tfidf = tfidf_text.fit_transform(text_documents)

# Save the text vectorizer
with open("/content/drive/MyDrive/Multi-Type-Spam-Detection/Encoders/tfidf_text.pkl", "wb") as f:
    pickle.dump(tfidf_text, f)

In [None]:
# Displaying dataset Information after encoding ('status' now holds the integer codes
# written by the label-encoding cell)
data.info()

In [None]:
# Step 3: Splitting the Target Variables and the Features
X = data.drop(columns=['status'])
y = data['status']

In [None]:
# Step 4: Standardizing the Features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Step 4: Splitting the Training and the Testing the Datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **The Target Variable**

In [None]:
# Displaying the Distribution of the 'status' Variable as fractions of the total (normalize=True)
y.value_counts(normalize=True)

In [None]:
# Bar chart of how many rows carry each 'status' value
fig, ax = plt.subplots()
sns.countplot(data=data, x='status', ax=ax)
ax.set_title('Target Variable Distribution')
plt.show()

# **Building the Model**

## **Model Based on Gradient Boosting (XGBoost)**

In [None]:
# Search space for the grid search: 3 * 3 * 3 * 2 * 2 = 108 parameter combinations
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.05, 0.1, 0.2],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
)

In [None]:
# Exhaustive grid search over param_grid with 5-fold cross-validation, scored by accuracy.
# n_jobs=-1 asks scikit-learn to run the fits in parallel on all available cores.
xgb = XGBClassifier(random_state=42)
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# Reporting the winning parameter combination and keeping the corresponding estimator
print("Best Hyperparameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
# Cross-Validation Score: 5-fold accuracy of the selected model on the training split,
# reported as the mean over the folds
cv_score = cross_val_score(
    best_model,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
print(f"Cross-Validation Score: {cv_score.mean():.4f}")

In [None]:
# Model Training
# NOTE(review): GridSearchCV refits the best estimator on the whole training set by default
# (refit=True), so this call most likely repeats work already done — confirm before removing.
best_model.fit(X_train, y_train)

In [None]:
# Predictions for the held-out test split
y_pred = best_model.predict(X_test)

In [None]:
# Evaluation Metrics on the test split: overall accuracy, then the per-class report
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("\nAccuracy Score:", test_accuracy)
print("\nClassification Report:\n", test_report)

In [None]:
# Confusion Matrix drawn as an annotated heatmap (rows: actual class, columns: predicted class)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Feature Importance Visualization
importances = np.asarray(best_model.feature_importances_)

# The bar labels must have exactly one entry per model input. `data.columns[:-1]` only
# satisfies that if the model was trained on the DataFrame columns with 'status' last,
# so the labels are chosen by matching lengths instead:
#   1. the title + text TF-IDF vocabularies (in that order), if the counts match;
#   2. the DataFrame columns other than 'status', if those match;
#   3. positional placeholder names otherwise.
tfidf_names = (
    [f"title: {term}" for term in tfidf_title.get_feature_names_out()]
    + [f"text: {term}" for term in tfidf_text.get_feature_names_out()]
)
frame_names = [str(col) for col in data.columns if col != 'status']
if len(tfidf_names) == len(importances):
    feature_names = np.array(tfidf_names)
elif len(frame_names) == len(importances):
    feature_names = np.array(frame_names)
else:
    feature_names = np.array([f"feature_{i}" for i in range(len(importances))])

# Thousands of bars are unreadable, so only the highest-scoring features are drawn.
# argsort is ascending, which puts the most important feature at the top of the barh chart.
TOP_N = 20
top_idx = np.argsort(importances)[-TOP_N:]

plt.figure(figsize=(12, 6))
plt.barh(feature_names[top_idx], importances[top_idx])
plt.title("Feature Importance")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.show()

In [None]:
import os
import joblib

model_path = '/content/drive/MyDrive/Multi-Type-Spam-Detection/Models/news_article_model.pkl'
scaler_path = '/content/drive/MyDrive/Multi-Type-Spam-Detection/Scaler/news_article_scaler.pkl'

# joblib.dump does not create missing parent folders, and nothing earlier in the notebook
# creates the Models/ or Scaler/ directories.
for target_file in (model_path, scaler_path):
    os.makedirs(os.path.dirname(target_file), exist_ok=True)

# Saving the Model
joblib.dump(best_model, model_path)
print("Model Saved Successfully ✅")

# Saving the Scaler (if you're using StandardScaler or MinMaxScaler)
joblib.dump(scaler, scaler_path)
print("Scaler Saved Successfully ✅")

In [None]:
# Loading the Model
# The model is written to the Models/ folder by the save cell (not Encoders/), so it has to
# be read back from the same location.
best_model_loaded = joblib.load('/content/drive/MyDrive/Multi-Type-Spam-Detection/Models/news_article_model.pkl')
print("Model Loaded Successfully ✅")

# Loading the Scaler
scaler_loaded = joblib.load('/content/drive/MyDrive/Multi-Type-Spam-Detection/Scaler/news_article_scaler.pkl')
print("Scaler Loaded Successfully ✅")

In [None]:
import joblib
import numpy as np
import pickle
from scipy.sparse import hstack as sparse_hstack

# Locations of the saved model, scaler, vectorizers and encoders
model_path = '/content/drive/MyDrive/Multi-Type-Spam-Detection/Models/news_article_model.pkl'
scaler_path = '/content/drive/MyDrive/Multi-Type-Spam-Detection/Scaler/news_article_scaler.pkl'
tfidf_title_path = '/content/drive/MyDrive/Multi-Type-Spam-Detection/Encoders/tfidf_title.pkl'
tfidf_text_path = '/content/drive/MyDrive/Multi-Type-Spam-Detection/Encoders/tfidf_text.pkl'
# The encoding cell writes 'label_encoders.pkl' (plural): a dict of encoders keyed by column name.
encoder_path = '/content/drive/MyDrive/Multi-Type-Spam-Detection/Encoders/label_encoders.pkl'

# Load the trained model and scaler
model = joblib.load(model_path)
scaler = joblib.load(scaler_path)

# Load the saved TF-IDF vectorizers
# (pickle/joblib files execute code on load — only load artifacts produced by this notebook)
with open(tfidf_title_path, 'rb') as f:
    tfidf_title = pickle.load(f)

with open(tfidf_text_path, 'rb') as f:
    tfidf_text = pickle.load(f)

# Load the label encoder for 'status'; it is used to decode predictions, not to build features
with open(encoder_path, 'rb') as f:
    label_encoders = pickle.load(f)
status_encoder = label_encoders['status']

# Preprocessing function (apply saved transformations)
def preprocess_input(title, text, status=None):
    """Build the single-row feature matrix for one article.

    Parameters
    ----------
    title : str
        Article title, vectorized with the saved title TF-IDF vectorizer.
    text : str
        Article body, vectorized with the saved text TF-IDF vectorizer.
    status : optional, ignored
        Accepted only so existing three-argument calls keep working. 'status' is the
        value the model predicts, so it must never be part of the input features.

    Returns
    -------
    A 1 x n_features matrix (title TF-IDF columns followed by text TF-IDF columns)
    after the saved scaler has been applied.
    """
    # Transform text data using the saved TF-IDF vectorizers (both results are 1-row sparse matrices)
    title_tfidf = tfidf_title.transform([title])
    text_tfidf = tfidf_text.transform([text])

    # Combine features column-wise, title block first
    input_features = sparse_hstack([title_tfidf, text_tfidf]).tocsr()

    # A scaler that centers the data (with_mean=True) cannot transform sparse input,
    # so fall back to a dense row in that case.
    # NOTE(review): the row should use the same representation (sparse vs. dense) the
    # model was trained on — confirm against the training cells.
    if getattr(scaler, "with_mean", False):
        input_features = input_features.toarray()

    # Apply the scaler
    return scaler.transform(input_features)

# Get user input ('status' is what the model predicts, so it is not asked for)
title = input("Enter the title of the news article: ")
text = input("Enter the text of the news article: ")

# Preprocess the input
processed_input = preprocess_input(title, text)

# Make the prediction using the trained model
prediction = model.predict(processed_input)
predicted_code = int(prediction[0])

# Decode the integer class back to the original 'status' label
predicted_label = status_encoder.inverse_transform([predicted_code])[0]
print(f"Predicted status: {predicted_label}")

# Output the result
# NOTE(review): this message assumes encoded class 1 means spam. LabelEncoder assigns codes
# in sorted label order — check status_encoder.classes_ to confirm the mapping.
print("This article is SPAM." if predicted_code == 1 else "This article is NOT spam.")