In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, IsolationForest, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from transformers import BertTokenizer, TFBertModel
from xgboost import XGBClassifier

In [None]:
# Read the two halves of the corpus and tag each row with its class as it is
# loaded: rows from Fake.csv get label 0, rows from True.csv get label 1.
fake_news_df = pd.read_csv('/content/drive/MyDrive/Dataset/Fake.csv').assign(label=0)
true_news_df = pd.read_csv('/content/drive/MyDrive/Dataset/True.csv').assign(label=1)

# Stack both frames into one table with a fresh 0..n-1 index.
df = pd.concat((fake_news_df, true_news_df), ignore_index=True)

# Minimal text normalisation: lower-case first, then turn missing articles
# into empty strings so every downstream step receives a str.
df['text'] = df['text'].str.lower().fillna('')

In [None]:
# Hold out 20% of the articles for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Tokenizer and TensorFlow encoder are loaded from the same pretrained
# checkpoint so their vocabularies match.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

def get_bert_embeddings(texts, batch_size=32):
    """Encode texts with BERT and return the first-token vector of each one.

    Uses the notebook-level ``tokenizer`` and ``bert_model``.

    Parameters
    ----------
    texts : iterable of str, or a single str
        The documents to encode. Any iterable works (pandas Series, NumPy
        array, list, tuple); a bare string is treated as one document.
    batch_size : int, default 32
        Number of documents sent through the model per forward pass.

    Returns
    -------
    numpy.ndarray
        One row per input text, taken from position 0 of
        ``last_hidden_state``. For empty input the result has shape
        ``(0, bert_model.config.hidden_size)``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # A lone string would otherwise be iterated character by character.
    if isinstance(texts, str):
        texts = [texts]
    # Materialise once: slicing a list is always positional (a Series with a
    # shuffled integer index is easy to mis-slice) and the tokenizer is always
    # handed a plain list, whatever container the caller used.
    texts = list(texts)

    # np.vstack rejects an empty list, so answer the empty case directly.
    if not texts:
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, return_tensors='tf', padding=True, truncation=True, max_length=512)
        outputs = bert_model(inputs)
        # Keep only the vector at sequence position 0 for every document.
        embeddings.append(outputs.last_hidden_state[:, 0, :].numpy())
    return np.vstack(embeddings)

# Encode both splits with BERT (one vector per article).
X_train_bert = get_bert_embeddings(X_train)
X_test_bert = get_bert_embeddings(X_test)

# Compress the BERT vectors to 500 components. The projection is learned on
# the training split only and then re-applied to the test split.
# random_state pins the seed PCA uses whenever it picks a randomized/arpack
# solver, so repeated runs produce the same components.
pca_bert = PCA(n_components=500, random_state=42)
X_train_bert_pca = pca_bert.fit_transform(X_train_bert)
X_test_bert_pca = pca_bert.transform(X_test_bert)

# Bag-of-words view of the same articles, capped at 10,000 terms. The
# vocabulary is fitted on the training split only.
# NOTE(review): .toarray() materialises a dense (n_samples, 10000) matrix;
# watch memory on large corpora.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

# Column layout of the combined matrix: TF-IDF terms first, BERT components after.
X_train_combined = np.hstack((X_train_tfidf, X_train_bert_pca))
X_test_combined = np.hstack((X_test_tfidf, X_test_bert_pca))

# Second PCA: squeeze the combined representation down to 500 features,
# again fitted on the training split and seeded for repeatability.
pca = PCA(n_components=500, random_state=42)
X_train_pca = pca.fit_transform(X_train_combined)
X_test_pca = pca.transform(X_test_combined)

# Autoencoder for anomaly detection: input -> 256 -> 128 -> 256 -> input.
input_dim = X_train_pca.shape[1]
input_layer = Input(shape=(input_dim,))
encoder = Dense(256, activation="relu")(input_layer)
encoder = Dense(128, activation="relu")(encoder)
decoder = Dense(256, activation="relu")(encoder)
# The reconstruction targets are PCA projections, which take negative and
# arbitrarily large values. A sigmoid output is confined to (0, 1) and could
# never reproduce them, so the output layer must be linear.
decoder = Dense(input_dim, activation="linear")(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the network to reproduce its own input; 20% of the rows are held back
# by Keras for validation.
autoencoder.fit(X_train_pca, X_train_pca, epochs=5, batch_size=256, shuffle=True, validation_split=0.2)

# Per-sample reconstruction error (mean squared difference across features).
X_train_pred = autoencoder.predict(X_train_pca)
mse = np.mean(np.power(X_train_pca - X_train_pred, 2), axis=1)
error_df = pd.DataFrame({'reconstruction_error': mse, 'true_class': y_train})

# Two independent outlier signals are combined:
#   * Isolation Forest flags roughly 5% of the rows as outliers (-1);
#     random_state makes the forest repeatable between runs.
#   * the autoencoder cut-off is the 95th percentile of the training errors.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
y_pred_iso = iso_forest.fit_predict(X_train_pca)
threshold = np.percentile(mse, 95)

# A row is kept only if BOTH detectors consider it normal. The mask is built
# once so features and labels are guaranteed to be filtered identically.
train_keep_mask = (mse <= threshold) & (y_pred_iso == 1)
X_train_filtered = X_train_pca[train_keep_mask]
y_train_filtered = y_train[train_keep_mask]

# Unsupervised look at the filtered training set with two clusterers.
# NOTE(review): neither cluster assignment is referenced again in the cells
# visible here, so the classifiers below are trained on the filtered features
# themselves, not on cluster labels.
kmeans = KMeans(n_clusters=5, random_state=42)
dbscan = DBSCAN(eps=0.5, min_samples=5)
X_train_clusters = kmeans.fit_predict(X_train_filtered)
X_train_clusters_db = dbscan.fit_predict(X_train_filtered)

# Candidate models for the ensemble, all seeded with 42.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
lr = LogisticRegression(max_iter=1000, random_state=42)
svm = SVC(kernel='linear', probability=True, random_state=42)
xgb = XGBClassifier(n_estimators=100, random_state=42)

# Fit every model on its own, in the order rf, gb, lr, svm, xgb.
# NOTE(review): scikit-learn documents that StackingClassifier fits clones of
# the estimators it is given, so these stand-alone fits look redundant for the
# stack itself - confirm nothing else needs the individually fitted models.
for base_model in (rf, gb, lr, svm, xgb):
    base_model.fit(X_train_filtered, y_train_filtered)

# Stack four of the models; logistic regression combines their outputs.
estimators = list(zip(('rf', 'gb', 'svm', 'xgb'), (rf, gb, svm, xgb)))
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=lr)

stacking_clf.fit(X_train_filtered, y_train_filtered)

# Score the held-out split with both anomaly detectors fitted on the training data.
X_test_pred = autoencoder.predict(X_test_pca)
mse_test = np.mean((X_test_pca - X_test_pred) ** 2, axis=1)
y_pred_iso_test = iso_forest.predict(X_test_pca)

# Keep a test row only when its reconstruction error is within the training
# cut-off AND the Isolation Forest labels it an inlier (1).
X_test_filtered = X_test_pca[np.logical_and(mse_test <= threshold, y_pred_iso_test == 1)]

# Classify the surviving rows with the stacked ensemble.
y_pred = stacking_clf.predict(X_test_filtered)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5






In [None]:
# Score the held-out split with both anomaly detectors fitted on the training data.
X_test_pred = autoencoder.predict(X_test_pca)
mse_test = np.mean((X_test_pca - X_test_pred) ** 2, axis=1)
y_pred_iso_test = iso_forest.predict(X_test_pca)

# One shared mask keeps features and labels aligned: a row survives only when
# its reconstruction error is within the cut-off and the forest calls it an inlier.
test_keep_mask = (mse_test <= threshold) & (y_pred_iso_test == 1)
X_test_filtered = X_test_pca[test_keep_mask]
y_test_filtered = y_test[test_keep_mask]

# Classify the surviving rows.
y_pred = stacking_clf.predict(X_test_filtered)

# Metrics are computed on the surviving rows only; test rows rejected as
# anomalies do not count towards the reported numbers.
accuracy = accuracy_score(y_test_filtered, y_pred)
report = classification_report(y_test_filtered, y_pred)

print(f'Accuracy: {accuracy}', 'Classification Report:', report, sep='\n')

Accuracy: 0.9946025515210991
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4244
           1       1.00      0.99      0.99      3908

    accuracy                           0.99      8152
   macro avg       0.99      0.99      0.99      8152
weighted avg       0.99      0.99      0.99      8152



In [None]:
import joblib
import numpy as np
import tensorflow as tf

# Prefix for every artifact written below.
# NOTE: there is no trailing slash and the prefix is joined by plain string
# concatenation, so the files end up next to the folder with names such as
# '/content/drive/MyDrive/Dataautoencoder.h5'. The loading code in this
# notebook reads exactly those paths, so the naming must stay in sync.
save_dir = '/content/drive/MyDrive/Data'

# Keras model (HDF5 format).
autoencoder.save(save_dir + 'autoencoder.h5')

# Fitted scikit-learn objects: the two models first, then the transformers.
for filename, fitted_object in (
    ('iso_forest.pkl', iso_forest),
    ('stacking_clf.pkl', stacking_clf),
    ('pca_bert.pkl', pca_bert),
    ('tfidf_vectorizer.pkl', tfidf_vectorizer),
    ('pca.pkl', pca),
):
    joblib.dump(fitted_object, save_dir + filename)

# Hugging Face tokenizer files.
tokenizer.save_pretrained(save_dir + 'tokenizer')

# Feature matrices and labels, before and after anomaly filtering.
for filename, array in (
    ('X_train_pca.npy', X_train_pca),
    ('X_test_pca.npy', X_test_pca),
    ('X_train_filtered.npy', X_train_filtered),
    ('y_train_filtered.npy', y_train_filtered),
    ('X_test_filtered.npy', X_test_filtered),
    ('y_test_filtered.npy', y_test_filtered),
):
    np.save(save_dir + filename, array)


NameError: name 'autoencoder' is not defined

In [None]:
!pip install scikit-learn==1.2.2

Collecting scikit-learn==1.2.2
  Downloading scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.5.2
    Uninstalling scikit-learn-1.5.2:
      Successfully uninstalled scikit-learn-1.5.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mlxtend 0.23.2 requires scikit-learn>=1.3.1, but you have scikit-learn 1.2.2 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.2.2


In [None]:
import joblib
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

# Pretrained BERT tokenizer/encoder are fetched again from the hub.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Fitted transformers and models written by the save cell. The file names
# carry a 'Data' prefix because the save cell concatenates its directory
# string without a path separator.
# NOTE: joblib.load unpickles arbitrary objects - only load files you created.
pca_bert = joblib.load('/content/drive/MyDrive/Datapca_bert.pkl')
tfidf_vectorizer = joblib.load('/content/drive/MyDrive/Datatfidf_vectorizer.pkl')
pca = joblib.load('/content/drive/MyDrive/Datapca.pkl')
autoencoder = tf.keras.models.load_model('/content/drive/MyDrive/Dataautoencoder.h5')
iso_forest = joblib.load('/content/drive/MyDrive/Dataiso_forest.pkl')
stacking_clf = joblib.load('/content/drive/MyDrive/Datastacking_clf.pkl')

# `threshold` (the reconstruction-error cut-off used by detect_and_predict) is
# not one of the saved artifacts, so in a fresh session it would be undefined.
# Rebuild it the same way the training cell did: the 95th percentile of the
# autoencoder's per-sample reconstruction error on the saved training features.
X_train_pca = np.load('/content/drive/MyDrive/DataX_train_pca.npy')
train_reconstruction = autoencoder.predict(X_train_pca)
train_mse = np.mean(np.power(X_train_pca - train_reconstruction, 2), axis=1)
threshold = np.percentile(train_mse, 95)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
def preprocess_user_input(text):
    """Turn one piece of text into the feature row the models were trained on.

    Relies on the notebook-level ``tokenizer``, ``bert_model``, ``pca_bert``,
    ``tfidf_vectorizer`` and ``pca`` objects.

    Parameters
    ----------
    text : str
        The text to featurise. It is wrapped in a one-element list for the
        TF-IDF step, so a single document is expected.

    Returns
    -------
    The output of ``pca.transform`` applied to the TF-IDF columns followed by
    the PCA-reduced BERT columns.
    """
    # BERT branch: position-0 hidden state, then the BERT-specific PCA.
    encoded = tokenizer(text, return_tensors='tf', padding=True, truncation=True, max_length=512)
    cls_vectors = bert_model(encoded).last_hidden_state[:, 0, :].numpy()
    bert_part = pca_bert.transform(cls_vectors)

    # TF-IDF branch, densified so it can be concatenated.
    tfidf_part = tfidf_vectorizer.transform([text]).toarray()

    # Same column order as in training: TF-IDF first, BERT second, followed by
    # the final PCA projection.
    return pca.transform(np.concatenate((tfidf_part, bert_part), axis=1))


In [None]:
def detect_and_predict(text):
    """Classify one text unless the anomaly detectors reject it.

    Reads ``autoencoder``, ``iso_forest``, ``stacking_clf`` and ``threshold``
    from the notebook namespace and featurises the text with
    ``preprocess_user_input``.

    Parameters
    ----------
    text : str
        The text to check and classify.

    Returns
    -------
    The output of ``stacking_clf.predict`` for the text, or the string
    ``"Input detected as anomaly"`` when the reconstruction error exceeds
    ``threshold`` or the Isolation Forest does not return 1.
    """
    features = preprocess_user_input(text)

    # Signal 1: how badly the autoencoder reproduces this row.
    rebuilt = autoencoder.predict(features)
    mse = np.mean((features - rebuilt) ** 2, axis=1)

    # Signal 2: Isolation Forest verdict (1 = inlier).
    is_anomaly = iso_forest.predict(features)

    # Both values are arrays; the `and` below relies on each holding exactly
    # one element, i.e. on a single input text.
    if not (mse <= threshold and is_anomaly == 1):
        return "Input detected as anomaly"
    return stacking_clf.predict(features)


In [None]:
def preprocess_and_predict(texts, mse_threshold=None):
    """Featurise a batch of texts, drop anomalies and classify the rest.

    Relies on the notebook-level ``tokenizer``, ``bert_model``, ``pca_bert``,
    ``tfidf_vectorizer``, ``pca``, ``autoencoder``, ``iso_forest`` and
    ``stacking_clf`` objects.

    Parameters
    ----------
    texts : iterable of str
        Raw texts; they are lower-cased before featurisation.
    mse_threshold : float, optional
        Largest autoencoder reconstruction error that still counts as normal.
        When omitted, the notebook-level ``threshold`` is used if it exists.
        Only if neither is available does the function fall back to the 95th
        percentile of the errors of *this* batch. That fallback is weak: a
        single text can never exceed its own percentile, and a batch always
        loses its highest-error rows.

    Returns
    -------
    predictions
        Output of ``stacking_clf.predict`` for the texts that were kept, in
        input order; an empty list when nothing was kept.
    filtered_indices : numpy.ndarray of bool
        One flag per input text, True where the text was kept.
    """
    # Preprocess texts (same step as during training).
    texts = [text.lower() for text in texts]

    # Nothing to score: np.vstack below cannot handle an empty batch.
    if not texts:
        return [], np.zeros(0, dtype=bool)

    def get_bert_embeddings(texts, batch_size=32):
        """Return the position-0 BERT vector of every text, batch by batch."""
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            inputs = tokenizer(batch_texts, return_tensors='tf', padding=True, truncation=True, max_length=512)
            outputs = bert_model(inputs)
            batch_embeddings = outputs.last_hidden_state[:, 0, :].numpy()
            embeddings.append(batch_embeddings)
        return np.vstack(embeddings)

    bert_embeddings = get_bert_embeddings(texts)
    bert_embeddings_pca = pca_bert.transform(bert_embeddings)

    # Get TF-IDF features
    tfidf_features = tfidf_vectorizer.transform(texts).toarray()

    # Combine in the training column order: TF-IDF first, BERT second.
    combined_features = np.hstack((tfidf_features, bert_embeddings_pca))

    # Apply the final PCA
    combined_features_pca = pca.transform(combined_features)

    # Anomaly signals: reconstruction error and Isolation Forest verdict.
    reconstructions = autoencoder.predict(combined_features_pca)
    mse = np.mean(np.power(combined_features_pca - reconstructions, 2), axis=1)
    iso_predictions = iso_forest.predict(combined_features_pca)

    # Resolve the cut-off: explicit argument, then the training-time value in
    # the notebook namespace, then the legacy per-batch percentile.
    if mse_threshold is None:
        mse_threshold = globals().get('threshold')
    if mse_threshold is None:
        mse_threshold = np.percentile(mse, 95)

    # Keep a text only if both detectors consider it normal.
    filtered_indices = (mse <= mse_threshold) & (iso_predictions == 1)
    filtered_features = combined_features_pca[filtered_indices]

    # Check if there are samples left after filtering
    if filtered_features.shape[0] == 0:
        return [], filtered_indices

    # Make predictions using stacking classifier
    predictions = stacking_clf.predict(filtered_features)

    return predictions, filtered_indices

# Example usage
new_texts = [
    " West Bengal is in Australia"
]

predictions, filtered_indices = preprocess_and_predict(new_texts)

# `predictions` holds one entry per KEPT text only, while `filtered_indices`
# has one flag per input text. Zipping texts with predictions would pair them
# up wrongly as soon as any text is filtered out (and print nothing when all
# are), so walk the flags and take the next prediction only for kept texts.
remaining_predictions = iter(predictions)
for text, kept in zip(new_texts, filtered_indices):
    if kept:
        pred = next(remaining_predictions)
        label = "True" if pred == 1 else "Fake"
        print(f"Text: {text}\nPrediction: {label}\n")
    else:
        print(f"Text: {text}\nPrediction: Anomaly detected, no prediction made.\n")




In [None]:
def preprocess_and_predict(texts, mse_threshold=None):
    """Featurise a batch of texts, drop anomalies and classify the rest.

    Relies on the notebook-level ``tokenizer``, ``bert_model``, ``pca_bert``,
    ``tfidf_vectorizer``, ``pca``, ``autoencoder``, ``iso_forest`` and
    ``stacking_clf`` objects.

    Parameters
    ----------
    texts : iterable of str
        Raw texts; they are lower-cased before featurisation.
    mse_threshold : float, optional
        Largest autoencoder reconstruction error that still counts as normal.
        When omitted, the notebook-level ``threshold`` is used if it exists.
        Only if neither is available does the function fall back to the 95th
        percentile of the errors of *this* batch. That fallback is weak: a
        single text can never exceed its own percentile, and a batch always
        loses its highest-error rows.

    Returns
    -------
    predictions
        Output of ``stacking_clf.predict`` for the texts that were kept, in
        input order; an empty list when nothing was kept.
    filtered_indices : numpy.ndarray of bool
        One flag per input text, True where the text was kept.
    """
    # Preprocess texts (same step as during training).
    texts = [text.lower() for text in texts]

    # Nothing to score: np.vstack below cannot handle an empty batch.
    if not texts:
        return [], np.zeros(0, dtype=bool)

    def get_bert_embeddings(texts, batch_size=32):
        """Return the position-0 BERT vector of every text, batch by batch."""
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            inputs = tokenizer(batch_texts, return_tensors='tf', padding=True, truncation=True, max_length=512)
            outputs = bert_model(inputs)
            batch_embeddings = outputs.last_hidden_state[:, 0, :].numpy()
            embeddings.append(batch_embeddings)
        return np.vstack(embeddings)

    bert_embeddings = get_bert_embeddings(texts)
    bert_embeddings_pca = pca_bert.transform(bert_embeddings)

    # Get TF-IDF features
    tfidf_features = tfidf_vectorizer.transform(texts).toarray()

    # Combine in the training column order: TF-IDF first, BERT second.
    combined_features = np.hstack((tfidf_features, bert_embeddings_pca))

    # Apply the final PCA
    combined_features_pca = pca.transform(combined_features)

    # Anomaly signals: reconstruction error and Isolation Forest verdict.
    reconstructions = autoencoder.predict(combined_features_pca)
    mse = np.mean(np.power(combined_features_pca - reconstructions, 2), axis=1)
    iso_predictions = iso_forest.predict(combined_features_pca)

    # Resolve the cut-off: explicit argument, then the training-time value in
    # the notebook namespace, then the legacy per-batch percentile.
    if mse_threshold is None:
        mse_threshold = globals().get('threshold')
    if mse_threshold is None:
        mse_threshold = np.percentile(mse, 95)

    # Keep a text only if both detectors consider it normal.
    filtered_indices = (mse <= mse_threshold) & (iso_predictions == 1)
    filtered_features = combined_features_pca[filtered_indices]

    # Make predictions using stacking classifier if there are filtered samples
    if filtered_features.shape[0] > 0:
        predictions = stacking_clf.predict(filtered_features)
    else:
        predictions = []  # Nothing survived the filter

    return predictions, filtered_indices

# Example usage
new_texts = [
    "World War III started"
]
predictions, filtered_indices = preprocess_and_predict(new_texts)

# `predictions` holds one entry per KEPT text only, while `filtered_indices`
# has one flag per input text. Testing the truthiness of `predictions` is not
# a safe "is there a prediction?" check: a single prediction of 0 ("Fake") is
# falsy and would be reported as an anomaly, and an array with several
# elements raises on bool(). Walk the flags instead and take the next
# prediction only for texts that were kept.
remaining_predictions = iter(predictions)
for text, kept in zip(new_texts, filtered_indices):
    if kept:
        pred = next(remaining_predictions)
        label = "True" if pred == 1 else "Fake"
        print(f"Text: {text}\nPrediction: {label}\n")
    else:
        print(f"Text: {text}\nPrediction: Anomaly detected, no prediction made.\n")

Text: World War III started
Prediction: Anomaly detected, no prediction made.

