In [None]:
# Mount Google Drive inside the Colab runtime; the dataset and GloVe paths
# read later in this notebook live under '/content/drive'.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# The triple-quoted block below is a bare string expression: none of the
# commands inside it are executed, so nothing is installed by this cell.
'''!pip install plotly
!pip install --upgrade nbformat
!pip install nltk
!pip install spacy # spaCy is an open-source software library for advanced natural language processing
!pip install WordCloud
!pip install gensim # Gensim is an open-source library for unsupervised topic modeling and natural language processing
!pip install jupyterthemes
import nltk

'''

import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import nltk
# Fetches the 'punkt_tab' resource at import time; presumably needed by the
# nltk.word_tokenize call used during preprocessing -- TODO confirm.
nltk.download('punkt_tab')
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.utils import simple_preprocess
# NOTE: this rebinds STOPWORDS, replacing the wordcloud STOPWORDS imported above.
from gensim.parsing.preprocessing import STOPWORDS
# import keras
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, LSTM, Dropout, Conv1D, MaxPool1D, Bidirectional
from tensorflow.keras.models import Model

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split  # Ensure this is imported
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/REVIEW/Pheme_dataset_balanced.csv')

# Coerce the label column to numbers; values that cannot be parsed become NaN.
df['is_rumor'] = pd.to_numeric(df['is_rumor'], errors='coerce')
# Discard only rows that lack a usable tweet or label. A bare dropna() would
# also throw away rows whose only gap is in a column this notebook never reads.
df = df.dropna(subset=['text', 'is_rumor'])

# Download stopwords from nltk if not already downloaded
nltk.download("stopwords")
# Held as a set because membership is tested once per token during
# preprocessing; a list would be scanned linearly every time.
stop_words = set(stopwords.words('english'))

# Text cleaning function
def clean_text(text):
    """Return ``text`` lowercased with links, @mentions and non-letters removed.

    The substitutions run in a fixed order: URL-like tokens first, then
    ``@handle`` mentions, then every character that is neither an ASCII
    letter nor whitespace (punctuation, digits, symbols). Whitespace is left
    untouched, so removed tokens may leave runs of spaces behind.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    # (pattern, flags) pairs, applied top to bottom.
    substitutions = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),  # links
        (r'@\w+', 0),                                # mentions
        (r'[^a-zA-Z\s]', 0),                         # punctuation and numbers
    )
    cleaned = text
    for pattern, pattern_flags in substitutions:
        cleaned = re.sub(pattern, '', cleaned, flags=pattern_flags)
    return cleaned.lower()

# Overwrite the 'text' column with its cleaned form, one tweet at a time
df['text'] = df['text'].map(clean_text)

# Preprocess function to remove stopwords and short words
def preprocess(text):
    """Tokenize ``text`` and keep only informative tokens.

    A token survives when it is longer than two characters and is not in
    the module-level ``stop_words`` collection.

    Args:
        text: The string to tokenize with ``nltk.word_tokenize``.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        nothing survives).
    """
    kept = [
        word
        for word in nltk.word_tokenize(text)
        if len(word) > 2 and word not in stop_words
    ]
    return " ".join(kept)

# Apply preprocess function to get cleaned text
df['cleaned_text'] = df['text'].apply(preprocess)

# Tokenization and Padding
# NOTE(review): the vocabulary is fit on every row, including rows that end up
# in the test split below; fit on the training rows only if a strictly
# leak-free evaluation is required.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['cleaned_text'])
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])

# Pad every sequence to the length of the longest one. The zeros go at the
# FRONT: the classifier in this notebook feeds an Embedding without masking
# into an LSTM that keeps only its final state, so trailing zeros would make
# that state depend on a run of padding steps instead of the last real tokens.
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')

# Prepare labels
labels = df['is_rumor'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Output shape of the training data for verification
print(f"Training data shape: {X_train.shape}, Labels shape: {y_train.shape}")
print(f"Testing data shape: {X_test.shape}, Labels shape: {y_test.shape}")


In [None]:
# Read the GloVe text file into a {token: float32 vector} lookup. Each line is
# split on whitespace: the first field is the token, the rest its coefficients.
embedding_index = {}
glove_path = '/content/drive/My Drive/Colab Notebooks/REVIEW/glove.6B.100d.txt'
with open(glove_path, 'r', encoding='utf-8') as glove_file:
    for record in glove_file:
        fields = record.split()
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Create an embedding matrix for the words in our tokenizer
embedding_dim = 100

# Row 0 is reserved for the padding index, hence the +1.
total_words = len(tokenizer.word_index) + 1
# The tokenizer never emits an index at or above its num_words cap, so rows
# beyond the cap would never be looked up. Size the table to the indices that
# can actually occur instead of to the full vocabulary.
if tokenizer.num_words:
    total_words = min(total_words, tokenizer.num_words)
embedding_matrix = np.zeros((total_words, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i >= total_words:
        # Word ranked outside the cap: it has no row in the table.
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        # Words missing from GloVe keep their all-zero row.
        embedding_matrix[i] = embedding_vector


# Binary rumour classifier: GloVe-initialised embedding -> LSTM -> dense head.
model = Sequential()
# Seed the embedding table through its initializer, the documented way to load
# pre-trained vectors; the legacy `weights=` constructor argument is not
# accepted by every Keras release. The table stays trainable so the vectors
# are fine-tuned on this corpus.
model.add(Embedding(
    input_dim=total_words,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=True,
))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Use the 10% validation split: stop once validation loss has not improved for
# 5 epochs and roll back to the best epoch, rather than always running all 50
# epochs and keeping whatever the last one produced.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
model.fit(
    X_train,
    y_train,
    batch_size=64,
    validation_split=0.1,
    epochs=50,
    callbacks=[early_stopping],
)
from sklearn.metrics import accuracy_score, classification_report, precision_score, f1_score

# Score the held-out set and turn each sigmoid output into a hard 0/1 label
# (1 only when the probability is strictly above 0.5).
predictions_prob = model.predict(X_test)
predictions = (predictions_prob > 0.5).astype(int).ravel().tolist()

# Compute the summary metrics up front, then report them in order.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

print("Model Accuracy: ", accuracy)
print(classification_report(y_test, predictions))

print("Precision:", precision)

print("F1-Score:", f1)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, precision_score, f1_score
import matplotlib.pyplot as plt

# Score the held-out set and threshold the sigmoid output at 0.5.
pred = model.predict(X_test)
prediction = (pred.ravel() > 0.5).astype(int).tolist()

y_test_list = y_test.tolist()

# Each entry pairs a row position in X_test with that row's padded sequence.
false_positives = []
false_negatives = []

for i, (predicted, actual) in enumerate(zip(prediction, y_test_list)):
    if predicted == 1 and actual == 0:
        false_positives.append((i, X_test[i]))
    elif predicted == 0 and actual == 1:
        false_negatives.append((i, X_test[i]))

# Print total counts
num_false_positives = len(false_positives)
num_false_negatives = len(false_negatives)

print(f"\nTotal False Positives: {num_false_positives}")
print(f"Total False Negatives: {num_false_negatives}")

# Plotting the results
# Named error_labels so the `labels` target array built during preprocessing
# is not overwritten by a list of strings.
error_labels = ['False Positives', 'False Negatives']
counts = [num_false_positives, num_false_negatives]

# Headroom and label offset scale with the tallest bar, so the chart stays
# readable whether the counts are in the tens or the thousands.
headroom = max(1.0, 0.15 * max(counts))
offset = 0.15 * headroom

plt.figure(figsize=(8, 5))
plt.bar(error_labels, counts, color=['blue', 'red'])
plt.title('Count of False Positives and False Negatives')
plt.ylabel('Count')
plt.xlabel('Classification Error Type')
plt.ylim(0, max(counts) + headroom)
for position, count in enumerate(counts):
    plt.text(position, count + offset, str(count), ha='center')

plt.show()


In [None]:
# Calculate confusion matrix and plot it
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Cross-tabulate true labels against the thresholded predictions.
cm = confusion_matrix(y_test, predictions)

# Draw the matrix as an annotated heatmap on an explicit Axes.
class_names = ['Not Rumor', 'Rumor']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
ax.set_title('Confusion Matrix')
plt.show()