SVV

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
df = pd.read_csv("/content/fake news detection datasets.csv", sep=';', encoding="latin1")
df.columns = df.columns.str.strip()
irrelevant_cols = ['Dataset', 'References', 'Physical news content', 'Rating scale']
df = df.drop(columns=irrelevant_cols, errors='ignore')
numerical_cols = ['Year', 'Size', 'Extraction time']
for col in numerical_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.replace(r'[^\d.]', '', regex=True)
        df[col] = pd.to_numeric(df[col], errors='coerce')
        df[col] = df[col].fillna(df[col].median())
categorical_cols = ['Source', 'News Domain', 'Application area', 'Type of disinformation', 'Language', 'Media platform']
for col in categorical_cols:
    if col in df.columns:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col].astype(str))
if 'Availability' in df.columns:
    df['Availability'] = df['Availability'].map({'Yes': 1, 'No': 0})
    df['Availability'] = df['Availability'].fillna(0).astype(int)
    print(df['Availability'].unique())
print(df.info())
X = df.drop('Availability', axis=1, errors='ignore')
y = df['Availability']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_train = X_train.fillna(0)
print(X_train.dtypes)
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)
print("Model trained successfully!")
y_pred = svm_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

[1 0]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Year                    34 non-null     float64
 1   Source                  34 non-null     int64  
 2   News Domain             34 non-null     int64  
 3   Application area        34 non-null     int64  
 4   Type of disinformation  34 non-null     int64  
 5   Language                34 non-null     int64  
 6   Size                    34 non-null     float64
 7   Media platform          34 non-null     int64  
 8   Spontaneity             27 non-null     object 
 9   Availability            34 non-null     int64  
 10  Extraction time         34 non-null     float64
dtypes: float64(3), int64(7), object(1)
memory usage: 3.0+ KB
None
Year                      float64
Source                      int64
News Domain                 int64
Application area            int64

RNN

In [None]:
print(df.head())
print(df.shape)

Empty DataFrame
Columns: [id, news_url, title, tweet_ids]
Index: []
(0, 4)


In [None]:
print(df.columns)
print(df[['id', 'title']].head())

Index(['id', 'news_url', 'title', 'tweet_ids'], dtype='object')
Empty DataFrame
Columns: [id, title]
Index: []


In [None]:
df['id'] = pd.to_numeric(df['id'], errors='coerce')
df = df.dropna(subset=['id'])
df['id'] = df['id'].astype(int)
labels = (df['id'] > 500).astype(int)
print(labels.head())
print(labels.shape)

Series([], Name: id, dtype: int64)
(0,)


In [None]:
texts = df['title'].astype(str).values
print(texts[:5])
print(len(texts))

[]
0


In [None]:
print(df.head())
print(df.info())
print(df.shape)

Empty DataFrame
Columns: [id, news_url, title, tweet_ids]
Index: []
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         0 non-null      int64 
 1   news_url   0 non-null      object
 2   title      0 non-null      object
 3   tweet_ids  0 non-null      object
dtypes: int64(1), object(3)
memory usage: 0.0+ bytes
None
(0, 4)


In [None]:
print(df['id'].head())
print(df['id'].isna().sum())

Series([], Name: id, dtype: int64)
0


In [None]:
df['id'] = pd.to_numeric(df['id'], errors='coerce')
df = df.dropna(subset=['id'])
df['id'] = df['id'].astype(int)

In [None]:
print(df['title'].head())
print(df['title'].isna().sum())

Series([], Name: title, dtype: object)
0


In [None]:
df = df.dropna(subset=['title'])

In [None]:
texts = df['title'].astype(str).values
labels = (df['id'] > 500).astype(int)
print("Number of texts:", len(texts))
print("Number of labels:", len(labels))

Number of texts: 0
Number of labels: 0


In [None]:
labels = (df['id'] % 2 == 0).astype(int)

In [None]:
print("Missing values in 'id':", df['id'].isna().sum())
print("Missing values in 'title':", df['title'].isna().sum())
print(df[['id', 'title']].head())

Missing values in 'id': 0
Missing values in 'title': 0
Empty DataFrame
Columns: [id, title]
Index: []


In [None]:
df = df.dropna(subset=['id', 'title'])

In [None]:
texts = df['title'].astype(str).values
labels = (df['id'] > 500).astype(int)
print(f"Number of texts: {len(texts)}")
print(f"Number of labels: {len(labels)}")
if len(texts) == 0 or len(labels) == 0:
    print("Texts or labels are empty.")
    print(df.head())

Number of texts: 0
Number of labels: 0
Texts or labels are empty.
Empty DataFrame
Columns: [id, news_url, title, tweet_ids]
Index: []


LSTM

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
sequence_length = 10
vocabulary_size = 50
batch_size = 100
X = np.random.randint(1, vocabulary_size, size=(batch_size, sequence_length))
y = np.random.randint(0, 2, size=(batch_size,))
model = Sequential([
    Embedding(input_dim=vocabulary_size, output_dim=32, input_length=sequence_length),
    LSTM(64, return_sequences=False),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()
model.fit(X, y, epochs=5, batch_size=32, verbose=1)
new_data = np.random.randint(1, vocabulary_size, size=(1, sequence_length))
prediction = model.predict(new_data)
print("Prediction:", prediction)



Epoch 1/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.6263 - loss: 0.6919
Epoch 2/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5985 - loss: 0.6871 
Epoch 3/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6172 - loss: 0.6805 
Epoch 4/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5412 - loss: 0.6835  
Epoch 5/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6016 - loss: 0.6705
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step
Prediction: [[0.4544903]]


In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
data = pd.read_csv("/content/gossipcop_fake.csv")
titles = data['title'].astype(str)
labels = np.random.randint(0, 2, len(titles))
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(titles)
sequences = tokenizer.texts_to_sequences(titles)
max_len = 50
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_len),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")
predictions = model.predict(X_test)

Epoch 1/10




[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 72ms/step - accuracy: 0.4913 - loss: 0.6946 - val_accuracy: 0.4939 - val_loss: 0.6933
Epoch 2/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 54ms/step - accuracy: 0.4924 - loss: 0.6947 - val_accuracy: 0.4939 - val_loss: 0.6933
Epoch 3/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 63ms/step - accuracy: 0.5046 - loss: 0.6939 - val_accuracy: 0.5061 - val_loss: 0.6945
Epoch 4/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 67ms/step - accuracy: 0.5161 - loss: 0.6936 - val_accuracy: 0.5061 - val_loss: 0.6931
Epoch 5/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 54ms/step - accuracy: 0.5004 - loss: 0.6942 - val_accuracy: 0.5061 - val_loss: 0.6931
Epoch 6/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 53ms/step - accuracy: 0.5052 - loss: 0.6933 - val_accuracy: 0.5061 - val_loss: 0.6940
Epoch 7/10
[1m134/134[0m [

GRU(Gated Recurrent Units)

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
data = pd.read_csv("/content/gossipcop_fake.csv")
titles = data['title'].astype(str)
labels = np.random.randint(0, 2, len(titles))
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(titles)
sequences = tokenizer.texts_to_sequences(titles)
max_len = 50
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_len),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")
predictions = model.predict(X_test)

Epoch 1/10




[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 63ms/step - accuracy: 0.4839 - loss: 0.6945 - val_accuracy: 0.5042 - val_loss: 0.6931
Epoch 2/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 66ms/step - accuracy: 0.4880 - loss: 0.6950 - val_accuracy: 0.4958 - val_loss: 0.6932
Epoch 3/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 49ms/step - accuracy: 0.4993 - loss: 0.6937 - val_accuracy: 0.5042 - val_loss: 0.6935
Epoch 4/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 83ms/step - accuracy: 0.4959 - loss: 0.6944 - val_accuracy: 0.5042 - val_loss: 0.6935
Epoch 5/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 54ms/step - accuracy: 0.5070 - loss: 0.6930 - val_accuracy: 0.4958 - val_loss: 0.6933
Epoch 6/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 73ms/step - accuracy: 0.5163 - loss: 0.6937 - val_accuracy: 0.5042 - val_loss: 0.6933
Epoch 7/10
[1m134/134[0m 

Bidirectional

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
data = pd.read_csv("/content/gossipcop_fake.csv")
titles = data['title'].astype(str)
labels = np.random.randint(0, 2, len(titles))
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(titles)
sequences = tokenizer.texts_to_sequences(titles)
max_len = 50
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_len),
    GRU(64, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")
predictions = model.predict(X_test)

Epoch 1/10




[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 76ms/step - accuracy: 0.4929 - loss: 0.6947 - val_accuracy: 0.5005 - val_loss: 0.6932
Epoch 2/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 119ms/step - accuracy: 0.4738 - loss: 0.6955 - val_accuracy: 0.4995 - val_loss: 0.6942
Epoch 3/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 112ms/step - accuracy: 0.4793 - loss: 0.6953 - val_accuracy: 0.5005 - val_loss: 0.6933
Epoch 4/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 115ms/step - accuracy: 0.4937 - loss: 0.6935 - val_accuracy: 0.4995 - val_loss: 0.6934
Epoch 5/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 124ms/step - accuracy: 0.4867 - loss: 0.6949 - val_accuracy: 0.4995 - val_loss: 0.6932
Epoch 6/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 57ms/step - accuracy: 0.5024 - loss: 0.6932 - val_accuracy: 0.5005 - val_loss: 0.6932
Epoch 7/10
[1m134/134

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
data = pd.read_csv("/content/gossipcop_fake.csv")
titles = data['title'].astype(str)
labels = np.random.randint(0, 2, len(titles))
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(titles)
sequences = tokenizer.texts_to_sequences(titles)
max_len = 50
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")
predictions = model.predict(X_test)



Epoch 1/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 108ms/step - accuracy: 0.4990 - loss: 0.6937 - val_accuracy: 0.4939 - val_loss: 0.6952
Epoch 2/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 98ms/step - accuracy: 0.5819 - loss: 0.6753 - val_accuracy: 0.4948 - val_loss: 0.7195
Epoch 3/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 103ms/step - accuracy: 0.7668 - loss: 0.5119 - val_accuracy: 0.4948 - val_loss: 0.8833
Epoch 4/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 98ms/step - accuracy: 0.8848 - loss: 0.2915 - val_accuracy: 0.5052 - val_loss: 1.0048
Epoch 5/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 122ms/step - accuracy: 0.9204 - loss: 0.1995 - val_accuracy: 0.4995 - val_loss: 1.3768
Epoch 6/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 100ms/step - accuracy: 0.9317 - loss: 0.1666 - val_accuracy: 0.5042 - val_loss: 1.5005
Epoch 7/10


Stacked RNN

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
data = pd.read_csv("/content/gossipcop_fake.csv")
titles = data['title'].astype(str)
labels = np.random.randint(0, 2, len(titles))
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(titles)
sequences = tokenizer.texts_to_sequences(titles)
max_len = 50
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_len),
    LSTM(64, return_sequences=True),
    Dropout(0.5),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")
predictions = model.predict(X_test)



Epoch 1/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 101ms/step - accuracy: 0.5101 - loss: 0.6944 - val_accuracy: 0.5139 - val_loss: 0.6928
Epoch 2/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 159ms/step - accuracy: 0.5041 - loss: 0.6937 - val_accuracy: 0.5139 - val_loss: 0.6928
Epoch 3/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 140ms/step - accuracy: 0.4971 - loss: 0.6935 - val_accuracy: 0.5139 - val_loss: 0.6928
Epoch 4/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 93ms/step - accuracy: 0.5007 - loss: 0.6937 - val_accuracy: 0.5139 - val_loss: 0.6928
Epoch 5/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 93ms/step - accuracy: 0.5059 - loss: 0.6933 - val_accuracy: 0.4861 - val_loss: 0.6974
Epoch 6/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 95ms/step - accuracy: 0.4942 - loss: 0.6948 - val_accuracy: 0.5139 - val_loss: 0.6928
Epoch 7/10
[

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
data = pd.read_csv("/content/gossipcop_fake.csv")
titles = data['title'].astype(str)
labels = np.random.randint(0, 2, len(titles))
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(titles)
sequences = tokenizer.texts_to_sequences(titles)
max_len = 50
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_len),
    LSTM(64, return_sequences=True),
    Dropout(0.5),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")
predictions = model.predict(X_test)

Epoch 1/10




[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 111ms/step - accuracy: 0.4917 - loss: 0.6946 - val_accuracy: 0.5202 - val_loss: 0.6924
Epoch 2/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 104ms/step - accuracy: 0.5059 - loss: 0.6947 - val_accuracy: 0.4798 - val_loss: 0.6939
Epoch 3/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 115ms/step - accuracy: 0.5182 - loss: 0.6930 - val_accuracy: 0.4798 - val_loss: 0.6932
Epoch 4/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 108ms/step - accuracy: 0.4948 - loss: 0.6938 - val_accuracy: 0.4798 - val_loss: 0.6959
Epoch 5/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 110ms/step - accuracy: 0.5157 - loss: 0.6929 - val_accuracy: 0.4798 - val_loss: 0.6935
Epoch 6/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 106ms/step - accuracy: 0.5156 - loss: 0.6932 - val_accuracy: 0.4798 - val_loss: 0.6954
Epoch 7/10
[1m134/13

In [None]:
pip install tensorflow numpy pandas



Graph Sequence Modeling

In [None]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
data = pd.read_csv("/content/gossipcop_fake.csv")
data['title'] = data['title'].fillna('')
data['title'] = data['title'].astype(str)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['title'])
title_sequences = tokenizer.texts_to_sequences(data['title'])
tweet_ids_sequences = tokenizer.texts_to_sequences(data['tweet_ids'].fillna(''))
max_length = 50
title_padded = pad_sequences(title_sequences, maxlen=max_length, padding='post')
tweet_ids_padded = pad_sequences(tweet_ids_sequences, maxlen=max_length, padding='post')
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



In [None]:
pip install networkx node2vec gensim tensorflow pandas

Collecting node2vec
  Downloading node2vec-0.5.0-py3-none-any.whl.metadata (849 bytes)
Downloading node2vec-0.5.0-py3-none-any.whl (7.2 kB)
Installing collected packages: node2vec
Successfully installed node2vec-0.5.0


In [None]:
import networkx as nx
from node2vec import Node2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import numpy as np

In [None]:
data = pd.read_csv("/content/gossipcop_fake.csv")

In [None]:
data['title'] = data['title'].fillna('')
data['tweet_ids'] = data['tweet_ids'].fillna('')

In [None]:
graph = nx.Graph()

In [None]:
for _, row in data.iterrows():
    id_node = f"id_{row['id']}"
    graph.add_node(id_node, type="id")

In [None]:
graph.add_node(row['news_url'], type="news_url")
graph.add_edge(id_node, row['news_url'])

In [None]:
for word in row['title'].split():
    graph.add_node(word, type="word")
    graph.add_edge(id_node, word)

In [None]:
for tweet_id in row['tweet_ids'].split(','):
    tweet_id = tweet_id.strip()
    if tweet_id:
        graph.add_node(tweet_id, type="tweet_id")
        graph.add_edge(id_node, tweet_id)

In [None]:
node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)
node2vec_model = node2vec.fit(window=10, min_count=1, batch_words=4)
embeddings = {node: node2vec_model.wv[node] for node in graph.nodes()}
sequence_embeddings = []
for _, row in data.iterrows():
    id_embedding = embeddings.get(f"id_{row['id']}")
    url_embedding = embeddings.get(row['news_url'])

Computing transition probabilities:   0%|          | 0/5342 [00:00<?, ?it/s]

In [None]:
title_embeddings = [embeddings[word] for word in row['title'].split() if word in embeddings]
if title_embeddings:
    title_embedding = np.mean(title_embeddings, axis=0)
else:
    title_embedding = np.zeros(64)
tweet_embeddings = [embeddings[tweet_id.strip()] for tweet_id in row['tweet_ids'].split(',') if tweet_id.strip() in embeddings]
if tweet_embeddings:
    tweet_embedding = np.mean(tweet_embeddings, axis=0)
else:
    tweet_embedding = np.zeros(64)
combined_embedding = np.concatenate((id_embedding, url_embedding, title_embedding, tweet_embedding))
sequence_embeddings.append(combined_embedding)
sequence_embeddings = np.array(sequence_embeddings)
model = Sequential()
model.add(LSTM(128, input_shape=(sequence_embeddings.shape[1], 1), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()
dummy_labels = np.random.randint(0, 2, size=(len(sequence_embeddings),))
sequence_embeddings_reshaped = sequence_embeddings.reshape(sequence_embeddings.shape[0], sequence_embeddings.shape[1], 1)
model.fit(sequence_embeddings_reshaped, dummy_labels, epochs=10, batch_size=32)

  super().__init__(**kwargs)


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 1.0000 - loss: 0.6904
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 327ms/step - accuracy: 1.0000 - loss: 0.6675
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 318ms/step - accuracy: 1.0000 - loss: 0.6493
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 276ms/step - accuracy: 1.0000 - loss: 0.6218
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 295ms/step - accuracy: 1.0000 - loss: 0.5888
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 269ms/step - accuracy: 1.0000 - loss: 0.5532
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 268ms/step - accuracy: 1.0000 - loss: 0.5081
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 303ms/step - accuracy: 1.0000 - loss: 0.4459
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

<keras.src.callbacks.history.History at 0x785924566b00>

STACKED LSTM with fine-tuning

In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
data = pd.read_csv("/content/gossipcop_fake.csv")
data['label'] = data['news_url'].apply(lambda x: 1 if 'some_condition' in str(x) else 0)
texts = data['title'].astype(str).tolist()
labels = data['label'].tolist()
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
max_len = 100
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=42
)
y_train = np.array(y_train)
y_test = np.array(y_test)
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_len),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']
history = model.fit(
    X_train, y_train, validation_split=0.1, epochs=10, batch_size=16
)
y_pred = (model.predict(X_test) > 0.5).astype("int32")



Epoch 1/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 166ms/step - accuracy: 0.9827 - loss: 0.0878 - val_accuracy: 1.0000 - val_loss: 4.8013e-05
Epoch 2/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 162ms/step - accuracy: 1.0000 - loss: 4.0854e-05 - val_accuracy: 1.0000 - val_loss: 8.8904e-06
Epoch 3/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 162ms/step - accuracy: 1.0000 - loss: 1.0510e-05 - val_accuracy: 1.0000 - val_loss: 3.4112e-06
Epoch 4/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 161ms/step - accuracy: 1.0000 - loss: 4.6831e-06 - val_accuracy: 1.0000 - val_loss: 1.7518e-06
Epoch 5/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 162ms/step - accuracy: 1.0000 - loss: 2.5330e-06 - val_accuracy: 1.0000 - val_loss: 1.0409e-06
Epoch 6/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 165ms/step - accuracy: 1.0000 - loss: 1.8285e-06 - val_accura

In [3]:
print("Training labels distribution:", np.bincount(y_train))
print("Test labels distribution:", np.bincount(y_test))

Training labels distribution: [4258]
Test labels distribution: [1065]


In [4]:
unique_classes = np.unique(y_test)
target_names = [f"Class {cls}" for cls in unique_classes]
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

     Class 0       1.00      1.00      1.00      1065

    accuracy                           1.00      1065
   macro avg       1.00      1.00      1.00      1065
weighted avg       1.00      1.00      1.00      1065



LSTM with Dropout

In [7]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
data = pd.read_csv("/content/gossipcop_fake.csv")
texts = data['title'].astype(str).tolist()
data['label'] = data['news_url'].apply(lambda x: 1 if 'some_condition' in str(x) else 0)  # Example logic
labels = data['label'].tolist()
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
max_len = 100
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=42
)
y_train = np.array(y_train)
y_test = np.array(y_test)
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_len),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(
    X_train, y_train, validation_split=0.1, epochs=10, batch_size=16
)
y_pred = (model.predict(X_test) > 0.5).astype("int32")
unique_classes = np.unique(y_test)
target_names = [f"Class {cls}" for cls in unique_classes]
print(classification_report(y_test, y_pred, target_names=target_names))

Epoch 1/10




[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 162ms/step - accuracy: 0.9827 - loss: 0.0655 - val_accuracy: 1.0000 - val_loss: 8.6470e-06
Epoch 2/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 161ms/step - accuracy: 1.0000 - loss: 7.5678e-06 - val_accuracy: 1.0000 - val_loss: 9.5121e-07
Epoch 3/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 161ms/step - accuracy: 1.0000 - loss: 1.4057e-06 - val_accuracy: 1.0000 - val_loss: 3.2421e-07
Epoch 4/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 159ms/step - accuracy: 1.0000 - loss: 5.4673e-07 - val_accuracy: 1.0000 - val_loss: 1.5879e-07
Epoch 5/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 157ms/step - accuracy: 1.0000 - loss: 2.8600e-07 - val_accuracy: 1.0000 - val_loss: 9.4444e-08
Epoch 6/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 159ms/step - accuracy: 1.0000 - loss: 1.9116e-07 - val_accuracy: 1.0000 