In [None]:
import pandas as pd
from google.colab import files

# Single source of truth for the locations used by this cell, so the file that
# is written is exactly the file that is offered for download.
RAW_CSV_PATH = '/content/drive/MyDrive/WELFake_Dataset.csv'
CLEANED_CSV_PATH = '/content/cleaned_WELFake_Dataset.csv'
REQUIRED_COLUMNS = ['number', 'title', 'text', 'label']

# Load csv file into a pandas dataframe
df = pd.read_csv(RAW_CSV_PATH, encoding='latin1')

# Strip leading and trailing whitespace from the 'text' column. Text that is
# empty after stripping is turned into a missing value: an empty string would
# otherwise survive dropna(), be written as an empty CSV field and come back
# as NaN (a float) when the cleaned file is read again.
stripped_text = df['text'].str.strip()
df['text'] = stripped_text.where(stripped_text != '')

# Drop rows that miss any of the required columns
df = df.dropna(subset=REQUIRED_COLUMNS, how='any')

# Keep only rows with exactly 4 non-null values. The four required columns are
# already non-null at this point, so this removes rows that carry a value in
# any additional column. count(axis=1) is the vectorised equivalent of a
# row-by-row len(row.dropna()).
df = df[df.count(axis=1) == len(REQUIRED_COLUMNS)].copy()

# Convert the `label` column to a numeric type; non-numeric values become NaN
df['label'] = pd.to_numeric(df['label'], errors='coerce')

# Remove rows with non-numeric values in the `label` column
df = df[pd.notnull(df['label'])]

# Remove any columns named 'Unnamed'
df = df.drop(columns=[col for col in df.columns if 'Unnamed' in col])

# Reset index of the cleaned dataframe to start from 0
df = df.reset_index(drop=True)

# Re-index the "number" column to start from 1 and increment by 1
df['number'] = df.index + 1

# Save the cleaned dataframe as a new csv file
df.to_csv(CLEANED_CSV_PATH, index=False)

# Download the cleaned CSV file to your local machine
files.download(CLEANED_CSV_PATH)

  df = pd.read_csv('/content/drive/MyDrive/WELFake_Dataset.csv', encoding='latin1')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install langid

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting langid
  Downloading langid-1.1.6.tar.gz (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langid
  Building wheel for langid (setup.py) ... [?25l[?25hdone
  Created wheel for langid: filename=langid-1.1.6-py3-none-any.whl size=1941187 sha256=8a4a7ec724fed8b9f99d4dbcd26548161d41e8b0baafcb2d3dcb31f3e33b7e10
  Stored in directory: /root/.cache/pip/wheels/93/95/a9/c292c9dd8cadb8f2359f1670ff198a40d47167b0be3236e1c8
Successfully built langid
Installing collected packages: langid
Successfully installed langid-1.1.6


In [None]:
import pandas as pd
import requests
from google.colab import files
from langid.langid import LanguageIdentifier, model

# Locations used by this cell; the download below uses the same path that the
# filtered file is written to.
CLEANED_CSV_PATH = '/content/cleaned_WELFake_Dataset.csv'
FILTERED_CSV_PATH = '/content/filtered_WELFake_Dataset.csv'

# Load the cleaned csv file into a pandas dataframe
df = pd.read_csv(CLEANED_CSV_PATH)

identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)


def _is_english_text(value):
    """Return True when ``value`` is a non-blank string classified as 'en'.

    Non-string values are rejected up front: an empty CSV field is read back
    as NaN (a float), and calling ``.strip()`` on it would raise
    AttributeError.
    """
    if not isinstance(value, str) or value.strip() == '':
        return False
    return identifier.classify(value)[0] == 'en'


# Remove any rows where the 'text' column is empty or not in English
df = df[df['text'].apply(_is_english_text).astype(bool)]

# Reset index of the filtered dataframe to start from 0
df = df.reset_index(drop=True)

# Save the filtered dataframe as a new csv file
df.to_csv(FILTERED_CSV_PATH, index=False)

# Download the filtered CSV file to your local machine
files.download(FILTERED_CSV_PATH)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Dependencies for the Doc2Vec + logistic-regression experiment
import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Fetch the NLTK resources used below: the tokenizer data first, then the
# stop-word corpus.
nltk.download('punkt')
nltk.download('stopwords')

# English stop words, held in a set for membership tests
stop_words = set(stopwords.words('english'))

# Read the language-filtered dataset
df = pd.read_csv('/content/filtered_WELFake_Dataset.csv')

# Preprocessing the text
def preprocess_text(text):
    """Lower-case ``text``, tokenize it and drop tokens found in ``stop_words``.

    Args:
        text: The string to preprocess.

    Returns:
        The remaining tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    kept_tokens = [token for token in tokens if token not in stop_words]
    return " ".join(kept_tokens)

# Apply the preprocessing to every article body
df['processed_text'] = df['text'].apply(preprocess_text)

# Tagging the text: each document is the title followed by the preprocessed
# body, tagged with its row position so the vector can be looked up by index.
tagged_data = [
    TaggedDocument(words=gensim.utils.simple_preprocess(title + " " + doc), tags=[i])
    for i, (title, doc) in enumerate(zip(df['title'], df['processed_text']))
]

# Training the Doc2Vec model
# NOTE(review): the embedding is fitted on every document, including the rows
# that train_test_split later assigns to the test set, so the reported metrics
# are not a strictly held-out estimate.
d2v_model = Doc2Vec(tagged_data, vector_size=100, window=5, min_count=1, epochs=50)

# Getting document vectors. `dv` is the current name of the document-vector
# store; the `docvecs` alias is deprecated and emits a warning on access.
X = [d2v_model.dv[i] for i in range(len(df))]

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.2, random_state=42)

# Training and testing the model using logistic regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Evaluating the performance of the model
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Precision:", precision_score(y_test, y_pred_lr))
print("Recall:", recall_score(y_test, y_pred_lr))
print("F1-score:", f1_score(y_test, y_pred_lr))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  X = [d2v_model.docvecs[i] for i in range(len(df))]


Accuracy: 0.9028233281049479
Precision: 0.9070733104238259
Recall: 0.8987090367428004
F1-score: 0.9028718021805744


In [None]:
# Importing necessary libraries
import os

import pandas as pd
import numpy as np
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Loading the dataset
# The filtering step of this notebook writes its result to /content, so that
# file is preferred when it exists; this keeps this experiment on the same
# data as the rest of the notebook. The Drive copy remains the fallback for
# sessions in which the filtering step has not been run.
LOCAL_FILTERED_CSV = '/content/filtered_WELFake_Dataset.csv'
DRIVE_FILTERED_CSV = '/content/drive/MyDrive/filtered_WELFake_Dataset.csv'
filtered_csv_path = LOCAL_FILTERED_CSV if os.path.exists(LOCAL_FILTERED_CSV) else DRIVE_FILTERED_CSV
df = pd.read_csv(filtered_csv_path)

# Preprocessing the text
def preprocess_text(text):
    """Return ``text`` lower-cased and tokenized, without ``stop_words`` tokens.

    Args:
        text: The string to preprocess.

    Returns:
        A single string of the kept tokens separated by spaces.
    """
    lowered = text.lower()
    return " ".join(tok for tok in word_tokenize(lowered) if tok not in stop_words)

# Apply the preprocessing to every article body
df['processed_text'] = df['text'].apply(preprocess_text)

# Tagging the text: each document is the title followed by the preprocessed
# body, tagged with its row position so the vector can be looked up by index.
tagged_data = [
    TaggedDocument(words=gensim.utils.simple_preprocess(title + " " + doc), tags=[i])
    for i, (title, doc) in enumerate(zip(df['title'], df['processed_text']))
]

# Training the Doc2Vec model
# NOTE(review): the embedding is fitted on every document, including the rows
# that train_test_split later assigns to the test set, so the reported metrics
# are not a strictly held-out estimate.
d2v_model = Doc2Vec(tagged_data, vector_size=100, window=5, min_count=1, epochs=50)

# Getting document vectors. `dv` is the current name of the document-vector
# store; the `docvecs` alias is deprecated and emits a warning on access.
X = [d2v_model.dv[i] for i in range(len(df))]

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.2, random_state=42)

# Training and testing the model using XGBoost
xgb_model = xgb.XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=100)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# Evaluating the performance of the model
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Precision:", precision_score(y_test, y_pred_xgb))
print("Recall:", recall_score(y_test, y_pred_xgb))
print("F1-score:", f1_score(y_test, y_pred_xgb))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
  X = [d2v_model.docvecs[i] for i in range(len(df))]


Accuracy: 0.898117781263368
Precision: 0.8927872518870562
Recall: 0.9060859696410838
F1-score: 0.899387453354925
