In [4]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import unicodedata
import re
nltk.download('all')


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

In [2]:

# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')

# Load true and fake news datasets
true_news = pd.read_csv('/content/drive/MyDrive/fake news detection/True.csv')
fake_news = pd.read_csv('/content/drive/MyDrive/fake news detection/Fake.csv')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
# Add labels
true_news['label'] = 1  # Label 1 for true news
fake_news['label'] = 0  # Label 0 for fake news

# Combine datasets
data = pd.concat([true_news, fake_news], ignore_index=True)

# Preprocessing function
def preprocess_text(text):
    # Lowercase text
    text = text.lower()
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Remove special characters and digits
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in word_tokenize(text)]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Feature extraction: TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_features = tfidf_vectorizer.fit_transform(data['cleaned_text'])

# Display preprocessed text and TF-IDF features
print("\nPreprocessed Text Example:\n", data['cleaned_text'].iloc[0])
print("\nTF-IDF Features (First Sample):\n", tfidf_features[0].toarray())
print("\nTF-IDF Vocabulary:\n", tfidf_vectorizer.get_feature_names_out())


Preprocessed Text Example:
 washington reuters the head of a conservative republican faction in the u s congress who voted this month for a huge expansion of the national debt to pay for tax cut called himself a fiscal conservative on sunday and urged budget restraint in in keeping with a sharp pivot under way among republican u s representative mark meadow speaking on cbs face the nation drew a hard line on federal spending which lawmaker are bracing to do battle over in january when they return from the holiday on wednesday lawmaker will begin trying to pas a federal budget in a fight likely to be linked to other issue such a immigration policy even a the november congressional election campaign approach in which republican will seek to keep control of congress president donald trump and his republican want a big budget increase in military spending while democrat also want proportional increase for non defense discretionary spending on program that support education scientific rese

In [6]:
from sklearn.utils import resample

# Example dataset
import pandas as pd
data = pd.DataFrame({
    'text': ['True news example 1', 'True news example 2', 'Fake news example 1'],
    'label': [1, 1, 0]  # Imbalanced classes
})

# Separate majority and minority classes
majority = data[data.label == 1]
minority = data[data.label == 0]

# Downsampling
downsampled_majority = resample(majority,
                                replace=False,
                                n_samples=len(minority),
                                random_state=42)
balanced_downsampled = pd.concat([downsampled_majority, minority])

# Oversampling
oversampled_minority = resample(minority,
                                replace=True,
                                n_samples=len(majority),
                                random_state=42)
balanced_oversampled = pd.concat([majority, oversampled_minority])


In [7]:
from nltk.corpus import wordnet
import random

def synonym_replacement(sentence, n=1):
    words = sentence.split()
    for _ in range(n):
        word = random.choice(words)
        synonyms = wordnet.synsets(word)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            sentence = sentence.replace(word, synonym, 1)
    return sentence

nltk.download('wordnet')
example_sentence = "The quick brown fox jumps over the lazy dog."
augmented_sentence = synonym_replacement(example_sentence, n=2)
print("Augmented Sentence:", augmented_sentence)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Augmented Sentence: The quick brown fox jumps over the lazy dog.


In [8]:
def random_swap(sentence, n=1):
    words = sentence.split()
    for _ in range(n):
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)

augmented_sentence = random_swap(example_sentence, n=2)
print("Augmented Sentence:", augmented_sentence)


Augmented Sentence: The quick jumps fox brown dog. the lazy over


In [9]:
from transformers import pipeline

# Using Hugging Face for back translation
translator = pipeline('translation_en_to_fr', model='t5-small')
translated = translator("The quick brown fox jumps over the lazy dog.")[0]['translation_text']

back_translator = pipeline('translation_fr_to_en', model='t5-small')
back_translated = back_translator(translated)[0]['translation_text']

print("Back Translated Sentence:", back_translated)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Your input_length: 20 is bigger than 0.9 * max_length: 20. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


Back Translated Sentence: Le renard brun rapide saute au-dessus du chien paresseux.


In [10]:
slang_dict = {
    "u": "you",
    "r": "are",
    "gr8": "great",
    "idk": "I don't know"
}

def replace_slang(text):
    words = text.split()
    return ' '.join([slang_dict[word] if word in slang_dict else word for word in words])

example_slang = "u r gr8"
cleaned_text = replace_slang(example_slang)
print("Cleaned Text:", cleaned_text)


Cleaned Text: you are great


In [12]:
!pip install pyspellchecker


Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


In [13]:
from spellchecker import SpellChecker


In [14]:


spell = SpellChecker()

def correct_spelling(text):
    words = text.split()
    corrected = [spell.correction(word) for word in words]
    return ' '.join(corrected)

misspelled_text = "Ths is an exmple."
corrected_text = correct_spelling(misspelled_text)
print("Corrected Text:", corrected_text)


Corrected Text: the is an example


In [15]:
def handle_numerics(text, mode="remove"):
    if mode == "remove":
        return re.sub(r'\d+', '', text)
    elif mode == "replace":
        return re.sub(r'\d+', '<NUM>', text)
    else:  # Keep numerics
        return text

example_text = "I have 3 apples and 2 oranges."
text_with_removed_numerics = handle_numerics(example_text, mode="remove")
text_with_replaced_numerics = handle_numerics(example_text, mode="replace")
print("Removed Numerics:", text_with_removed_numerics)
print("Replaced Numerics:", text_with_replaced_numerics)


Removed Numerics: I have  apples and  oranges.
Replaced Numerics: I have <NUM> apples and <NUM> oranges.


In [16]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

sentence = "John is working on a machine learning project."
tokens = word_tokenize(sentence)
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


POS Tags: [('John', 'NNP'), ('is', 'VBZ'), ('working', 'VBG'), ('on', 'IN'), ('a', 'DT'), ('machine', 'NN'), ('learning', 'NN'), ('project', 'NN'), ('.', '.')]


In [17]:
import spacy

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

text = "Barack Obama was the 44th president of the United States and lived in Washington D.C."
doc = nlp(text)

for entity in doc.ents:
    print(f"Entity: {entity.text}, Type: {entity.label_}")


Entity: Barack Obama, Type: PERSON
Entity: 44th, Type: ORDINAL
Entity: the United States, Type: GPE
Entity: Washington D.C., Type: GPE


In [18]:
for token in doc:
    print(f"Word: {token.text}, Dependency: {token.dep_}, Head: {token.head.text}")


Word: Barack, Dependency: compound, Head: Obama
Word: Obama, Dependency: nsubj, Head: was
Word: was, Dependency: ROOT, Head: was
Word: the, Dependency: det, Head: president
Word: 44th, Dependency: amod, Head: president
Word: president, Dependency: attr, Head: was
Word: of, Dependency: prep, Head: president
Word: the, Dependency: det, Head: States
Word: United, Dependency: compound, Head: States
Word: States, Dependency: pobj, Head: of
Word: and, Dependency: cc, Head: was
Word: lived, Dependency: conj, Head: was
Word: in, Dependency: prep, Head: lived
Word: Washington, Dependency: compound, Head: D.C.
Word: D.C., Dependency: pobj, Head: in


In [19]:
from nltk import pos_tag, RegexpParser

sentence = "The quick brown fox jumps over the lazy dog."
tokens = word_tokenize(sentence)
pos_tags = pos_tag(tokens)

# Define a chunk grammar
grammar = "NP: {<DT>?<JJ>*<NN>}"  # Noun Phrase: optional determiner, adjectives, and a noun
chunk_parser = RegexpParser(grammar)

tree = chunk_parser.parse(pos_tags)
print(tree)


(S
  (NP The/DT quick/JJ brown/NN)
  (NP fox/NN)
  jumps/VBZ
  over/IN
  (NP the/DT lazy/JJ dog/NN)
  ./.)


In [20]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import make_classification

# Create a sample dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_samples=1000, random_state=42)

# Oversampling
oversampler = RandomOverSampler(random_state=42)
X_over, y_over = oversampler.fit_resample(X, y)

# Downsampling
undersampler = RandomUnderSampler(random_state=42)
X_under, y_under = undersampler.fit_resample(X, y)


In [22]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Example feature matrix
embeddings = np.array([[0.5, 0.8, 1.2], [0.2, 0.3, 0.6], [0.9, 1.5, 2.3]])

scaler = MinMaxScaler()
normalized_embeddings = scaler.fit_transform(embeddings)
print("Normalized Embeddings:\n", normalized_embeddings)


Normalized Embeddings:
 [[0.42857143 0.41666667 0.35294118]
 [0.         0.         0.        ]
 [1.         1.         1.        ]]
