In [12]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split    
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import string, emoji
import numpy as np
import pickle

In [13]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to C:\Users\VINOTH KUMAR
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\VINOTH KUMAR
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\VINOTH KUMAR
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\VINOTH KUMAR M\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [14]:
headers = ['tweet', 'label']

df = pd.read_csv('dataset.csv')
df.to_csv("./data.csv", index=False)
df.head()

Unnamed: 0,tweet,label
0,The CDC currently reports 99031 deaths. In gen...,real
1,States reported 1121 deaths a small rise from ...,real
2,Politically Correct Woman (Almost) Uses Pandem...,fake
3,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,Populous states can generate large case counts...,real


In [15]:
def _pos_tag_to_wordnet(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

In [16]:
from wordsegment import load, segment
import re

load()

def replace_hashtag(text):
    def _split_hash(match):
        h = match.group(0)[1:]  # remove '#'
        h_clean = re.sub(r'[^a-zA-Z]', '', h)  # keep only letters
        if not h_clean:
            return ""
        try:
            pieces = segment(h_clean)
            return " ".join(pieces) if pieces else h_clean
        except ValueError:
            # If segmentation fails, just return original cleaned hashtag
            return h_clean
    return re.sub(r'#\w+', _split_hash, text)

In [17]:

def preprocess_text_improved(text, remove_stopwords=True, lemmatize=True):
    """
    Returns a cleaned string.
    - replace URLs, mentions
    - demojize emojis (emoji -> :smile:)
    - expand hashtags into words
    - normalize repeated characters
    - replace numbers with <NUM>
    - remove punctuation (keeps internal apostrophes removed)
    - lower, tokenize, optional stopword removal and lemmatize
    """
    if not isinstance(text, str):
        return ""
    # 1. lower
    text = text.lower()

    # 2. urls and mentions
    text = re.sub(r'http\S+|www\.\S+', ' ', text)          # remove URLs
    text = re.sub(r'@\w+', ' ', text)                     # remove @mentions

    # 3. demojize (turn emoji into textual token like :smile:)
    text = emoji.demojize(text, delimiters=(" ", " "))    # "🙂" -> " :slightly_smiling_face: "
    
    # 4. hashtags -> split into words
    text = replace_hashtag(text)

    # 5. replace digits with <NUM>
    text = re.sub(r'\d+(?:[\.,]\d+)*', ' <NUM> ', text)

    # 6. normalize elongated characters: reduce 3+ repeats to 2 (so looove -> loove)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)

    # 7. remove punctuation (keep spaces). If you want to keep emoticon punctuation, adjust.
    # Remove punctuation except the placeholder tokens like :smile: and <NUM>
    # First preserve coloned tokens (e.g. :smile:) by temporary marker
    text = re.sub(r':([a-z0-9_+-]+):', r' EMOJI_\1_EMO ', text)
    text = text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
    text = re.sub(r'\s+', ' ', text).strip()
    # restore emoji tokens
    text = re.sub(r'EMOJI_([a-z0-9_+-]+)_EMO', r':\1:', text)

    if text == "":
        return ""

    # 8. tokenize
    tokens = word_tokenize(text)

    # 9. optional stopword removal, but keep <NUM> and emoji tokens
    stop_words = set(stopwords.words('english'))
    if remove_stopwords:
        filtered = []
        for t in tokens:
            if t == '<NUM>' or (t.startswith(':') and t.endswith(':')):
                filtered.append(t)
            elif t not in stop_words:
                filtered.append(t)
        tokens = filtered

    # 10. lemmatize with POS
    lemmatizer = WordNetLemmatizer()
    if lemmatize:
        pos_tags = pos_tag(tokens)
        lem_tokens = []
        for tok, tag in pos_tags:
            wn_tag = _pos_tag_to_wordnet(tag)
            tok_lem = lemmatizer.lemmatize(tok, wn_tag)
            lem_tokens.append(tok_lem)
        tokens = lem_tokens

    # 11. final join
    cleaned = " ".join(tokens)
    return cleaned

In [18]:
def preprocess_dataframe(df, text_column='tweet', label_column='label'):
    """
    Adds 'clean_text' column (or overwrite) and drops rows with empty cleaned text.
    Returns cleaned dataframe and (optionally) label encoder if needed.
    """
    df = df.copy()
    df[text_column] = df[text_column].apply(preprocess_text_improved)
    # drop rows where cleaned text is empty
    before = len(df)
    df = df[df[text_column].str.strip().astype(bool)].reset_index(drop=True)
    after = len(df)
    print(f"Preprocessing: dropped {before-after} empty rows (out of {before}).")
    return df

In [19]:
X = df['tweet']
y = df['label']
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.1, random_state=42)
tfidf_vectorizer =  TfidfVectorizer(
    ngram_range=(1, 2),       # unigrams, bigrams, trigrams
    min_df=5,                 # ignore words in <3 documents
    max_df=0.7,               # ignore words in >80% of documents
    sublinear_tf=True,        # logarithmic term frequency scaling
    max_features=2000,       # limit features to top 20k by term frequency
    norm='l2'                 # L2 normalization (good for linear models)
)
train_tfidf = tfidf_vectorizer.fit(df['tweet'])

In [20]:
le = LabelEncoder()
y_encoded = le.fit(Y_train)

with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

In [21]:
dir = './split'
os.makedirs(dir, exist_ok=True)
train_df = pd.concat([X_train, Y_train], axis=1)
test_df = pd.concat([X_test, Y_test], axis=1)
test_df = preprocess_dataframe(test_df)
test_df.to_csv('./split/test.csv', index=False)
train_df = preprocess_dataframe(train_df)
train_df.to_csv('./split/train.csv')
train_df_encoded = tfidf_vectorizer.transform(train_df['tweet'])
train_df_encoded = pd.DataFrame(train_df_encoded.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
train_df_encoded.to_csv('./split/train_df_encoded.csv', index=False)
print(train_df['label'].value_counts())

Preprocessing: dropped 0 empty rows (out of 1060).
Preprocessing: dropped 0 empty rows (out of 9540).
label
real    5002
fake    4538
Name: count, dtype: int64


In [22]:
size = train_df.shape[0] // 5
dir = './tfidf'
dir1 = './preprocess'
os.makedirs(dir, exist_ok=True)
os.makedirs(dir1, exist_ok=True)
# Shuffle the dataset once
shuffled_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into 4 equal parts (last split may have 1 extra row if not divisible)
splits = np.array_split(shuffled_df, 4)

for i, split_df in enumerate(splits, start=1):
    # Save raw text
    split_df.to_csv(f'./split/train_{i}.csv', index=False)
    
    # Transform to TF-IDF
    tfidf_matrix = tfidf_vectorizer.transform(split_df['tweet'])
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(),
                            columns=tfidf_vectorizer.get_feature_names_out())
    
    tfidf_df.to_csv(f'./tfidf/train_{i}.csv', index=False)

test_df = pd.read_csv('./split/test.csv')
test_df = tfidf_vectorizer.transform(test_df['tweet'])
test_df = pd.DataFrame(test_df.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
test_df.to_csv('./tfidf/test.csv', index=False)

  return bound(*args, **kwds)
