In [None]:
import re

import nltk
import pandas as pd
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# `nltk` must be imported before it is used: with the download call placed
# above the import, a fresh kernel ("Restart & Run All") fails with NameError.
# This package holds the English POS-tagger model that nltk.pos_tag loads.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [None]:
# Fetch the NLTK data packages used by the preprocessing cells below
# (tokenizer models, the WordNet corpus and the POS tagger).
for resource_name in ('punkt_tab', 'wordnet'):
    nltk.download(resource_name)

# Left as the cell's final expression so its return value is still displayed.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Malayalam words that preprocess_text() drops after tokenisation.
# Each entry is listed exactly once (the original literal repeated 'ആരും');
# membership is tested against lower-cased, punctuation-stripped tokens.
custom_stopwords = {
    'അത്', 'ആണ്', 'ഒരു', 'എന്ന', 'കൂടെ', 'വീട്', 'അല്ല', 'എല്ലാം', 'ഇത്', 'പിന്നെ', 'ആരും',
    'പോയി', 'ഇവിടെ', 'അവിടെ', 'നല്ല', 'മാത്രം', 'അവന്', 'നോക്കൂ', 'നീ', 'അവള്',
    'എവിടെ', 'നിനക്ക്', 'വളരെ', 'വന്ന', 'എങ്ങനെ', 'നിന്റെ', 'കാണാൻ', 'ഇവന്റെ', 'പോലെ',
}

In [None]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant matching the tagger's tag for ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects adjective (J), verb (V) or adverb (R).
    Noun tags and every other tag fall back to ``wordnet.NOUN``.
    """
    tagged = nltk.pos_tag([word])
    tag_initial = tagged[0][1][0].upper()

    if tag_initial == 'J':
        return wordnet.ADJ
    if tag_initial == 'V':
        return wordnet.VERB
    if tag_initial == 'R':
        return wordnet.ADV
    # 'N' and any unrecognised tag are both treated as nouns.
    return wordnet.NOUN

In [None]:
def preprocess_text(text, stopwords):
    """Preprocess text by cleaning, tokenizing, removing stopwords, and stemming/lemmatizing."""
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    if isinstance(text, str):
        # Lowercase conversion
        text = text.lower()

        # Remove URLs
        text = re.sub(r'http\S+|www\S+', '', text)

        # Remove HTML tags
        text = re.sub(r'<.*?>', '', text)

        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\u0D00-\u0D7F\s]', '', text)  # Malayalam Unicode range

        # Tokenize
        tokens = word_tokenize(text)

        # Remove custom stopwords
        tokens = [word for word in tokens if word not in stopwords]

        # Remove short words
        tokens = [word for word in tokens if len(word) > 2]

        # Lemmatization
        tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens]

        # Stemming
        tokens = [stemmer.stem(word) for word in tokens]

        # Remove duplicate words
        tokens = list(dict.fromkeys(tokens))

        return ' '.join(tokens)
    return text

In [None]:
# Training split (CSV on the mounted Drive); the last line shows (rows, columns).
train_csv_path = "/content/drive/MyDrive/Fake_News2/Task2/fake_news_classification_mal_train.csv"
train_df = pd.read_csv(train_csv_path)
train_df.shape

(1900, 3)

In [None]:
# Test split; unlike the training data this one is stored as an Excel workbook.
test_xlsx_path = "/content/drive/MyDrive/Fake_News2/Task2/fake_news_classification_mal_test.xlsx"
test_df = pd.read_excel(test_xlsx_path)
test_df.shape

(200, 2)

In [None]:
# Run the same cleaning over the 'News' column of both splits, overwriting
# the raw text in place. `args` forwards the stop-word set as the second
# positional argument of preprocess_text.
for news_frame in (train_df, test_df):
    news_frame['News'] = news_frame['News'].apply(preprocess_text, args=(custom_stopwords,))

In [None]:
# Persist both cleaned splits as CSV, without writing the index column.
for processed_frame, output_path in (
    (train_df, "/content/drive/MyDrive/Fake_News2/Task2/Preprocessed_datasets/processed_train.csv"),
    (test_df, "/content/drive/MyDrive/Fake_News2/Task2/Preprocessed_datasets/processed_test.csv"),
):
    processed_frame.to_csv(output_path, index=False)