In [2]:
import pandas as pd
import numpy as np
import re
import joblib
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
import pandas as pd
import numpy as np
import re
import joblib
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Updated Downloads for the latest NLTK version
resources = [
    'stopwords',
    'wordnet',
    'omw-1.4',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger_eng'  # The specific fix for your error
]

for res in resources:
    nltk.download(res)

class TextProcessor:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.label_encoder = LabelEncoder()
        # Using Bi-grams to capture phrases like "Quantum computing"
        self.tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

    def _get_wordnet_pos(self, word):
        """Map POS tag to format WordNetLemmatizer accepts"""
        # We use the 'eng' version explicitly if needed,
        # but nltk.pos_tag usually handles the mapping internally once downloaded
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN,
                    "V": wordnet.VERB, "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)

    def clean(self, text):
        # Remove URLs and noise
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # POS-Aware Lemmatization
        tokens = nltk.word_tokenize(text.lower())
        cleaned_tokens = [
            self.lemmatizer.lemmatize(w, self._get_wordnet_pos(w))
            for w in tokens if w not in self.stop_words
        ]
        return " ".join(cleaned_tokens)

# --- Execution ---

raw_data = {
    'text': [
        "The economy is fluctuating significantly in 2024!",
        "Biological researchers are studying the feline genome.",
        "Quantum computing remains a theoretical frontier for scientists.",
        "Domestic cats exhibit predatory instincts even when fed."
    ],
    'target': ['finance', 'science', 'science', 'nature']
}
df = pd.DataFrame(raw_data)

processor = TextProcessor()
print("Starting POS-Aware Lemmatization (High Precision)...")
df['refined_text'] = df['text'].apply(processor.clean)

# Label Encoding
df['encoded_label'] = processor.label_encoder.fit_transform(df['target'])

# TF-IDF Representation
tfidf_matrix = processor.tfidf.fit_transform(df['refined_text'])

# Saving Outputs
joblib.dump(tfidf_matrix, 'tfidf_matrix.pkl')
df.to_csv('final_processed_data.csv', index=False)

print("\nSuccess! Sample of cleaned text:")
print(df[['text', 'refined_text']].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


Starting POS-Aware Lemmatization (High Precision)...

Success! Sample of cleaned text:
                                                text  \
0  The economy is fluctuating significantly in 2024!   
1  Biological researchers are studying the feline...   
2  Quantum computing remains a theoretical fronti...   
3  Domestic cats exhibit predatory instincts even...   

                                        refined_text  
0                    economy fluctuate significantly  
1          biological researcher study feline genome  
2  quantum compute remains theoretical frontier s...  
3   domestic cat exhibit predatory instinct even fed  
