In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


In [2]:
# Download the NLTK data files this notebook needs:
#   - 'stopwords': the English stop-word list used in Step 2
#   - 'punkt':     tokenizer models loaded by word_tokenize on older NLTK releases
#   - 'punkt_tab': tokenizer tables loaded by word_tokenize on newer NLTK
#                  releases; without it word_tokenize raises LookupError there
# nltk.download() skips packages that are already present and up to date.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Two-sentence sample passage that the rest of the notebook preprocesses.
# Written as adjacent string literals purely to keep source lines short;
# Python concatenates them into a single string.
text = (
    "Artificial intelligence is a branch of computer science that aims to "
    "create intelligent machines. "
    "It has become an essential part of the technology industry."
)


In [4]:
# Step 1: Tokenize the text
# word_tokenize splits the string into a list of tokens. As the printed
# result shows, punctuation (the sentence-ending periods) comes out as
# separate tokens and the original casing of each word is kept.
tokens = word_tokenize(text)
print("Tokens:", tokens)


Tokens: ['Artificial', 'intelligence', 'is', 'a', 'branch', 'of', 'computer', 'science', 'that', 'aims', 'to', 'create', 'intelligent', 'machines', '.', 'It', 'has', 'become', 'an', 'essential', 'part', 'of', 'the', 'technology', 'industry', '.']


In [10]:
# Step 2: Drop stop words and non-alphabetic tokens
# A set gives constant-time membership checks against the English stop-word list.
stop_words = set(stopwords.words('english'))

# Keep a token only when it is purely alphabetic (this discards punctuation
# such as '.') and its lowercase form is not a stop word. Lowercasing is used
# for the comparison only: the token is stored with its original casing.
filtered_tokens = [
    token
    for token in tokens
    if token.isalpha() and token.lower() not in stop_words
]

print("\nAfter Stop Word Removal:", filtered_tokens)


After Stop Word Removal: ['Artificial', 'intelligence', 'branch', 'computer', 'science', 'aims', 'create', 'intelligent', 'machines', 'become', 'essential', 'part', 'technology', 'industry']


In [11]:
# Step 3: Reduce every remaining token to its Porter stem
# As the printed result shows, stems come back lowercased and are not
# necessarily dictionary words (e.g. 'Artificial' -> 'artifici').
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

print("\nAfter Stemming:", stemmed_tokens)


After Stemming: ['artifici', 'intellig', 'branch', 'comput', 'scienc', 'aim', 'creat', 'intellig', 'machin', 'becom', 'essenti', 'part', 'technolog', 'industri']


In [12]:
# Step 4: Join the stemmed tokens back into a single string,
# separating consecutive tokens with one space.
processed_text = ' '.join(stemmed_tokens)
print("\nProcessed Text:", processed_text)


Processed Text: artifici intellig branch comput scienc aim creat intellig machin becom essenti part technolog industri
