In [3]:
import nltk

##### Step 1: Download the Data Manually
- Download the necessary NLTK data files (`punkt`, `stopwords`, etc.) from the
  NLTK data website:
  - NLTK Data: https://www.nltk.org/nltk_data/

##### Step 2: Extract the Data Files

In [1]:
import zipfile
import os

# Resolve both folders from the current user's home directory rather than a
# hard-coded absolute path, so the cell works on any account/machine.
home_dir = os.path.expanduser('~')
downloads_dir = os.path.join(home_dir, 'Downloads')  # Where the NLTK zip archives were saved
nltk_data_dir = os.path.join(home_dir, 'nltk_data')  # Where the archives are unpacked

# List of files to extract
files_to_extract = ['punkt.zip', 'stopwords.zip', 'sentiwordnet.zip',
                    'words.zip', 'wordnet.zip', 'wordnet31.zip']

for file_name in files_to_extract:
    file_path = os.path.join(downloads_dir, file_name)
    # One archive that was not downloaded should not abort the remaining ones;
    # report it and carry on.
    if not os.path.isfile(file_path):
        print(f"Skipping {file_name}: not found in {downloads_dir}")
        continue
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(nltk_data_dir)

##### Step 3: Move the 'stopwords' and 'punkt' folders

In [16]:
import os
import shutil

# Define the paths (relative to the current user's home directory)
nltk_data_dir = os.path.join(os.path.expanduser('~'), 'nltk_data')
corpora_dir = os.path.join(nltk_data_dir, 'corpora')
tokenizers_dir = os.path.join(nltk_data_dir, 'tokenizers')

# Create the necessary directories if they don't exist
os.makedirs(corpora_dir, exist_ok=True)
os.makedirs(tokenizers_dir, exist_ok=True)

# Each extracted folder and the sub-directory it has to be moved into.
folders_to_move = [
    ('stopwords', corpora_dir),
    ('punkt', tokenizers_dir),
]

moved_any = False
for folder_name, target_dir in folders_to_move:
    src = os.path.join(nltk_data_dir, folder_name)
    dst = os.path.join(target_dir, folder_name)
    if not os.path.exists(src):
        # Nothing to move (never extracted, or already moved by an earlier run).
        continue
    if os.path.exists(dst):
        # shutil.move() into an existing directory would nest the folder
        # (e.g. corpora/stopwords/stopwords), so leave both copies untouched.
        print(f"Skipping '{folder_name}': {dst} already exists.")
        continue
    shutil.move(src, dst)
    moved_any = True

# Only claim success when something was actually moved.
if moved_any:
    print("Files moved successfully!")
else:
    print("No files needed to be moved.")

Files moved successfully!


##### Step 4: Verify the Setup

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Register the custom nltk_data directory with NLTK's search path.
# Guard the append so re-running this cell does not add duplicate entries.
custom_nltk_data_dir = '/Users/QuangAP/nltk_data'
if custom_nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(custom_nltk_data_dir)

# Verify that stopwords and punkt are loaded correctly
try:
    # Load stopwords
    stop_words = set(stopwords.words('english'))
    print("Stopwords loaded successfully!")
    print(f"Sample stopwords: {list(stop_words)[:10]}")

    # Tokenize a sample text
    text = "This is a sample sentence, showing off the stop words filtration."
    tokens = word_tokenize(text)
    filtered_text = [word for word in tokens if word.lower() not in stop_words]
    print(f"Filtered text: {filtered_text}")

    print("Punkt and stopwords datasets are being used from the local directory.")
except LookupError as e:
    # NLTK raises LookupError when a resource is missing from nltk.data.path.
    # Catch only that, so unrelated bugs still surface with a full traceback.
    print(f"An error occurred: {e}")

Stopwords loaded successfully!
Sample stopwords: ["you've", 'as', 'shan', "hasn't", 'after', 'than', 'can', 'each', 'such', 'hasn']
Filtered text: ['sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']
Punkt and stopwords datasets are being used from the local directory.


In [4]:
# Store the set under `stop_words`. Assigning it to `stopwords` would replace
# the imported corpus reader with a plain set, so any later
# `stopwords.words('english')` call would raise AttributeError.
stop_words = set(stopwords.words('english'))
print(stop_words)

{'aren', 'they', 'off', "weren't", 'wasn', 'down', 'on', 'an', "it's", 'our', 'with', 'about', 'shouldn', 'didn', 'a', 'below', "isn't", 'needn', 'doing', 'i', 'myself', "don't", 'can', 'in', 're', 'are', 'few', 'up', 'all', 'through', "you're", 'him', 'not', 'just', 'he', 'll', 'so', "she's", "that'll", 'those', 'and', 'being', 'how', 'yourselves', "wouldn't", 'is', 'as', 'was', 'same', 'which', 'has', "hasn't", 'mustn', 'there', 'for', "doesn't", 'them', "didn't", 'her', 'were', 'his', 'o', 'mightn', 'who', "hadn't", 'further', 'some', 'any', 'been', "shan't", 'did', "needn't", 'above', 'here', 't', 'your', 'it', 'while', 'what', 'other', 'you', 'ourselves', 'd', 'that', 'too', 'into', 'once', 'very', "shouldn't", 'each', 'have', 'm', 'haven', "couldn't", 'hadn', 'doesn', 'to', 'we', 'am', 'most', "haven't", 'than', 'why', 'such', 'both', 'if', 'won', 'himself', 'ain', 'my', 'shan', 'don', 'herself', 'then', 'no', 'before', 'y', 'she', 'couldn', 'should', 'whom', 'more', 'ma', "you'd

### Testing the NLTK Library
##### Tokenization

In [15]:
# Sample sentence used to demonstrate word-level tokenization.
sentence ="At eight o'cloc on Thursday morning Athur didn't feel very good."
# Split the sentence into tokens; the output below shows that the contraction
# "didn't" comes back as "did" + "n't" and the final period is its own token.
tokens = nltk.word_tokenize(sentence)
# Bare expression on the last line so the notebook displays the token list.
tokens

['At',
 'eight',
 "o'cloc",
 'on',
 'Thursday',
 'morning',
 'Athur',
 'did',
 "n't",
 'feel',
 'very',
 'good',
 '.']

In [20]:
# Lower-case the example sentence in one step, then split it into tokens.
text = "This is a sample sentence, showing off the stop words filtration.".lower()
tokens = word_tokenize(text)

# Show the resulting token list
print(tokens)



['this', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']


##### Removing Stopwords

In [49]:
# Build the stop-word set once. The previous form re-evaluated
# stopwords.words('english') and scanned the resulting list for every token.
stop_words = set(stopwords.words('english'))
filtered_text = [word for word in tokens if word.lower() not in stop_words]

print(filtered_text)

['sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


##### Working with Brown Corpus

In [26]:
from nltk.corpus import brown

# Collect three views of the Brown corpus: the genre names, the words of the
# 'news' genre and the sentences of the 'editorial' genre.
categories = brown.categories()
words = brown.words(categories='news')
sentences = brown.sents(categories='editorial')

# Print one preview per line: all genres, the first 20 news words and the
# first 2 editorial sentences.
print(categories, words[:20], sentences[:2], sep='\n')

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that']
[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.']]


In [27]:
from nltk import FreqDist

# Get all words from the corpus (no category filter is passed)
all_words = brown.words()

# Calculate frequency distribution over the tokens exactly as they appear;
# the output below shows punctuation tokens such as ',' and '.' being counted too.
fdist = FreqDist(all_words)

# Print the 10 most common words as (token, count) pairs
print(fdist.most_common(10))

[('the', 62713), (',', 58334), ('.', 49346), ('of', 36080), ('and', 27915), ('to', 25732), ('a', 21881), ('in', 19536), ('that', 10237), ('is', 10011)]


##### Removing Stopwords with a Reusable Function

In [29]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

text = "This is a sample sentence, showing off the stop words filtration."

def remove_stopwords(text):
    """Return `text` with its English stop words dropped.

    The text is tokenized with `word_tokenize`; a token is discarded when its
    lower-cased form is in NLTK's English stop-word list. The surviving tokens
    (punctuation included) are joined back together with single spaces.
    """
    english_stops = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in english_stops:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

print(remove_stopwords(text))

sample sentence , showing stop words filtration .


##### Testing the Sentiment Scores

In [46]:
# Add the local nltk_data folder to the list of locations NLTK searches.
nltk.data.path.append("/Users/QuangAP/nltk_data")
# Load the VADER lexicon file directly from its absolute path.
# NOTE(review): presumably returned as one text string — confirm the loader's format handling.
vader_lexicon = nltk.data.load("/Users/QuangAP/nltk_data/vader_lexicon/vader_lexicon.txt")
# Peek at the first 10 items of the loaded object (the output below shows the
# first 10 characters of the lexicon's first entry).
print(vader_lexicon[:10])

$:	-1.5	0.


##### Sentiment Analysis

In [48]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Path to the VADER lexicon file inside the local nltk_data folder
lexicon_score = "/Users/QuangAP/nltk_data/vader_lexicon/vader_lexicon.txt"
sid = SentimentIntensityAnalyzer(lexicon_score)

text = "Professor Quang is so cool!"
scores = sid.polarity_scores(text)

print("Sentiment scores: ", scores)

# Cut-offs for the normalized compound score (range -1 to +1). The VADER
# authors document +/-0.05 as the conventional thresholds; the previous +/-0.5
# labelled clearly positive or negative sentences as "Neutral".
POSITIVE_THRESHOLD = 0.05
NEGATIVE_THRESHOLD = -0.05

# Interpret the sentiment
if scores['compound'] >= POSITIVE_THRESHOLD:
    print("Positive")
elif scores['compound'] <= NEGATIVE_THRESHOLD:
    print("Negative")
else:
    print("Neutral")

Sentiment scores:  {'neg': 0.0, 'neu': 0.549, 'pos': 0.451, 'compound': 0.5079}
Positive


##### Using the Porter Stemmer

In [53]:
from nltk.stem import PorterStemmer
porter_stemmer = PorterStemmer()

# Sample vocabulary to reduce to stems
words = ["running", "jumps", "easily", "fairly"]

# Run every word through the Porter stemmer
stems = list(map(porter_stemmer.stem, words))

# Show the inputs next to their stems
print("Original words: ", words)
print("Stemed words: ", stems)

Original words:  ['running', 'jumps', 'easily', 'fairly']
Stemed words:  ['run', 'jump', 'easili', 'fairli']


##### Using the Snowball Stemmer

In [55]:
import nltk
from nltk.stem.snowball import SnowballStemmer

# Build a SnowballStemmer configured for English
snowball_stemmer = SnowballStemmer("english")

# Sample vocabulary to reduce to stems
words = ["running", "jumps", "easily", "fairly"]

# Run every word through the Snowball stemmer
stems = list(snowball_stemmer.stem(entry) for entry in words)

# Show the inputs next to their stems
print("Original words: ", words)
print("Stemed words: ", stems)

Original words:  ['running', 'jumps', 'easily', 'fairly']
Stemed words:  ['run', 'jump', 'easili', 'fair']


##### Using the Lancaster Stemmer

In [56]:
from nltk.stem import LancasterStemmer

# Build the Lancaster stemmer
lancaster_stemmer = LancasterStemmer()

# Sample vocabulary to reduce to stems
words = ["running", "jumps", "easily", "fairly"]

# Run every word through the Lancaster stemmer
stems = [*map(lancaster_stemmer.stem, words)]

# Show the inputs next to their stems
print("Original words: ", words)
print("Stemed words: ", stems)

Original words:  ['running', 'jumps', 'easily', 'fairly']
Stemed words:  ['run', 'jump', 'easy', 'fair']


##### Removing Punctuation

In [60]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

def remove_punctuation(text):
    """Return `text` with punctuation-only tokens removed.

    The text is tokenized with `word_tokenize`, and a token is dropped only
    when it contains no letter or digit at all (e.g. ",", "!", "@#").
    Tokens that merely include a punctuation character, such as "n't",
    "3.14" or "well-known", are kept. The earlier `isalnum()` filter
    discarded those as well, even though they are not punctuation.

    Args:
        text: The input string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Tokenize the text
    tokens = word_tokenize(text)
    # Keep every token that has at least one alphanumeric character
    words = [word for word in tokens if any(ch.isalnum() for ch in word)]
    # Join the words back into a single string
    return ' '.join(words)

text = "This is a sample sentence, showing off the stop words filtration!"
cleaned_text = remove_punctuation(text)

print(cleaned_text)

This is a sample sentence showing off the stop words filtration


[nltk_data] Downloading package punkt to /Users/QuangAP/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


##### Removing Numbers and Special Characters
- Using Regular Expression with NLTK

In [61]:
import re
import nltk
from nltk.tokenize import word_tokenize

# The 'punkt' tokenizer data must already be installed; to fetch it, run:
#nltk.download('punkt')

def remove_numbers_and_special_characters(text):
    """Return `text` reduced to its purely alphabetic tokens.

    The text is tokenized with `word_tokenize`; any token that is not made up
    entirely of letters (numbers, punctuation, symbols, mixed tokens) is
    dropped, and the remaining tokens are joined with single spaces.

    NOTE(review): the section heading mentions regular expressions, but this
    implementation relies on `str.isalpha` only.
    """
    alphabetic_tokens = filter(str.isalpha, word_tokenize(text))
    return ' '.join(alphabetic_tokens)

text = "This is a sample sentence, showing off the stop words filtration! It has numbers like 123 and special characters like @#!"
cleaned_text = remove_numbers_and_special_characters(text)

# Show the input next to the cleaned result
print("Original text:", text)
print("Cleaned text:", cleaned_text)

Original text: This is a sample sentence, showing off the stop words filtration! It has numbers like 123 and special characters like @#!
Cleaned text: This is a sample sentence showing off the stop words filtration It has numbers like and special characters like


- Using NLTK with POS Tagging to Keep Only Words

In [62]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources listed for this section
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

def remove_numbers_and_special_characters(text):
    """Return `text` reduced to its purely alphabetic tokens.

    Tokenizes `text` with `word_tokenize`, keeps only the tokens for which
    `str.isalpha()` is true, and joins them with single spaces.

    NOTE(review): this redefines the function of the same name from the
    previous cell with the same logic; the POS tagger downloaded above is not
    used here.
    """
    return ' '.join(
        token for token in word_tokenize(text) if token.isalpha()
    )

text = "This is a sample sentence, showing off the stop words filtration! It has numbers like 123 and special characters like @#!"
cleaned_text = remove_numbers_and_special_characters(text)

# Show the input next to the cleaned result
print("Original text:", text)
print("Cleaned text:", cleaned_text)

[nltk_data] Downloading package punkt to /Users/QuangAP/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/QuangAP/nltk_data...


Original text: This is a sample sentence, showing off the stop words filtration! It has numbers like 123 and special characters like @#!
Cleaned text: This is a sample sentence showing off the stop words filtration It has numbers like and special characters like


[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/QuangAP/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
