<a href="https://colab.research.google.com/github/Sophiewilliamson2/solar-system-assignment/blob/main/NLTK_Tutorial_Week2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing With Python's NLTK (Week 2)
_Starter notebook generated for Sophie (MSc AI)._

**Topics:** Getting Started · Tokenizing · Stop Words · Stemming · POS Tagging · Lemmatizing · Chunking · Chinking · NER · Concordance · Dispersion Plot · Frequency Distribution · Collocations

> Each section mirrors the corresponding section of the Real Python tutorial *Natural Language Processing With Python's NLTK Package*.

In [None]:
# DISCLAIMER: This code was assisted by ChatGPT.
# --- Setup: installs & NLTK data ---
!pip -q install nltk==3.8.1 matplotlib numpy
import nltk
nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger'); nltk.download('maxent_ne_chunker'); nltk.download('words')
nltk.download('book')
print('Setup complete.')

## 1) Tokenizing (sentences & words)

In [None]:
# DISCLAIMER: This code was assisted by ChatGPT.
from nltk.tokenize import sent_tokenize, word_tokenize

# Sample passage; the newline right after the opening quotes and the one
# before the closing quotes are part of the string.
example_string = """
Muad'Dib learned rapidly because his first training was in how to learn.
And the first lesson of all was the basic trust that he could learn.
It's shocking to find how many people do not believe they can learn,
and how many more believe learning to be difficult.
"""

# Split the same passage two ways: into sentences and into word tokens.
sents, words = sent_tokenize(example_string), word_tokenize(example_string)

print('Sentences:', sents)
print('\nFirst 20 word tokens:', words[:20])

## 2) Filtering Stop Words

In [None]:
# DISCLAIMER: This code was assisted by ChatGPT.
from nltk.corpus import stopwords

worf_quote = "Sir, I protest. I am not a merry man!"
tokens = word_tokenize(worf_quote)

# English stop-word list, held as a set for membership tests.
stop_words = set(stopwords.words('english'))

# Keep only the tokens whose case-folded form is not in the stop-word set.
filtered = list(filter(lambda token: token.casefold() not in stop_words, tokens))

print('Original:', tokens)
print('Filtered:', filtered)

## 3) Stemming (Porter)

In [None]:
# DISCLAIMER: This code was assisted by ChatGPT.
from nltk.stem import PorterStemmer

string_for_stemming = """
The crew of the USS Discovery discovered many discoveries.
Discovering is what explorers do.
"""

stemmer = PorterStemmer()
tokens = word_tokenize(string_for_stemming)

# Stem every token, keeping the result aligned index-for-index with `tokens`.
stems = list(map(stemmer.stem, tokens))

# Cell output: the first 20 (token, stem) pairs.
list(zip(tokens, stems))[:20]

## 4) POS Tagging

In [None]:
# DISCLAIMER: This code was assisted by ChatGPT.
import nltk

sagan_quote = """
If you wish to make an apple pie from scratch,
you must first invent the universe.
"""

# Tokenize the quote, then tag each token with its part of speech.
words_in_sagan = word_tokenize(sagan_quote)
pos_tags = nltk.pos_tag(words_in_sagan)

# Cell output: the first 20 entries of the tagger's result.
pos_tags[:20]

## 5) Lemmatizing (WordNet)

In [None]:
# DISCLAIMER: This code was assisted by ChatGPT.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Single-word lookups. The second call omits `pos`; the third passes pos='a',
# which the printed label describes as the adjective reading.
print('scarves ->', lemmatizer.lemmatize('scarves'))
print('worst (noun) ->', lemmatizer.lemmatize('worst'))
print('worst (adjective) ->', lemmatizer.lemmatize('worst', pos='a'))

# Lemmatize a whole sentence token by token, without passing `pos`.
string_for_lemmatizing = 'The friends of DeSoto love scarves.'
words2 = word_tokenize(string_for_lemmatizing)
lemmas = list(map(lemmatizer.lemmatize, words2))

# Cell output: (token, lemma) pairs.
list(zip(words2, lemmas))

## 6) Chunking (NP grammar)

In [None]:
# DISCLAIMER: This code was assisted by ChatGPT.
lotr_quote = "It's a dangerous business, Frodo, going out your door."

# Tokenize and POS-tag the quote; `lotr_pos` is reused by later sections.
words_in_lotr = word_tokenize(lotr_quote)
lotr_pos = nltk.pos_tag(words_in_lotr)

# NP rule: an optional <DT>, zero or more <JJ>, then exactly one <NN>.
grammar = "NP: {<DT>?<JJ>*<NN>}"
chunk_parser = nltk.RegexpParser(grammar)

tree = chunk_parser.parse(lotr_pos)

# Cell output: the parsed tree.
tree

## 7) Chinking (exclude adjectives)

In [None]:
# DISCLAIMER: This code was assisted by ChatGPT.
# Two-step grammar: the {...} rule chunks every run of tags, then the }...{
# rule chinks <JJ> tokens back out of those chunks.
grammar2 = """
Chunk: {<.*>+}
       }<JJ>{
"""

chunk_parser2 = nltk.RegexpParser(grammar2)

# Reuses the tagged LOTR quote produced in the chunking section.
tree2 = chunk_parser2.parse(lotr_pos)

# Cell output: the parsed tree.
tree2

## 8) Named Entity Recognition (NER)

In [None]:
# DISCLAIMER: This code was assisted by ChatGPT.
# Run NLTK's named-entity chunker over the tagged LOTR quote from section 6.
ner_tree = nltk.ne_chunk(lotr_pos)

# Cell output: the resulting tree.
ner_tree

## 9) NLTK Book Corpora

In [None]:
# DISCLAIMER: This code was assisted by ChatGPT.
# Import the book texts by name instead of `from nltk.book import *`.
# The wildcard form also binds nltk.book's own `sents` helper, which would
# silently overwrite the `sents` sentence list built in the tokenizing section,
# and it hides where `text2` / `text8` in later cells come from.
from nltk.book import (
    text1, text2, text3, text4, text5, text6, text7, text8, text9,
    texts,
)

# List the loaded texts.
texts()

## 10) Concordance

In [None]:
# DISCLAIMER: This code was assisted by ChatGPT.
# Show the concordance listing for each search term in turn.
for search_term in ('man', 'woman'):
    text8.concordance(search_term)

## 11) Dispersion Plot

In [None]:
# DISCLAIMER: This code was assisted by ChatGPT.
# NOTE(review): the pinned NLTK release reportedly mislabels the y-axis of
# dispersion_plot (word labels in the wrong order) — confirm the labels
# against concordance output before drawing conclusions from these plots.
text8_targets = ['woman','lady','girl','gal','man','gentleman','boy','guy']
text2_targets = ['Allenham','Whitwell','Cleveland','Combe']

# One plot per text, showing where each target word occurs.
text8.dispersion_plot(text8_targets)
text2.dispersion_plot(text2_targets)

## 12) Frequency Distribution

In [None]:
# DISCLAIMER: This code was assisted by ChatGPT.
from nltk import FreqDist
from nltk.corpus import stopwords

# Frequencies over every token in text8.
fd_all = FreqDist(text8)
print(fd_all.most_common(20))

# Same count after dropping tokens whose case-folded form is an English
# stop word.
stop = set(stopwords.words('english'))
meaningful = list(filter(lambda token: token.casefold() not in stop, text8))
fd_meaningful = FreqDist(meaningful)
print(fd_meaningful.most_common(20))

# Cumulative frequency plot of the 20 most common remaining tokens.
fd_meaningful.plot(20, cumulative=True)

## 13) Collocations (raw vs lemmatized)

In [None]:
# DISCLAIMER: This code was assisted by ChatGPT.
from nltk.stem import WordNetLemmatizer

# Collocations of text8 exactly as loaded.
print('Raw collocations:')
text8.collocations()

# Lemmatize every token, wrap the result in a new Text, and repeat the search
# so the two listings can be compared.
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, text8))
new_text = nltk.Text(lemmatized)

print('\nLemmatized collocations:')
new_text.collocations()