In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/patash/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize

doc = 'Why, sometimes I`ve believed as many as 6 impossible things before breakfast?'
tokens = word_tokenize(doc)
print(tokens)

span = tokens[1:7]  # Indices des mots
print(span)

['Why', ',', 'sometimes', 'I', '`', 've', 'believed', 'as', 'many', 'as', '6', 'impossible', 'things', 'before', 'breakfast', '?']
[',', 'sometimes', 'I', '`', 've', 'believed']


In [None]:
# Stopwords Removal + syntaxe 

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
doc = '''Linguistics and Natural Language Processing (NLP) are closely linked. \n Linguistics is the scientific study of language, encompassing its structure, meaning, and context. \n It provides foundational knowledge about language syntax, semantics, pragmatics, and phonetics. \n NLP applies this linguistic knowledge in computational algorithms to enable computers to understand, interpret, and generate human language. By leveraging linguistic principles, NLP seeks to bridge the gap between human communication and computer understanding, enabling tasks like translation, sentiment analysis, and voice recognition.'''

# Tokenisation
tokens = word_tokenize(doc)

# Nettoyage: suppression des caractères non alphabétiques
tokens_cleaned = [re.sub(r'\W+', '', token) for token in tokens if re.sub(r'\W+', '', token) != '']

# Filtrage des stop words
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens_cleaned if word.lower() not in stop_words]

print("Original Tokens:", tokens)
print("Filtered Tokens:", filtered_tokens)

Original Tokens: ['Linguistics', 'and', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', 'are', 'closely', 'linked', '.', 'Linguistics', 'is', 'the', 'scientific', 'study', 'of', 'language', ',', 'encompassing', 'its', 'structure', ',', 'meaning', ',', 'and', 'context', '.', 'It', 'provides', 'foundational', 'knowledge', 'about', 'language', 'syntax', ',', 'semantics', ',', 'pragmatics', ',', 'and', 'phonetics', '.', 'NLP', 'applies', 'this', 'linguistic', 'knowledge', 'in', 'computational', 'algorithms', 'to', 'enable', 'computers', 'to', 'understand', ',', 'interpret', ',', 'and', 'generate', 'human', 'language', '.', 'By', 'leveraging', 'linguistic', 'principles', ',', 'NLP', 'seeks', 'to', 'bridge', 'the', 'gap', 'between', 'human', 'communication', 'and', 'computer', 'understanding', ',', 'enabling', 'tasks', 'like', 'translation', ',', 'sentiment', 'analysis', ',', 'and', 'voice', 'recognition', '.']
Filtered Tokens: ['Linguistics', 'Natural', 'Language', 'Processing', 'NLP'

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Stemming - pour retirer la racine des mots

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stemmed = [stemmer.stem(word) for word in filtered_tokens]
print(stemmed)

['linguist', 'natur', 'languag', 'process', 'nlp', 'close', 'link', 'linguist', 'scientif', 'studi', 'languag', 'encompass', 'structur', 'mean', 'context', 'provid', 'foundat', 'knowledg', 'languag', 'syntax', 'semant', 'pragmat', 'phonet', 'nlp', 'appli', 'linguist', 'knowledg', 'comput', 'algorithm', 'enabl', 'comput', 'understand', 'interpret', 'gener', 'human', 'languag', 'leverag', 'linguist', 'principl', 'nlp', 'seek', 'bridg', 'gap', 'human', 'commun', 'comput', 'understand', 'enabl', 'task', 'like', 'translat', 'sentiment', 'analysi', 'voic', 'recognit']


In [None]:
# Lemmatization - transformer les mots en forme initiale/basique: suis - être
!pip install spacy
import spacy

#load spaCy's English language model
nlp = spacy.load('en_core_web_sm')

doc = nlp(" ".join(filtered_tokens))

lemmatized = [token.lemma_ for token in doc]
print('Lemmatized: ', lemmatized)


Lemmatized:  ['Linguistics', 'Natural', 'Language', 'Processing', 'NLP', 'closely', 'link', 'Linguistics', 'scientific', 'study', 'language', 'encompass', 'structure', 'mean', 'context', 'provide', 'foundational', 'knowledge', 'language', 'syntax', 'semantic', 'pragmatic', 'phonetic', 'NLP', 'apply', 'linguistic', 'knowledge', 'computational', 'algorithm', 'enable', 'computer', 'understand', 'interpret', 'generate', 'human', 'language', 'leverage', 'linguistic', 'principle', 'NLP', 'seek', 'bridge', 'gap', 'human', 'communication', 'computer', 'understanding', 'enable', 'task', 'like', 'translation', 'sentiment', 'analysis', 'voice', 'recognition']


In [None]:
doc = '''Charles Lutwidge Dodgson, better known by his pen name Lewis Carroll, was an English author, poet, mathematician and photographer. His most notable works are Alice's Adventures in Wonderland (1865) and its sequel Through the Looking-Glass (1871).'''

tokens = word_tokenize(doc)

# Initialiser le stemmer
stemmer = PorterStemmer()
# Appliquer le stemming aux tokens
stemmed = [stemmer.stem(word) for word in tokens]
print('Stemmed:' ,stemmed)

# Charger le modèle de langue anglaise de spaCy
nlp = spacy.load('en_core_web_sm')

# Traiter le texte avec spaCy
doc = nlp(doc)

# Filtrer les mots vides et les caractères non alphabétiquesc
stop_words = set(stopwords.words('english'))
filtered_doc = [word for word in doc if word not in stopwords.words('english')]

# Appliquer la lemmatisation
lemmatized = [token.lemma_ for token in filtered_doc]
print('Lemmatized: ', lemmatized)

Stemmed: ['charl', 'lutwidg', 'dodgson', ',', 'better', 'known', 'by', 'hi', 'pen', 'name', 'lewi', 'carrol', ',', 'wa', 'an', 'english', 'author', ',', 'poet', ',', 'mathematician', 'and', 'photograph', '.', 'hi', 'most', 'notabl', 'work', 'are', 'alic', "'s", 'adventur', 'in', 'wonderland', '(', '1865', ')', 'and', 'it', 'sequel', 'through', 'the', 'looking-glass', '(', '1871', ')', '.']
Lemmatized:  ['Charles', 'Lutwidge', 'Dodgson', ',', 'well', 'know', 'by', 'his', 'pen', 'name', 'Lewis', 'Carroll', ',', 'be', 'an', 'english', 'author', ',', 'poet', ',', 'mathematician', 'and', 'photographer', '.', 'his', 'most', 'notable', 'work', 'be', 'Alice', "'s", 'Adventures', 'in', 'Wonderland', '(', '1865', ')', 'and', 'its', 'sequel', 'through', 'the', 'Looking', '-', 'Glass', '(', '1871', ')', '.']


In [None]:
# POS (Part Of Speech) - NNP - Nom propre singulier

nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag

tagged = pos_tag(tokens)
for word, tag in tagged:
    print(f'Word: {word}, POS: {tag}')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/patash/nltk_data...


Word: Charles, POS: NNP
Word: Lutwidge, POS: NNP
Word: Dodgson, POS: NNP
Word: ,, POS: ,
Word: better, POS: JJR
Word: known, POS: VBN
Word: by, POS: IN
Word: his, POS: PRP$
Word: pen, POS: JJ
Word: name, POS: NN
Word: Lewis, POS: NNP
Word: Carroll, POS: NNP
Word: ,, POS: ,
Word: was, POS: VBD
Word: an, POS: DT
Word: English, POS: JJ
Word: author, POS: NN
Word: ,, POS: ,
Word: poet, POS: NN
Word: ,, POS: ,
Word: mathematician, POS: JJ
Word: and, POS: CC
Word: photographer, POS: NN
Word: ., POS: .
Word: His, POS: PRP$
Word: most, POS: RBS
Word: notable, POS: JJ
Word: works, POS: NNS
Word: are, POS: VBP
Word: Alice, POS: NNP
Word: 's, POS: POS
Word: Adventures, POS: NNS
Word: in, POS: IN
Word: Wonderland, POS: NNP
Word: (, POS: (
Word: 1865, POS: CD
Word: ), POS: )
Word: and, POS: CC
Word: its, POS: PRP$
Word: sequel, POS: NN
Word: Through, POS: IN
Word: the, POS: DT
Word: Looking-Glass, POS: NNP
Word: (, POS: (
Word: 1871, POS: CD
Word: ), POS: )
Word: ., POS: .


[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Named Entity Recognition (NER)

from nltk import pos_tag, ne_chunk, word_tokenize

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

doc = 'Apple is looking at buying U.K. startup for $1 billion'
tokens = word_tokenize(doc)

tagged = pos_tag(tokens)

entities = ne_chunk(tagged)
print(entities)

(S
  (GPE Apple/NNP)
  is/VBZ
  looking/VBG
  at/IN
  buying/VBG
  U.K./NNP
  startup/NN
  for/IN
  $/$
  1/CD
  billion/CD)


[nltk_data] Downloading package punkt to /Users/patash/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/patash/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/patash/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/patash/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
# Vectorization
# Bag of Words
from collections import Counter
doc = 'Why, sometimes I`ve believed as many as 6 impossible things before breakfast. There goes the shawl again!'
tokens = word_tokenize(doc)

stemmer = PorterStemmer()
stemmed = [stemmer.stem(word) for word in tokens]
filtered_words = [word for word in tokens if not word.lower() in stopwords.words('english')]

bow = Counter(filtered_words)
print(bow)


Counter({',': 1, 'sometimes': 1, '`': 1, 'believed': 1, 'many': 1, '6': 1, 'impossible': 1, 'things': 1, 'breakfast': 1, '.': 1, 'goes': 1, 'shawl': 1, '!': 1})


In [None]:
# Créer le vocabulaire de plusiers documents 
from sklearn.feature_extraction.text import CountVectorizer

docs = ['This is the first document.','This document is the second document.','And this is the third one.', 'This one is the 4th']

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)

print("Feature names:", vectorizer.get_feature_names_out())
print("Vectorized representation:\n", X.toarray())

Feature names: ['4th' 'and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
Vectorized representation:
 [[0 0 1 1 1 0 0 1 0 1]
 [0 0 2 0 1 0 1 1 0 1]
 [0 1 0 0 1 1 0 1 1 1]
 [1 0 0 0 1 1 0 1 0 1]]


In [21]:
# TF-IDF (Term Frequency-Inverse Document Frequency)

from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Load the Documents
documents = [
    "I love reading books.",
    "Reading in the morning is refreshing.",
    "I love morning coffee."
]

# Step 2: Apply TF-IDF Vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Step 3: Analyze the Output
# Print the vocabulary (unique words)
print("Vocabulary:", vectorizer.get_feature_names_out())

# Print the TF-IDF values for each document
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())

Vocabulary: ['books' 'coffee' 'in' 'is' 'love' 'morning' 'reading' 'refreshing' 'the']
TF-IDF Matrix:
[[0.68091856 0.         0.         0.         0.51785612 0.
  0.51785612 0.         0.        ]
 [0.         0.         0.44036207 0.44036207 0.         0.3349067
  0.3349067  0.44036207 0.44036207]
 [0.         0.68091856 0.         0.         0.51785612 0.51785612
  0.         0.         0.        ]]


In [3]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

processed_poems = []
with open('/Users/patash/PSTB/Week_5/day_1/famous_poems.json', 'r') as f:
    data = json.load(f)

#cleaning stopwords
for poem in data:
    tokens = word_tokenize(poem['text'])
    filtered_tokens = [word for word in tokens if not word.lower() in stopwords.words('english') and word.isalpha()]
    processed_poems.append(' '.join(filtered_tokens))

print(processed_poems)


['Two roads diverged yellow wood sorry could travel one traveler long stood looked one far could bent undergrowth', 'Whose woods think know house village though see stopping watch woods fill snow', 'keep head losing blaming trust men doubt make allowance doubting']


In [4]:
!pip install nltk gensim
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import gensim.downloader as api
nltk.download('punkt')

dataset = api.load("text8")

Collecting FuzzyTM>=0.4.0 (from gensim)
  Downloading FuzzyTM-2.0.9-py3-none-any.whl.metadata (7.9 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim)
  Downloading pyFUME-0.3.4-py3-none-any.whl.metadata (9.7 kB)
Collecting scipy>=1.7.0 (from gensim)
  Downloading scipy-1.10.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (100 kB)
Collecting numpy>=1.18.5 (from gensim)
  Downloading numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl.metadata (5.6 kB)
Collecting simpful==2.12.0 (from pyfume->FuzzyTM>=0.4.0->gensim)
  Downloading simpful-2.12.0-py3-none-any.whl.metadata (4.8 kB)
Collecting fst-pso==1.8.1 (from pyfume->FuzzyTM>=0.4.0->gensim)
  Downloading fst-pso-1.8.1.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pandas (from FuzzyTM>=0.4.0->gensim)
  Downloading pandas-1.5.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting miniful (from fst-pso==1.8.1->pyfume->FuzzyTM>=0.4.0->gensim)
  Downloading miniful-0.0.6.tar.gz (2.8 kB)
  Preparing metadata

[nltk_data] Downloading package punkt to /Users/patash/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




In [7]:

model = Word2Vec(sentences=dataset, vector_size=100, window=5, min_count=1, workers=4)

example_result = model.wv.most_similar(positive=['woman', 'king'], negative=['savory'], topn=1)
print(example_result)

[('queen', 0.5899737477302551)]


In [9]:
import gensim.downloader as api

# Load the pre-trained Word2Vec model (Google News 300D)
word2vec_model = api.load("word2vec-google-news-300")

# Find the top 5 most similar words to "king"
similar_words = word2vec_model.most_similar("king", topn=5)

# Print the results
for word, similarity in similar_words:
    print(f"Word: {word}, Similarity: {similarity}")

Word: kings, Similarity: 0.7138046622276306
Word: queen, Similarity: 0.6510956287384033
Word: monarch, Similarity: 0.6413194537162781
Word: crown_prince, Similarity: 0.6204219460487366
Word: prince, Similarity: 0.6159993410110474


In [10]:
# SPAM
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv("/Users/patash/PSTB/Week_5/day_1/spam.csv", encoding="latin-1")
df = df[["v1", "v2"]]  # Keep only the label and text columns
df.columns = ["label", "text"]

# Convert labels to binary (spam = 1, ham = 0)
df["label"] = df["label"].map({"spam": 1, "ham": 0})

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)

In [13]:
print(df.head())

   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


In [11]:
# Step 2: Feature Extraction With TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [12]:
# Step 3: Train A Naive Bayes Classifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Train the model
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9668161434977578
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

