In [1]:
# NLP Study Plan
# Author: Oscar Tibaduiza
# Date: 23/05/2024

In [2]:
# Introduction
# This notebook presents a study plan to learn Natural Language Processing (NLP). 
# The goal is to provide a structured guide covering basic concepts to more advanced topics.

# ------------------------------
# 1. Fundamentals of NLP
# ------------------------------

# 1.1. Introduction to NLP
# - What is NLP?
# - Applications of NLP
# - Popular tools and libraries (NLTK, spaCy, Hugging Face Transformers)

# 1.2. Text Preprocessing
# - Tokenization
# - Normalization (lowercasing, lemmatization, stemming)
# - Stop words removal
# - Regular expressions for text cleaning

In [3]:
# Practical exercise: Tokenization and basic preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [4]:
# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oscar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oscar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\oscar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Example text
text = "Hello! How are you? This is a sample text for basic preprocessing."

In [6]:
# Tokenization
tokens = word_tokenize(text.lower())
print("Tokens:", tokens)

Tokens: ['hello', '!', 'how', 'are', 'you', '?', 'this', 'is', 'a', 'sample', 'text', 'for', 'basic', 'preprocessing', '.']


In [7]:
# Stop words removal
stop_words = set(stopwords.words('english'))
tokens_without_stopwords = [word for word in tokens if word not in stop_words]
print("Tokens without stopwords:", tokens_without_stopwords)

Tokens without stopwords: ['hello', '!', '?', 'sample', 'text', 'basic', 'preprocessing', '.']


In [8]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens_without_stopwords]
print("Lemmatized tokens:", lemmatized_tokens)

Lemmatized tokens: ['hello', '!', '?', 'sample', 'text', 'basic', 'preprocessing', '.']


In [9]:
# ------------------------------
# 2. Text Representation
# ------------------------------

In [10]:
# 2.1. Bag of Words (BoW)
# - Concept of bag of words
# - Building a bag of words

In [11]:
# Practical exercise: Create a bag of words
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print("Bag of Words:\n", X.toarray())
print("Features:\n", vectorizer.get_feature_names_out())

Bag of Words:
 [[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]
Features:
 ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


In [12]:
# 2.2. TF-IDF
# - Concept of TF-IDF
# - Implementation of TF-IDF

In [13]:
# Practical exercise: Create a TF-IDF matrix
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)
print("TF-IDF Matrix:\n", X_tfidf.toarray())
print("TF-IDF Features:\n", tfidf_vectorizer.get_feature_names_out())

TF-IDF Matrix:
 [[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]
TF-IDF Features:
 ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


In [14]:
# ------------------------------
# 3. Language Models
# ------------------------------

# 3.1. N-gram Models
# - Introduction to n-grams
# - Implementation of n-gram models

In [15]:
# Practical exercise: Generate n-grams
from nltk import ngrams

bigrams = list(ngrams(tokens, 2))
print("Bigrams:", bigrams)

Bigrams: [('hello', '!'), ('!', 'how'), ('how', 'are'), ('are', 'you'), ('you', '?'), ('?', 'this'), ('this', 'is'), ('is', 'a'), ('a', 'sample'), ('sample', 'text'), ('text', 'for'), ('for', 'basic'), ('basic', 'preprocessing'), ('preprocessing', '.')]


In [16]:
# 3.2. Word Embeddings
# - Concept of word embeddings
# - Introduction to Word2Vec and GloVe

In [17]:
import scipy
print(scipy.__version__)

1.10.0


In [18]:
from scipy.linalg import triu

In [19]:
# Practical exercise: Train a Word2Vec model
from gensim.models import Word2Vec

sentences = [word_tokenize(doc.lower()) for doc in corpus]
word2vec_model = Word2Vec(sentences, vector_size=50, window=5, min_count=1, workers=4)
print("Vector for 'document':", word2vec_model.wv['document'])

Vector for 'document': [-1.07023621e-03  4.75227891e-04  1.02121495e-02  1.80161148e-02
 -1.86066683e-02 -1.42376255e-02  1.29124559e-02  1.79410204e-02
 -1.00314002e-02 -7.53031159e-03  1.47636728e-02 -3.07031744e-03
 -9.07182693e-03  1.31084993e-02 -9.72215272e-03 -3.63255502e-03
  5.75860823e-03  1.98871316e-03 -1.65728815e-02 -1.88926701e-02
  1.46204550e-02  1.01437848e-02  1.35136731e-02  1.52757065e-03
  1.27034336e-02 -6.80697383e-03 -1.89409894e-03  1.15419105e-02
 -1.50391106e-02 -7.87741225e-03 -1.50275519e-02 -1.86425447e-03
  1.90778263e-02 -1.46398256e-02 -4.67134174e-03 -3.87989124e-03
  1.61594115e-02 -1.18607190e-02  8.52437806e-05 -9.51008499e-03
 -1.92053970e-02  1.00120399e-02 -1.75162610e-02 -8.78598820e-03
 -6.87807769e-05 -5.96724800e-04 -1.53191071e-02  1.92321129e-02
  9.96452942e-03  1.84679516e-02]


In [20]:
# ------------------------------
# 4. Advanced Models and Applications
# ------------------------------

In [21]:
# 4.1. Transformers and BERT
# - Introduction to transformers
# - Basic implementation of BERT for text classification

In [22]:
import tensorflow as tf
print(tf.__version__)

2.16.1


In [23]:
import torch
print(torch.__version__)

2.3.0+cpu


In [25]:
# Practical exercise: Text classification with BERT
from transformers import pipeline

classifier = pipeline('sentiment-analysis')
result = classifier("This NLP course is amazing!")
print("Classification result:", result)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.





To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Classification result: [{'label': 'POSITIVE', 'score': 0.9998804330825806}]


In [26]:
# 4.2. Applications of NLP
# - Sentiment analysis
# - Text summarization
# - Machine translation
# - Chatbots

In [27]:
# ------------------------------
# 5. Final Projects
# ------------------------------

In [28]:
# 5.1. Project 1: Sentiment Analysis
# - Collect social media data
# - Preprocess data
# - Train a sentiment analysis model

In [29]:
# 5.2. Project 2: Text Generation
# - Implement a text generation model
# - Train the model with a large corpus
# - Evaluate the quality of the generated text

In [30]:
# ------------------------------
# Additional Resources
# ------------------------------

In [31]:
# - Recommended books and articles
# - Online courses and tutorials
# - Code repositories and datasets

# End of the study plan