# Imports

In [6]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Sentence Tokenization

In [42]:
text = "Chocolate Chip Cookies: Combine 2 cups of flour, 1 cup of sugar, 1 cup of brown sugar, 1 tsp of baking soda, and a pinch of salt. Cream together with 1 cup of softened butter, add 2 eggs and 2 tsp of vanilla extract. Fold in 2 cups of chocolate chips. Drop spoonfuls on a baking sheet and bake at 350°F for 10 minutes until golden brown."

In [43]:
text

'Chocolate Chip Cookies: Combine 2 cups of flour, 1 cup of sugar, 1 cup of brown sugar, 1 tsp of baking soda, and a pinch of salt. Cream together with 1 cup of softened butter, add 2 eggs and 2 tsp of vanilla extract. Fold in 2 cups of chocolate chips. Drop spoonfuls on a baking sheet and bake at 350°F for 10 minutes until golden brown.'

In [44]:
sent_tokenize(text)

['Chocolate Chip Cookies: Combine 2 cups of flour, 1 cup of sugar, 1 cup of brown sugar, 1 tsp of baking soda, and a pinch of salt.',
 'Cream together with 1 cup of softened butter, add 2 eggs and 2 tsp of vanilla extract.',
 'Fold in 2 cups of chocolate chips.',
 'Drop spoonfuls on a baking sheet and bake at 350°F for 10 minutes until golden brown.']

# Word Tokenization

In [45]:
sentence = sent_tokenize(text)[0]

In [46]:
print(word_tokenize(sentence))

['Chocolate', 'Chip', 'Cookies', ':', 'Combine', '2', 'cups', 'of', 'flour', ',', '1', 'cup', 'of', 'sugar', ',', '1', 'cup', 'of', 'brown', 'sugar', ',', '1', 'tsp', 'of', 'baking', 'soda', ',', 'and', 'a', 'pinch', 'of', 'salt', '.']


# POS Tagging

In [47]:
words = word_tokenize(sentence)
print(pos_tag(words))

[('Chocolate', 'NNP'), ('Chip', 'NNP'), ('Cookies', 'NNPS'), (':', ':'), ('Combine', 'VB'), ('2', 'CD'), ('cups', 'NNS'), ('of', 'IN'), ('flour', 'NN'), (',', ','), ('1', 'CD'), ('cup', 'NN'), ('of', 'IN'), ('sugar', 'NN'), (',', ','), ('1', 'CD'), ('cup', 'NN'), ('of', 'IN'), ('brown', 'JJ'), ('sugar', 'NN'), (',', ','), ('1', 'CD'), ('tsp', 'NN'), ('of', 'IN'), ('baking', 'VBG'), ('soda', 'NN'), (',', ','), ('and', 'CC'), ('a', 'DT'), ('pinch', 'NN'), ('of', 'IN'), ('salt', 'NN'), ('.', '.')]


# Stop Words

In [30]:
stop_words = set(stopwords.words('english'))
print(stop_words)

{'your', 'm', 'hers', 'hadn', 'himself', 'yourself', 'a', 'had', 'own', "hadn't", 'have', 'she', 'weren', "should've", 'are', 'with', 'me', 'about', "you're", 'there', 'so', "it's", 'who', "weren't", 'most', 'ourselves', 'didn', "wasn't", 'for', 'wouldn', 'between', 'of', 'has', 'where', 'he', 'when', 'to', 've', 'don', 'above', 'at', 'then', 'once', "didn't", "isn't", 'll', 'aren', 'herself', 'hasn', "don't", 'does', 'all', 'that', 'very', 'the', "aren't", 'out', 'being', 'or', 'by', 'wasn', 'into', 'through', "shouldn't", 'other', 'here', 'not', 'yours', 'having', 'isn', 'both', 'no', 'its', 'while', 'is', 'mustn', 'again', 'after', 'his', 'now', 'can', 'themselves', 'ain', 'few', 'do', 'if', 'doesn', 'haven', "won't", "mustn't", 'him', 'shouldn', 'those', "needn't", "mightn't", 'ma', 'against', 'same', 'whom', 'it', 's', 'further', "shan't", 'been', 'am', 'd', 'off', "haven't", 'theirs', "couldn't", 'from', 'nor', 'too', 'doing', 'just', 'their', 're', 'what', 'them', 'this', 'some'

In [32]:
words = word_tokenize(sentence)
filtered_sentence = [w for w in words if not w.lower() in stop_words]
print(filtered_sentence)

['Chocolate', 'Chip', 'Cookies', ':', 'Combine', '2', 'cups', 'flour', ',', '1', 'cup', 'sugar', ',', '1', 'cup', 'brown', 'sugar', ',', '1', 'tsp', 'baking', 'soda', ',', 'pinch', 'salt', '.']


# Stemmer

In [33]:
stemmer = PorterStemmer()
sentence = "There are several types of stemming algorithms."
words = word_tokenize(sentence)
stemmed_words = [stemmer.stem(word) for word in words]
print("Stemmed Words:", stemmed_words)

Stemmed Words: ['there', 'are', 'sever', 'type', 'of', 'stem', 'algorithm', '.']


# Lemmatizer

In [34]:
lemmatizer = WordNetLemmatizer()
sentence = "The leaves on the tree are falling."
words = word_tokenize(sentence)
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
print("Lemmatized Words:", lemmatized_words)

Lemmatized Words: ['The', 'leaf', 'on', 'the', 'tree', 'are', 'falling', '.']


# Regular Expressions

In [35]:
import re

In [36]:
text = "my name is Ramiz."
re.search(r"\bname\b",text)

<re.Match object; span=(3, 7), match='name'>

In [37]:
re.findall(r"name",text)

['name']

# BOW

In [38]:
from collections import Counter

In [48]:
fruits = ["apple","mango","apple"]
Counter(fruits)

Counter({'apple': 2, 'mango': 1})

# Most Preprocessing

In [39]:
# First we word tokenize and then we remove stop words

# File Reading

In [None]:
with open("file.txt", 'r') as file:
    file_content = file.read()

# TF-IDF

In [None]:
def doc_word(word):
    count = 0
    for file in file_BOW:
        if word in file_BOW[file]:
            count+=1
    return count

def word_count(word,file):
    count = 0
    for w in file_BOW[file]:
        if word == w:
            count+=1
    return count
np.log(5/doc_word(word))*word_count(word,file)/len(file_BOW[file])

# Word2Vec

In [52]:
import gensim.downloader as api

In [None]:
model = api.load("word2vec-google-news-300")

In [None]:
word_vector = model['queen']-model['woman']+model['man']

In [None]:
similar_words = model.most_similar(positive=[word_vector],topn=3)
print(similar_words)

# Visualize Word2Vec

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

words = ['king', 'queen', 'man', 'woman', 'paris', 'berlin', 'france', 'germany', 'apple', 'microsoft', 'google', 'facebook']

word_vectors = np.array([model[word] for word in words])

pca = PCA(n_components=2)
word_vectors_2d = pca.fit_transform(word_vectors)

plt.figure(figsize=(10, 8))
plt.scatter(word_vectors_2d[:, 0], word_vectors_2d[:, 1])

for i, word in enumerate(words):
    plt.annotate(word, xy=(word_vectors_2d[i, 0], word_vectors_2d[i, 1]))

plt.show()

# Classification using Word2Vec

In [None]:
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

def document_vector(doc):
    doc = [word for word in word_tokenize(doc.lower()) if word in model.key_to_index]
    return np.mean(model[doc], axis=0) if doc else np.zeros(300)

X_w2v = np.array([document_vector(text) for text in texts])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X_w2v, labels, test_size=0.2, random_state=42)

model_w2v = LogisticRegression(max_iter=1000)
model_w2v.fit(X_train, y_train)

predictions_w2v = model_w2v.predict(X_test)
accuracy_w2v = accuracy_score(y_test, predictions_w2v)
print(f"Accuracy with Word2Vec embeddings: {accuracy_w2v}")