<a href="https://colab.research.google.com/github/Sahel-Eskandar/Text-Mining-Classification-Analysis/blob/main/Text_Feature_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [2]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, PCA
from sklearn.manifold import TSNE
import numpy as np
import spacy

# Input text
text = "Natural Language Processing (NLP) is a subfield of computer science, " \
       "artificial intelligence, and computational linguistics concerned with " \
       "the interactions between computers and human (natural) languages. " \
       "It focuses on how to program computers to process and analyze large " \
       "amounts of natural language data."

# Tokenize the text
tokens = word_tokenize(text)

In [3]:
print(len(tokens), tokens[:20])

50 ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'computer', 'science', ',', 'artificial', 'intelligence', ',', 'and', 'computational', 'linguistics', 'concerned']


In [4]:
# CountVectorizer
count_vec = CountVectorizer()
X_count = count_vec.fit_transform([text])
print('CountVectorizer:')
print(count_vec.get_feature_names_out()[:10])
print(X_count.toarray()[0][:10])

CountVectorizer:
['amounts' 'analyze' 'and' 'artificial' 'between' 'computational'
 'computer' 'computers' 'concerned' 'data']
[1 1 3 1 1 1 1 2 1 1]


In [5]:
# TF-IDF
tfidf_vec = TfidfVectorizer()
X_tfidf = tfidf_vec.fit_transform([text])
print('TF-IDF:')
print(tfidf_vec.get_feature_names_out()[:10])
print(X_tfidf.toarray()[0][:10])


TF-IDF:
['amounts' 'analyze' 'and' 'artificial' 'between' 'computational'
 'computer' 'computers' 'concerned' 'data']
[0.12803688 0.12803688 0.38411064 0.12803688 0.12803688 0.12803688
 0.12803688 0.25607376 0.12803688 0.12803688]


In [6]:
# Word embeddings (using spaCy)
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
embeddings = [token.vector for token in doc]
print('Word embeddings:')
print(len(embeddings))

Word embeddings:
50


In [7]:
# Compare similarities using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# Similarity between "Natural" and "Language"
word1 = "natural"
word2 = "language"
embedding_sim = cosine_similarity(embeddings[tokens.index(word1)].reshape(1, -1),
                                   embeddings[tokens.index(word2)].reshape(1, -1))
tfidf_sim = cosine_similarity(X_tfidf[:, tfidf_vec.vocabulary_[word1]].reshape(1, -1),
                               X_tfidf[:, tfidf_vec.vocabulary_[word2]].reshape(1, -1))
print(f'Similarity between "{word1}" and "{word2}" using word embeddings:', embedding_sim[0][0])
print(f'Similarity between "{word1}" and "{word2}" using TF-IDF:', tfidf_sim[0][0])

Similarity between "natural" and "language" using word embeddings: 0.23813576
Similarity between "natural" and "language" using TF-IDF: 1.0


In [8]:
# Bag of words
bag_of_words = {word: tokens.count(word) for word in set(tokens)}
print('Bag of words:')
print(list(bag_of_words.items())[:10])

Bag of words:
[('a', 1), ('data', 1), ('concerned', 1), ('natural', 2), ('analyze', 1), ('intelligence', 1), ('to', 2), ('interactions', 1), ('Language', 1), ('computers', 2)]


In [9]:
# Bag of n-grams
n = 2
ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
bag_of_ngrams = {ngram: ngrams.count(ngram) for ngram in set(ngrams)}
print('Bag of n-grams:')
print(list(bag_of_ngrams.items())[:10])

Bag of n-grams:
[(('(', 'natural'), 1), (('linguistics', 'concerned'), 1), (('program', 'computers'), 1), (('amounts', 'of'), 1), (('Natural', 'Language'), 1), (('and', 'analyze'), 1), (('with', 'the'), 1), (('.', 'It'), 1), (('between', 'computers'), 1), (('NLP', ')'), 1)]


In [10]:
# HashingVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
hash_vec = HashingVectorizer(n_features=100)
X_hash = hash_vec.fit_transform([text])
print('HashingVectorizer:')
print(X_hash.shape, X_hash.toarray()[0][:10])

HashingVectorizer:
(1, 100) [ 0.          0.         -0.13483997  0.          0.          0.13483997
  0.          0.          0.         -0.13483997]


In [11]:
# Latent Dirichlet Allocation (LDA)
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=10, random_state=42)
X_lda = lda.fit_transform(X_tfidf)
print('LDA:')
print(X_lda)

LDA:
[[0.01600128 0.01600128 0.01600128 0.01600128 0.01600128 0.85598847
  0.01600128 0.01600128 0.01600128 0.01600128]]


In [12]:
# Non-negative Matrix Factorization (NMF)
nmf = NMF(n_components=10, random_state=42)
X_nmf = nmf.fit_transform(X_tfidf)
print('NMF:')
print(X_nmf)

NMF:
[[6.55551346e-01 4.28578416e-01 1.47212861e-02 2.40446075e-16
  1.54113864e-01 1.05308938e-01 5.73261840e-02 1.65705288e-01
  1.43820169e-01 2.48592727e-16]]


In [14]:
# Tokenize the text into sentences and words
sentences = nltk.sent_tokenize(text)
words = [nltk.word_tokenize(sentence) for sentence in sentences]

# Perform POS tagging on the words
pos_tags = [nltk.pos_tag(sentence) for sentence in words]

# Print the POS tags
for sentence in pos_tags:
    print(sentence)

[('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('subfield', 'NN'), ('of', 'IN'), ('computer', 'NN'), ('science', 'NN'), (',', ','), ('artificial', 'JJ'), ('intelligence', 'NN'), (',', ','), ('and', 'CC'), ('computational', 'JJ'), ('linguistics', 'NNS'), ('concerned', 'VBN'), ('with', 'IN'), ('the', 'DT'), ('interactions', 'NNS'), ('between', 'IN'), ('computers', 'NNS'), ('and', 'CC'), ('human', 'JJ'), ('(', '('), ('natural', 'JJ'), (')', ')'), ('languages', 'NNS'), ('.', '.')]
[('It', 'PRP'), ('focuses', 'VBZ'), ('on', 'IN'), ('how', 'WRB'), ('to', 'TO'), ('program', 'NN'), ('computers', 'NNS'), ('to', 'TO'), ('process', 'VB'), ('and', 'CC'), ('analyze', 'VB'), ('large', 'JJ'), ('amounts', 'NNS'), ('of', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('data', 'NNS'), ('.', '.')]
