In [9]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer, util

# 1. Sample JSON Data
# Each record is a dict with an integer "id" and a free-text "text" field.
data = [
    {
        "id": 1,
        "text": "I love data science and machine learning.",
    },
    {
        "id": 2,
        "text": "Natural language processing is amazing.",
    },
    {
        "id": 3,
        "text": "Machine learning is challenging but fun.",
    },
]

# Extract documents: keep only the text of each record, in record order.
documents = [record["text"] for record in data]

# --- 2. TF-IDF Vectorization ---
# Fit the vectorizer (English stop words removed) on the documents and
# transform them in the same call.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Print the header and the array form of the matrix, one per line.
dense_tfidf = tfidf_matrix.toarray()
print("\nTF-IDF Matrix:", dense_tfidf, sep="\n")

# --- 3. LDA Topic Modeling ---
# Build a term-count matrix (English stop words removed) and fit a
# two-topic LDA model on it. random_state is fixed so reruns give the
# same topics.
count_vectorizer = CountVectorizer(stop_words='english')
count_matrix = count_vectorizer.fit_transform(documents)

lda = LatentDirichletAllocation(n_components=2, random_state=42)
lda.fit(count_matrix)

# Fetch the vocabulary once, instead of once per printed word.
feature_names = count_vectorizer.get_feature_names_out()

print("\nLDA Topics:")
for i, topic in enumerate(lda.components_):
    # argsort() is ascending, so the last five indices are the five
    # highest-weighted terms, listed from lowest to highest weight.
    words = [feature_names[j] for j in topic.argsort()[-5:]]
    print(f"Topic {i+1}: {', '.join(words)}")

# --- 4. Word Embeddings (BERT via Sentence Transformers) ---
# Load the pretrained sentence-transformer model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode each single-word string on its own, requesting tensors back.
data_embedding, learning_embedding = (
    model.encode(word, convert_to_tensor=True) for word in ("data", "learning")
)

# Cosine similarity of the two embeddings; .item() extracts the single
# value as a plain Python number so it can be formatted.
similarity_tensor = util.pytorch_cos_sim(data_embedding, learning_embedding)
similarity = similarity_tensor.item()
print(f"\nBERT similarity between 'data' and 'learning': {similarity:.4f}")

# --- 5. Sentiment Analysis (TextBlob) ---
# Print the TextBlob polarity of every document, rounded to two decimals.
print("\nSentiment Analysis (TextBlob):")
for text in documents:
    polarity = TextBlob(text).sentiment.polarity
    print(f"'{text}' → Sentiment polarity: {polarity:.2f}")

# --- 6. Sentiment Analysis (VADER) ---
# One analyzer instance is reused for all documents; the full score dict
# returned by polarity_scores() is printed for each.
analyzer = SentimentIntensityAnalyzer()
print("\nSentiment Analysis (VADER):")
for text in documents:
    print(f"'{text}' → Sentiment Scores: {analyzer.polarity_scores(text)}")



TF-IDF Matrix:
[[0.         0.         0.49047908 0.         0.         0.37302199
  0.49047908 0.37302199 0.         0.         0.49047908]
 [0.5        0.         0.         0.         0.5        0.
  0.         0.         0.5        0.5        0.        ]
 [0.         0.5628291  0.         0.5628291  0.         0.42804604
  0.         0.42804604 0.         0.         0.        ]]

LDA Topics:
Topic 1: love, data, science, machine, learning
Topic 2: challenging, language, natural, amazing, processing


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


BERT similarity between 'data' and 'learning': 0.4596

Sentiment Analysis (TextBlob):
'I love data science and machine learning.' → Sentiment polarity: 0.50
'Natural language processing is amazing.' → Sentiment polarity: 0.35
'Machine learning is challenging but fun.' → Sentiment polarity: 0.40

Sentiment Analysis (VADER):
'I love data science and machine learning.' → Sentiment Scores: {'neg': 0.0, 'neu': 0.588, 'pos': 0.412, 'compound': 0.6369}
'Natural language processing is amazing.' → Sentiment Scores: {'neg': 0.0, 'neu': 0.323, 'pos': 0.677, 'compound': 0.743}
'Machine learning is challenging but fun.' → Sentiment Scores: {'neg': 0.0, 'neu': 0.41, 'pos': 0.59, 'compound': 0.6956}
