In [None]:
import re
import spacy

# Load SpaCy model for NLP parsing
nlp = spacy.load("en_core_web_sm")

# Example text
text = """
Despite concerns over job displacement, which were highlighted in a report by Hero Vired revealing that 82 percent of professionals fear job loss due to emerging technologies, the importance of upskilling cannot be overstated. The same report also noted that 78 percent of professionals view upskilling as a necessary response to these evolving conditions.
"""

# Step 1: Extract numerical trends using regex
percentages = re.findall(r'\b\d+(\.\d+)?\s?(percent|%)\b', text)

# Step 2: Extract fine-grained contexts for each percentage
def extract_fine_contexts(percentages, text):
    results = []
    doc = nlp(text)  # Parse text with SpaCy

    for percentage, unit in percentages:
        trend = f"{percentage} {unit}"
        trend_results = {"trend": trend, "contexts": []}

        # Find sentences containing the trend
        for sent in doc.sents:
            if trend in sent.text:
                # Extract key phrases related to the trend
                for chunk in sent.noun_chunks:
                    if trend in chunk.text:
                        continue  # Skip the trend itself
                    trend_results["contexts"].append(chunk.text.strip())
                for token in sent:
                    if token.dep_ in {"ROOT", "acomp", "xcomp"}:
                        trend_results["contexts"].append(token.text.strip())
        results.append(trend_results)

    return results

# Get results
results = extract_fine_contexts(percentages, text)

# Display results
for res in results:
    print(f"Trend: {res['trend']}")
    for i, context in enumerate(res["contexts"], 1):
        print(f"Context{i}: {context}")


Trend:  percent
Context1: concerns
Context2: job displacement
Context3: which
Context4: a report
Context5: Hero Vired
Context6: professionals
Context7: job loss
Context8: emerging technologies
Context9: the importance
Context10: upskilling
Context11: overstated
Context12: The same report
Context13: professionals
Context14: a necessary response
Context15: these evolving conditions
Context16: noted
Context17: upskilling
Trend:  percent
Context1: concerns
Context2: job displacement
Context3: which
Context4: a report
Context5: Hero Vired
Context6: professionals
Context7: job loss
Context8: emerging technologies
Context9: the importance
Context10: upskilling
Context11: overstated
Context12: The same report
Context13: professionals
Context14: a necessary response
Context15: these evolving conditions
Context16: noted
Context17: upskilling


In [None]:
import re
import spacy

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Example text
text = """
Despite concerns over job displacement, which were highlighted in a report by Hero Vired revealing that 82 percent of professionals fear job loss due to emerging technologies, the importance of upskilling cannot be overstated. The same report also noted that 78 percent of professionals view upskilling as a necessary response to these evolving conditions.
"""

# Step 1: Extract numerical trends
percentages = re.findall(r'\b(\d+(\.\d+)?)\s?(percent|%)\b', text)

# Step 2: Extract expanded contexts
def extract_contexts(percentages, text):
    results = []
    doc = nlp(text)

    for percentage_match in percentages:
        percentage = percentage_match[0]
        trend = f"{percentage}%"  # Standardize trend format
        trend_results = {"trend": trend, "contexts": []}

        # Find sentence containing the percentage
        for sent in doc.sents:
            if percentage in sent.text:
                # Extract all possible phrases and keywords
                expanded_contexts = []
                for chunk in sent.noun_chunks:
                    expanded_contexts.append(chunk.text.strip())  # Noun phrases
                for token in sent:
                    if token.dep_ in {"ROOT", "dobj", "amod", "nsubj", "pobj"}:
                        expanded_contexts.append(token.text.strip())  # Key tokens

                # Remove duplicates and include all initially
                trend_results["contexts"] = list(set(expanded_contexts))

        results.append(trend_results)

    return results

# Step 3: Filter and rank contexts by relevance
def filter_contexts(results):
    filtered_results = []

    for result in results:
        trend = result["trend"]
        contexts = result["contexts"]

        # Relevance keywords (adjust based on your domain)
        relevant_keywords = ["job", "upskilling", "professionals", "technologies", "loss", "response"]

        # Rank contexts by overlap with relevant keywords
        ranked_contexts = [
            context
            for context in contexts
            if any(keyword in context.lower() for keyword in relevant_keywords)
        ]

        # Add back to results
        filtered_results.append({"trend": trend, "contexts": ranked_contexts[:5]})  # Limit to top 3 contexts

    return filtered_results

# Extract and filter
raw_results = extract_contexts(percentages, text)
final_results = filter_contexts(raw_results)

# Display results
for res in final_results:
    print(f"Trend: {res['trend']}")
    for i, context in enumerate(res["contexts"], 1):
        print(f"Context{i}: {context}")


Trend: 82%
Context1: job loss
Context2: loss
Context3: professionals
Context4: upskilling
Context5: job displacement
Trend: 78%
Context1: a necessary response
Context2: professionals
Context3: response


In [None]:
import spacy
import re
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

# Load SpaCy and BERT model for tokenization
nlp = spacy.load("en_core_web_sm")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Example text
text = """
Despite concerns over job displacement, which were highlighted in a report by Hero Vired revealing that 82 percent of professionals fear job loss due to emerging technologies, the importance of upskilling cannot be overstated. The same report also noted that 78 percent of professionals view upskilling as a necessary response to these evolving conditions.
"""

# Extract numerical trends
percentages = re.findall(r'\b(\d+(\.\d+)?)\s?(percent|%)\b', text)

# Step 1: Tokenize and encode sentences to extract embeddings
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Step 2: Extract all possible contexts
def extract_contexts(percentages, text):
    results = []
    doc = nlp(text)

    for percentage_match in percentages:
        percentage = percentage_match[0]
        trend = f"{percentage}%"  # Standardize trend format
        trend_results = {"trend": trend, "contexts": []}

        # Find sentence containing the percentage
        for sent in doc.sents:
            if percentage in sent.text:
                # Extract all possible phrases and keywords
                expanded_contexts = []
                for chunk in sent.noun_chunks:
                    expanded_contexts.append(chunk.text.strip())  # Noun phrases
                for token in sent:
                    if token.dep_ in {"ROOT", "dobj", "amod", "nsubj", "pobj"}:
                        expanded_contexts.append(token.text.strip())  # Key tokens

                # Remove duplicates and include all initially
                trend_results["contexts"] = list(set(expanded_contexts))

        results.append(trend_results)

    return results

# Step 3: Rank contexts based on semantic similarity to the trend
def rank_contexts_by_similarity(results):
    filtered_results = []

    for result in results:
        trend = result["trend"]
        trend_embedding = get_embeddings(trend)  # Get embedding for trend

        # Rank contexts based on cosine similarity
        context_embeddings = []
        for context in result["contexts"]:
            context_embedding = get_embeddings(context)
            context_embeddings.append(context_embedding)

        # Compute cosine similarity
        similarities = cosine_similarity([trend_embedding], context_embeddings)[0]

        # Rank contexts by similarity score
        ranked_contexts = [result["contexts"][i] for i in np.argsort(similarities)[::-1]]

        filtered_results.append({"trend": trend, "contexts": ranked_contexts[:3]})  # Top 3 contexts

    return filtered_results

# Extract contexts and rank by relevance
raw_results = extract_contexts(percentages, text)
final_results = rank_contexts_by_similarity(raw_results)

# Display results
for res in final_results:
    print(f"Trend: {res['trend']}")
    for i, context in enumerate(res["contexts"], 1):
        print(f"Context{i}: {context}")


Trend: 82%
Context1: 82 percent
Context2: percent
Context3: loss
Trend: 78%
Context1: 78 percent
Context2: percent
Context3: noted


In [None]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import string

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
csv_file_path = "articles.csv"  # Replace with the path to your file
df = pd.read_csv(csv_file_path, skiprows=3)

In [None]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(f"[{string.punctuation}]", " ", text)
    # Tokenize words
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Apply preprocessing to each article
df["processed_content"] = df["Article Text"].apply(preprocess_text)

In [None]:
df.head()

Unnamed: 0,Date,Title,Author,Publication Date,Article Text,Link,Source URL,Keywords,Job Market Insights,processed_content
0,2024-11-21,Are campuses ready to cater to tech industry's...,Business Standard,2024-10-23 18:14:35.000,Commentary from leading IT services players fo...,https://www.business-standard.com/technology/t...,www.business-standard.com,"[('the', 3), ('hiring', 3), ('and', 3), ('that...",General job trends,"[commentary, leading, services, players, follo..."
1,2024-11-21,Japan’s stock market is producing too many ‘pu...,Leo Lewis,2024-10-23 23:01:56.613,Print this page\nWhen EcoNaviSta listed on Tok...,https://www.ft.com/content/48a66925-64b7-429c-...,www.ft.com,"[('the', 45), ('of', 24), ('and', 22), ('in', ...",General job trends,"[print, page, econavista, listed, tokyo, ’, ne..."
2,2024-11-21,India’s AI rush thrusts Nvidia to centrestage,"Soumyarendra Barik, Soumyarendra Barik Is Spec...",2024-10-25 12:14:33.000,Reliance and chipmaker Nvidia have announced a...,https://indianexpress.com/article/business/ind...,indianexpress.com,"[('the', 34), ('to', 19), ('and', 15), ('in', ...",Tech industry growth,"[reliance, chipmaker, nvidia, announced, joint..."
3,2024-11-21,Nvidia overtakes Apple as world's most valuabl...,Authors,2024-10-26 05:05:19.000,Nvidia dethroned Apple as the world's most val...,https://www.thehindu.com/sci-tech/technology/n...,www.thehindu.com,"[('the', 32), ('a', 18), ('in', 16), ('nvidia'...",Tech industry growth,"[nvidia, dethroned, apple, world, valuable, co..."
4,2024-11-21,"FPIs withdraw ₹85,790 crore from Indian equiti...",Authors,2024-10-27 06:20:23.000,Foreign investors have continued selling in th...,https://www.thehindu.com/business/Economy/fpis...,www.thehindu.com,"[('the', 25), ('in', 16), ('and', 13), ('of', ...",General job trends,"[foreign, investors, continued, selling, india..."


In [None]:
corpus = df["processed_content"].tolist()  # Convert column to a list of tokenized sentences

# Initialize and train the Word2Vec model
model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4, sg=1)  # sg=1 for skip-gram

# Step 4: Save the model
model.save("word2vec_custom.model")
print("Word2Vec model trained and saved as 'word2vec_custom.model'.")

Word2Vec model trained and saved as 'word2vec_custom.model'.


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def check_words_in_vocab(model, words):
    for word in words:
        if word not in model.wv:
            raise ValueError(f"The word '{word}' is not in the Word2Vec vocabulary.")

# Function to compute cosine similarity
def get_cosine_similarity(model, word1, word2):
    vec1 = model.wv[word1].reshape(1, -1)  # Vector for word1
    vec2 = model.wv[word2].reshape(1, -1)  # Vector for word2
    return cosine_similarity(vec1, vec2)[0][0]

# Words to compare
word_pairs = [
    ("job market", "increase"),
    ("job market", "decline"),
    ("ai", "increase"),
    ("ai", "decline")
]

# Check if all words are in the vocabulary
check_words_in_vocab(model, [w for pair in word_pairs for w in pair])

# Calculate and print cosine similarities
for word1, word2 in word_pairs:
    similarity = get_cosine_similarity(model, word1, word2)
    print(f"Cosine similarity between '{word1}' and '{word2}': {similarity:.4f}")

ValueError: The word 'job market' is not in the Word2Vec vocabulary.

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Initialize counters
total_sentences = 0
total_words = 0

# Iterate over all articles in the content column
for article in df['Article Text']:
    # Tokenize into sentences and count
    sentences = sent_tokenize(article)
    total_sentences += len(sentences)

    # Tokenize into words and count
    words = word_tokenize(article)
    total_words += len(words)

# Print the corpus size
print(f"Total number of sentences in the corpus: {total_sentences}")
print(f"Total number of words in the corpus: {total_words}")


Total number of sentences in the corpus: 1957
Total number of words in the corpus: 46122


In [None]:
!pip install pandas openpyxl



In [None]:
import pandas as pd
df = pd.read_excel('Articles.xlsx', engine='openpyxl')

In [None]:
df.head()

Unnamed: 0,Date,Title,Author,Publication Date,Article Text,Link,Source URL
0,20241001,Best MacBooks (2024): Which Model Should You Buy?,"Brenda Stolyar, Julian Chokkattu, Molly Higgin...",2018-02-08 13:00:00,"in 2020, Apple’s MacBooks entered a new era. T...",https://www.wired.com/story/which-macbook-shou...,https://www.wired.com
1,20240401,The Best VPNs to Protect Yourself Online,"Scott Gilbertson, Simon Hill, David Nield, Bre...",2020-03-04 13:00:00,A virtual private network (VPN) is like a prot...,https://www.wired.com/story/best-vpn/,https://www.wired.com
2,20231101,Everything You Need to Work From Home Like a Pro,"Julian Chokkattu, Gear Team, Medea Giordano, S...",2020-04-03 11:00:00,No matter if you're working from home full-tim...,https://www.wired.com/story/work-from-home-hom...,https://www.wired.com
3,20240201,How to Back Up Your Digital Life,"Scott Gilbertson, Simon Hill, David Nield, Jul...",2020-07-25 12:00:00,"To get started, click Add backup, and Duplicat...",https://www.wired.com/story/how-to-back-up-you...,https://www.wired.com
4,20240801,How to Choose the Right Laptop: A Step-by-Step...,"Scott Gilbertson, Simon Hill, Brenda Stolyar, ...",2021-03-26 14:00:00,Buying a laptop is an exercise in confusion. E...,https://www.wired.com/story/how-to-buy-the-rig...,https://www.wired.com


In [None]:
import gensim

In [None]:
df['ProcessedArticleText'] = df['Article Text'].apply(gensim.utils.simple_preprocess)

In [None]:
review_text = df['ProcessedArticleText']

In [None]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=4,
)

In [None]:
model.build_vocab(review_text, progress_per=1000)

In [None]:
model.train(review_text, total_examples=model.corpus_count, epochs=model.epochs)

(1571072, 2014895)

In [None]:
model.wv.most_similar("pascal")

[('south', 0.9899526238441467),
 ('status', 0.9894944429397583),
 ('john', 0.9894832372665405),
 ('sag', 0.9894829392433167),
 ('antitrust', 0.9893879890441895),
 ('vehicles', 0.9891685843467712),
 ('employment', 0.9891316294670105),
 ('association', 0.9889876842498779),
 ('red', 0.9889872670173645),
 ('linkedin', 0.9888119101524353)]

In [None]:
model.wv.similarity(w1="emerging", w2="pascal")

0.94609356

In [None]:
model.wv.similarity(w1="emerging", w2="intelligence")

0.80068666

In [None]:
model.wv.similarity(w1="emerging", w2="machine")

0.8767104

In [None]:
model.wv.similarity(w1="emerging", w2="blockchain")

0.93003863

In [None]:
vocabulary_words = list(model.wv.key_to_index.keys())

# Display the first 10 words for verification
print(vocabulary_words[:10])

# Total number of words in the vocabulary
print(f"Total vocabulary size: {len(vocabulary_words)}")

['the', 'to', 'and', 'of', 'in', 'that', 'it', 'is', 'for', 'ai']
Total vocabulary size: 11382


In [None]:
model_path = 'path_to_your_downloaded_model/GoogleNews-vectors-negative300.bin'  # For Google News model
# Or use another model path if you're using a different one
model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

# Test the model with a similarity query
similarity = model.similarity('king', 'queen')
print(f"Similarity between 'king' and 'queen': {similarity}")

FileNotFoundError: [Errno 2] No such file or directory: 'path_to_your_downloaded_model/GoogleNews-vectors-negative300.bin'