lexical semantics

In [None]:
import re
from textblob import Word
import nltk # import nltk

# Download the 'FRAMENET' dataset using nltk.download()
nltk.download('FRAMENET')  # Download wordnet

with open('word.txt', 'r') as file:
    text = file.read()

# Use regex to split the text into words, ignoring punctuation and converting to lowercase
words = re.findall(r'\b\w+\b', text.lower())

# Process each word for synonyms and antonyms
for word_str in set(words):  # Use set to avoid duplicate words or to use unique
    word = Word(word_str)

    synonyms = word.synsets

    antonyms = []
    for synset in synonyms:
        for lemma in synset.lemmas():
            if lemma.antonyms():
                antonyms.append(lemma.antonyms()[0].name())

    # Output the results
    print(f"Word: '{word_str}'")
    print(f"  Synonyms: {[lemma.name() for synset in synonyms for lemma in synset.lemmas()]}")
    print(f"  Antonyms: {antonyms}")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Word: 'upon'
  Synonyms: []
  Antonyms: []
Word: 'managing'
  Synonyms: ['pull_off', 'negociate', 'bring_off', 'carry_off', 'manage', 'manage', 'deal', 'care', 'handle', 'cope', 'get_by', 'make_out', 'make_do', 'contend', 'grapple', 'deal', 'manage', 'oversee', 'supervise', 'superintend', 'manage', 'wangle', 'finagle', 'manage', 'do', 'manage', 'wield', 'handle', 'manage']
  Antonyms: ['fail']
Word: 'derive'
  Synonyms: ['deduce', 'infer', 'deduct', 'derive', 'derive', 'gain', 'derive', 'derive', 'educe', 'derive', 'come', 'descend']
  Antonyms: []
Word: 'language'
  Synonyms: ['language', 'linguistic_communication', 'speech', 'speech_communication', 'spoken_communication', 'spoken_language', 'language', 'voice_communication', 'oral_communication', 'lyric', 'words', 'language', 'linguistic_process', 'language', 'language', 'speech', 'terminology', 'nomenclature', 'language']
  Antonyms: []
Word: 'person'
  Synonyms: ['person', 'individual', 'someone', 'somebody', 'mortal', 'soul', 'per

PMI

In [None]:
import requests
from bs4 import BeautifulSoup
import numpy as np
from collections import Counter

# Function to scrape text from a webpage
def scrape_text_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract all the text content from the page
    paragraphs = soup.find_all('p')
    text = ' '.join([para.get_text() for para in paragraphs])

    return text

# Example URL to scrape
url = 'https://en.wikipedia.org/wiki/Natural_language_processing'
scraped_text = scrape_text_from_url(url)

# Preprocess scraped text
words = scraped_text.split()  # Splitting text into words
word_freq = Counter(words)    # Count word frequencies
total_words = len(words)      # Total number of words in the corpus
print(f"Total words: {total_words}")

# Function to calculate PMI
def pmi(word1, word2):
    # Calculate joint frequency (how many sentences contain both words)
    sentences = scraped_text.split('.')
    joint_freq = sum(1 for sentence in sentences if word1 in sentence and word2 in sentence)

    # Calculate probabilities
    p_word1 = word_freq[word1] / total_words
    p_word2 = word_freq[word2] / total_words
    p_joint = joint_freq / len(sentences)

    # Calculate PMI
    return np.log2(p_joint / (p_word1 * p_word2)) if p_joint > 0 else None

# Example: Calculate PMI for 'language' and 'processing'
word1, word2 = 'language', 'processing'
result = pmi(word1, word2)
print(f"PMI({word1}, {word2}) = {result}")

Total words: 1086
PMI(language, processing) = 10.656283951868597


In [None]:
import http.client
import re
import numpy as np
from collections import Counter

# Function to scrape text from a webpage
def scrape_text_from_url(url):
    # Parse the URL
    protocol, _, domain, path = url.split('/', 3)
    conn = http.client.HTTPSConnection(domain)  # Use HTTPConnection for http URLs
    conn.request("GET", "/" + path)
    response = conn.getresponse()

    # Read and decode the response
    html = response.read().decode('utf-8')

    # Use regex to extract text from <p> tags
    paragraphs = re.findall(r'<p>(.*?)</p>', html, re.DOTALL)
    text = ' '.join([re.sub(r'<.*?>', '', para) for para in paragraphs])  # Remove HTML tags

    return text

# Example URL to scrape
url = 'https://en.wikipedia.org/wiki/Natural_language_processing'
scraped_text = scrape_text_from_url(url)

# Preprocess scraped text
words = re.findall(r'\b\w+\b', scraped_text.lower())  # Extract words and convert to lowercase
word_freq = Counter(words)                            # Count word frequencies
total_words = len(words)                              # Total number of words in the corpus
print(f"Total words: {total_words}")

# Function to calculate PMI
def pmi(word1, word2):
    # Calculate joint frequency (how many sentences contain both words)
    sentences = re.split(r'[.!?]', scraped_text)  # Split text into sentences
    joint_freq = sum(1 for sentence in sentences if word1 in sentence and word2 in sentence)

    # Calculate probabilities
    p_word1 = word_freq[word1] / total_words
    p_word2 = word_freq[word2] / total_words
    p_joint = joint_freq / len(sentences)

    # Calculate PMI
    return np.log2(p_joint / (p_word1 * p_word2)) if p_joint > 0 else None

# List of word pairs to calculate PMI for
word_pairs = [('language', 'processing'), ('natural', 'language'), ('cat', 'dog'), ('model', 'training')]

# Calculate PMI for each word pair
for word1, word2 in word_pairs:
    result = pmi(word1, word2)
    if result is not None:
        print(f"PMI({word1}, {word2}) = {result}")
    else:
        print(f"PMI({word1}, {word2}) could not be calculated.")


Total words: 1221
PMI(language, processing) = 10.052778832460003
PMI(natural, language) = 9.775244856931094
PMI(cat, dog) could not be calculated.
PMI(model, training) could not be calculated.


VECTOR SEMANTICS

In [None]:
import spacy

# Load the small English model
nlp = spacy.load('en_core_web_sm')

# Small dataset (manually entered sentences)
small_sentences = [
    'the cat sat on the mat',
    'dogs are better than cats',
    'the dog sat on the rug'
]

# Process sentences and get vectors for each word
for sentence in small_sentences:
    doc = nlp(sentence)
    print(f"\nSentence: {sentence}")
    for token in doc:
        print(f"Word: {token.text}, Vector: {token.vector[:5]}")  # Truncated vector output


Sentence: the cat sat on the mat
Word: the, Vector: [ 1.2190363  -0.47780147 -0.02257838  1.8145945  -1.2888143 ]
Word: cat, Vector: [-0.49857017 -0.46915686 -0.7442478   0.39140272  0.29465148]
Word: sat, Vector: [-0.04254475  0.9221249   0.60993016 -0.87227887 -0.29893935]
Word: on, Vector: [ 0.31212795  0.42844406  0.36984378 -0.95197976  0.3493223 ]
Word: the, Vector: [ 1.0240833  -0.03962332 -1.1507094   1.0001687   0.58538896]
Word: mat, Vector: [ 0.7723417 -1.6352618  0.1364181 -1.206529  -0.4052785]

Sentence: dogs are better than cats
Word: dogs, Vector: [-0.41797087  0.99235207  0.5725043   0.77076995  0.71824205]
Word: are, Vector: [ 0.09023796  0.62270015 -0.709143   -0.2652374  -0.28411064]
Word: better, Vector: [ 0.9536653   0.04835802  0.2381073  -1.2686076  -1.4762766 ]
Word: than, Vector: [ 0.36181647 -0.3444981  -0.41358137 -0.7631421  -0.24953336]
Word: cats, Vector: [-1.1452069   0.29329747 -1.1624678   1.1739498   0.6667417 ]

Sentence: the dog sat on the rug
Word

In [None]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import requests
from bs4 import BeautifulSoup
import spacy

# Load the medium English model
nlp = spacy.load('en_core_web_md')

# Web scraping to collect text from a website (e.g., Wikipedia)
def scrape_webpage(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extracting paragraphs from the webpage
    paragraphs = soup.find_all('p')
    text = " ".join([para.get_text() for para in paragraphs])
    return text

# Scraping data from multiple Wikipedia pages (big dataset)
urls = [
    'https://en.wikipedia.org/wiki/Machine_learning',
    'https://en.wikipedia.org/wiki/Artificial_intelligence',
    'https://en.wikipedia.org/wiki/Natural_language_processing'
]

# Collecting text from the pages
large_text_data = [scrape_webpage(url) for url in urls]

# Process scraped text using spaCy
for text in large_text_data:
    doc = nlp(text[:500])  # Processing first 500 characters (for example)
    print(f"\nSample text: {text[:100]}...")  # Print the first 100 characters
    for token in doc:
        print(f"Word: {token.text}, Vector: {token.vector[:5]}")  # Truncated vector


Sample text: Machine learning (ML) is a field of study in artificial intelligence concerned with the development ...
Word: Machine, Vector: [-2.3291  -0.47575  4.2706   6.4718   3.7222 ]
Word: learning, Vector: [ 0.52415 -0.11843  0.99918 -0.92592  0.58506]
Word: (, Vector: [ -7.8392   -10.463     11.759      1.8131     0.046077]
Word: ML, Vector: [ 0.61869 12.587   16.028    4.7017  -3.5819 ]
Word: ), Vector: [-7.9729 -8.4028 11.931   1.4241 -6.2634]
Word: is, Vector: [ 1.475   6.0078  1.1205 -3.5874  3.7638]
Word: a, Vector: [-9.3629  9.2761 -7.2708  4.3879 10.316 ]
Word: field, Vector: [-0.96889  5.7225  -0.24339 -1.2014   5.5839 ]
Word: of, Vector: [-12.667    -6.568    -0.61537   4.9492   22.389  ]
Word: study, Vector: [-0.64776 -1.3091   2.0104  -1.2484  -2.4662 ]
Word: in, Vector: [-3.7766   0.69426 -3.3805   2.705    8.6019 ]
Word: artificial, Vector: [-1.5015  -0.71667 -0.62862  0.15832  2.285  ]
Word: intelligence, Vector: [-0.41817  0.3404  -1.1554   1.9992   3.1846 ]
Word: