In [1]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

text = "The contract between Birla Corporation and Hindustan Inc. was signed on March 10, 2020."
# Tokenize the text into words
tokens = nltk.word_tokenize(text)
stop_words = set(stopwords.words('english'))
tokens_filtered = [w for w in tokens if not w.lower() in stop_words]
length = len(tokens_filtered)
for i in range (1, length):
	if i in (',', '.', '_'):
		tokens_filtered.remove(tokens_filtered[i])
    

# Apply part-of-speech tagging to the tokens
tagged = nltk.pos_tag(tokens_filtered)

# Apply named entity recognition to the tagged words
entities = nltk.chunk.ne_chunk(tagged)

# Print the entities found in the text
for entity in entities:
	print(entity)

('contract', 'NN')
(PERSON Birla/NNP Corporation/NNP Hindustan/NNP)
('Inc.', 'NNP')
('signed', 'VBD')
('March', 'NNP')
('10', 'CD')
(',', ',')
('2020', 'CD')
('.', '.')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tr4\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
import nltk
from nltk.corpus import stopwords
from string import punctuation

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Sample text
text = "The contract between Birla Corporation and Hindustan Inc. was signed on March 10, 2020."

# Tokenize the text into words
tokens = nltk.word_tokenize(text)

# Remove stopwords and punctuation
stop_words = set(stopwords.words('english'))
tokens_filtered = [w for w in tokens if not w.lower() in stop_words and w not in punctuation]

# Apply part-of-speech tagging to the filtered tokens
tagged = nltk.pos_tag(tokens_filtered)

# Apply named entity recognition to the tagged words
entities = nltk.chunk.ne_chunk(tagged)

# Print the named entities found in the text
for entity in entities:
    if hasattr(entity, 'label'):
        print(f"Entity: {' '.join(c[0] for c in entity.leaves())}, Type: {entity.label()}")



Entity: Birla Corporation Hindustan, Type: PERSON


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tr4\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
import nltk
from nltk.tokenize import word_tokenize

# Example training data (replace with your domain-specific data)
training_data = [
   ( "The contract between Birla Corporation and Hindustan Inc. was signed on March 10, 2020.",{"entities": [( 21, 38,"CORPORATION"), (43, 57, "CORPORATION"), (72, 86, "DATE")]}),
  ("The court in New York ruled in favor of Plaintiff in the case Doe v. Smith.",{"entities": [(13,21, "LOCATION"),(40, 49,"PERSON"),(62, 74, "PERSON")]}),
  ("The shareholders agreed to invest $5 million in Willhome Ventures.",{"entities": [(34, 44, "CURRENCY"), (48, 65,  "CORPORATION")]}),
  ("The contract stated that payment must be made within 30 days of signing.",{"entities": [(53, 60, "TIMEFRAME")]})
]

# Function to extract features from text
def features(tokens, index):
    return {
        'word': tokens[index],
        'is_first': index == 0,
        'is_last': index == len(tokens) - 1,
        'is_capitalized': tokens[index][0].upper() == tokens[index][0],
        'is_all_caps': tokens[index].upper() == tokens[index],
        'is_all_lower': tokens[index].lower() == tokens[index],
        'prefix-1': tokens[index][0],
        'prefix-2': tokens[index][:2],
        'prefix-3': tokens[index][:3],
        'suffix-1': tokens[index][-1],
        'suffix-2': tokens[index][-2:],
        'suffix-3': tokens[index][-3:],
        'prev_word': '' if index == 0 else tokens[index - 1],
        'next_word': '' if index == len(tokens) - 1 else tokens[index + 1],
        'has_hyphen': '-' in tokens[index],
        'is_numeric': tokens[index].isdigit(),
        'capitals_inside': tokens[index][1:].lower() != tokens[index][1:]
    }

# Function to transform data into features and labels
def transform_to_dataset(training_data):
    X, y = [], []
    for sentence, entities in training_data:
        tokens = word_tokenize(sentence)
        entities_list = entities.get('entities', [])
        for index in range(len(tokens)):
            token_features = features(tokens, index)
            X.append((token_features, tokens[index]))
            entity_label = 'O'  # Default label if no entity
            for start, end, label in entities_list:
                if index >= start and index < end:
                    entity_label = label
                    break
            y.append(entity_label)
    return X, y

# Convert training data to NLTK's format
X, y = transform_to_dataset(training_data)

# Train a classifier (e.g., MaxEnt classifier)
classifier = nltk.MaxentClassifier.train(X)

# Example of predicting entities
sentence = "The Birla Corporation is legally entitled"
tokens = word_tokenize(sentence)
predicted_entities = [classifier.classify(features(tokens, index)) for index in range(len(tokens))]
print(predicted_entities)


  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -3.87120        0.017
             2          -1.68949        1.000
             3          -1.03845        1.000
             4          -0.71560        1.000
             5          -0.53704        1.000
             6          -0.42682        1.000
             7          -0.35291        1.000
             8          -0.30023        1.000
             9          -0.26093        1.000
            10          -0.23055        1.000
            11          -0.20640        1.000
            12          -0.18677        1.000
            13          -0.17050        1.000
            14          -0.15681        1.000
            15          -0.14514        1.000
            16          -0.13507        1.000
            17          -0.12629        1.000
            18          -0.11858        1.000
            19          -0.11175        1.000
 

In [6]:
import nltk
from nltk.tokenize import word_tokenize

# Example training data (replace with your domain-specific data)
training_data = [
    ("John works at Apple Inc. in California.", {"entities": [(0, 4, "PERSON"), (14, 23, "ORG"), (27, 37, "GPE")]}),
    ("Jane is studying at Stanford University in California.", {"entities": [(0, 4, "PERSON"), (20, 38, "ORG"), (42, 52, "GPE")]}),
    # Add more training examples as needed
]

# Function to extract features from text
def features(tokens, index):
    return {
        'word': tokens[index],
        'is_first': index == 0,
        'is_last': index == len(tokens) - 1,
        'is_capitalized': tokens[index][0].upper() == tokens[index][0],
        'is_all_caps': tokens[index].upper() == tokens[index],
        'is_all_lower': tokens[index].lower() == tokens[index],
        'prefix-1': tokens[index][0],
        'prefix-2': tokens[index][:2],
        'prefix-3': tokens[index][:3],
        'suffix-1': tokens[index][-1],
        'suffix-2': tokens[index][-2:],
        'suffix-3': tokens[index][-3:],
        'prev_word': '' if index == 0 else tokens[index - 1],
        'next_word': '' if index == len(tokens) - 1 else tokens[index + 1],
        'has_hyphen': '-' in tokens[index],
        'is_numeric': tokens[index].isdigit(),
        'capitals_inside': tokens[index][1:].lower() != tokens[index][1:]
    }

# Function to transform data into features and labels
def transform_to_dataset(training_data):
    X, y = [], []
    for sentence, entities in training_data:
        tokens = word_tokenize(sentence)
        entities_list = entities.get('entities', [])
        for index in range(len(tokens)):
            token_features = features(tokens, index)
            X.append((token_features, tokens[index]))
            entity_label = 'O'  # Default label if no entity
            for start, end, label in entities_list:
                if index >= start and index < end:
                    entity_label = label
                    break
            y.append(entity_label)
    return X, y

# Convert training data to NLTK's format
X, y = transform_to_dataset(training_data)

# Train a classifier (e.g., MaxEnt classifier)
classifier = nltk.MaxentClassifier.train(X, max_iter=10)

# Example of predicting entities
sentence = "John works at Apple Inc. in California."
tokens = word_tokenize(sentence)
predicted_entities = [classifier.classify(features(tokens, index)) for index in range(len(tokens))]
print(predicted_entities)


  ==> Training (10 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -2.56495        0.059
             2          -1.26425        1.000
             3          -0.81614        1.000
             4          -0.58791        1.000
             5          -0.45516        1.000
             6          -0.36970        1.000
             7          -0.31053        1.000
             8          -0.26732        1.000
             9          -0.23446        1.000
         Final          -0.20868        1.000
['John', 'works', 'at', 'Apple', 'Inc.', 'in', 'California', '.']
