# Baseline Model : Logistic Regression

In [82]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib
from sklearn.model_selection import GridSearchCV

In [83]:
data = pd.read_csv('ner_dataset.csv.gz', encoding='latin1')

# preprocessing
data['Sentence #'] = data['Sentence #'].fillna(method='ffill')


sentences = data.groupby('Sentence #').apply(lambda s: [(w, t) for w, t in zip(s['Word'].tolist(), s['Tag'].tolist())])
sentences = [s for s in sentences]

# Train/Test and Validation set
train_sentences, test_sentences = train_test_split(sentences, test_size=0.2, random_state=42)
train_sentences, val_sentences = train_test_split(train_sentences, test_size=0.1, random_state=42)


  data['Sentence #'] = data['Sentence #'].fillna(method='ffill')
  sentences = data.groupby('Sentence #').apply(lambda s: [(w, t) for w, t in zip(s['Word'].tolist(), s['Tag'].tolist())])


In [85]:
# feature creation
def word2features(sent, i):
    word = sent[i][0]
    features = {
        'word.lower()': str(word).lower(),
        'word[-3:]': str(word)[-3:],
        'word[-2:]': str(word)[-2:],
        'word.isupper()': str(word).isupper(),
        'word.istitle()': str(word).istitle(),
        'word.isdigit()': str(word).isdigit(),
        'BOS': i == 0,
        'EOS': i == len(sent) - 1
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': str(word1).lower(),
            '-1:word.istitle()': str(word1).istitle(),
            '-1:word.isupper()': str(word1).isupper(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': str(word1).lower(),
            '+1:word.istitle()': str(word1).istitle(),
            '+1:word.isupper()': str(word1).isupper(),
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for word, label in sent]

def sent2tokens(sent):
    return [word for word, label in sent]

X_train = [sent2features(s) for s in train_sentences]
y_train = [sent2labels(s) for s in train_sentences]

X_val = [sent2features(s) for s in val_sentences]
y_val = [sent2labels(s) for s in val_sentences]

X_test = [sent2features(s) for s in test_sentences]
y_test = [sent2labels(s) for s in test_sentences]


In [86]:
vec = DictVectorizer(sparse=True)

X_train_flat = [item for sublist in X_train for item in sublist]
y_train_flat = [item for sublist in y_train for item in sublist]

X_val_flat = [item for sublist in X_val for item in sublist]
y_val_flat = [item for sublist in y_val for item in sublist]

X_test_flat = [item for sublist in X_test for item in sublist]
y_test_flat = [item for sublist in y_test for item in sublist]

X_train_vect = vec.fit_transform(X_train_flat)
X_val_vect = vec.transform(X_val_flat)
X_test_vect = vec.transform(X_test_flat)


In [87]:
# Define parameter grid for hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10, 100]
}

logreg = LogisticRegression(max_iter=100)

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(logreg, param_grid, cv=3, verbose=2, n_jobs=-1)
grid_search.fit(X_train_vect, y_train_flat)

# Print the best parameters found
print(f"Best parameters found: {grid_search.best_params_}")

# Train the model with the best hyperparameters
best_logreg = grid_search.best_estimator_
best_logreg.fit(X_train_vect, y_train_flat)

# Save the model
joblib.dump(best_logreg, 'logreg_model.joblib')


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END ..............................................C=0.1; total time=  48.8s
[CV] END ..............................................C=0.1; total time=  51.1s
[CV] END ..............................................C=0.1; total time=  51.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ................................................C=1; total time= 1.2min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ................................................C=1; total time= 1.1min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...............................................C=10; total time= 1.1min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ................................................C=1; total time= 1.1min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...............................................C=10; total time= 1.2min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...............................................C=10; total time= 1.1min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ..............................................C=100; total time= 1.1min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ..............................................C=100; total time= 1.1min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ..............................................C=100; total time= 1.0min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best parameters found: {'C': 1}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['logreg_model.joblib']

In [88]:
y_pred = best_logreg.predict(X_test_vect)

# Evaluate the model
print(classification_report(y_test_flat, y_pred))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

       B-art       0.30      0.03      0.06        94
       B-eve       0.62      0.26      0.36        70
       B-geo       0.85      0.91      0.88      7558
       B-gpe       0.95      0.94      0.94      3142
       B-nat       0.55      0.28      0.37        40
       B-org       0.79      0.67      0.73      4151
       B-per       0.82      0.81      0.81      3400
       B-tim       0.92      0.86      0.89      4077
       I-art       0.00      0.00      0.00        84
       I-eve       0.64      0.22      0.32        65
       I-geo       0.81      0.72      0.76      1462
       I-gpe       0.94      0.52      0.67        33
       I-nat       1.00      0.15      0.27        13
       I-org       0.78      0.73      0.75      3394
       I-per       0.83      0.87      0.85      3406
       I-tim       0.85      0.69      0.76      1251
           O       0.99      0.99      0.99    177590

    accuracy              

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Pass a sample sentence the get the predicted NER tag for each word

In [93]:
def preprocess_input_string(input_string, vec):
    """
    Preprocess the input string and convert it to the format required by the model.
    """
    # Tokenize the input string
    tokens = input_string.split()
    
    # Extract features for each token
    features = [word2features([(token, '')], 0) for token in tokens]
    
    # Convert features to vector
    features_vector = vec.transform(features)
    
    return tokens, features_vector

# Example input string
input_string = 'Turkey has sent more troops and tanks to the Iraqi border , as speculation grows about a possible Turkish incursion against Kurdish rebels in northern Iraq .'

# Preprocess the input string
tokens, input_features = preprocess_input_string(input_string, vec)

# Predict NER tags
predicted_tags = best_logreg.predict(input_features)

# Display the results
for token, tag in zip(tokens, predicted_tags):
    print(f"{token}: {tag}")


Turkey: B-geo
has: O
sent: O
more: O
troops: O
and: O
tanks: O
to: O
the: O
Iraqi: B-gpe
border: O
,: O
as: O
speculation: O
grows: O
about: O
a: O
possible: O
Turkish: B-gpe
incursion: O
against: O
Kurdish: O
rebels: O
in: O
northern: O
Iraq: B-geo
.: O


In [91]:
# ' '.join([key for value in val_sentences[300:301]for key, val in value])


'Turkey has sent more troops and tanks to the Iraqi border , as speculation grows about a possible Turkish incursion against Kurdish rebels in northern Iraq .'

In [92]:
val_sentences[300:301]

[[('Turkey', 'B-org'),
  ('has', 'O'),
  ('sent', 'O'),
  ('more', 'O'),
  ('troops', 'O'),
  ('and', 'O'),
  ('tanks', 'O'),
  ('to', 'O'),
  ('the', 'O'),
  ('Iraqi', 'B-gpe'),
  ('border', 'O'),
  (',', 'O'),
  ('as', 'O'),
  ('speculation', 'O'),
  ('grows', 'O'),
  ('about', 'O'),
  ('a', 'O'),
  ('possible', 'O'),
  ('Turkish', 'B-gpe'),
  ('incursion', 'O'),
  ('against', 'O'),
  ('Kurdish', 'O'),
  ('rebels', 'O'),
  ('in', 'O'),
  ('northern', 'O'),
  ('Iraq', 'B-geo'),
  ('.', 'O')]]

# BERT Model

In [2]:
# install required Libraries 
! pip install transformers datasets torch



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForTokenClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data = pd.read_csv('ner_dataset.csv.gz', encoding='latin1').fillna(method='ffill')

# preprocessing
tags_vals = list(set(data["Tag"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}

# Call BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

  data = pd.read_csv('ner_dataset.csv.gz', encoding='latin1').fillna(method='ffill')


In [4]:
# Process the dataset
def process_data(data):
    sentences = []
    labels = []
    words = []
    tags = []
    for index, row in data.iterrows():
        if row['Word'] == '.':
            sentences.append(words)
            labels.append(tags)
            words = []
            tags = []
        else:
            words.append(row['Word'])
            tags.append(row['Tag'])
    return sentences, labels

sentences, labels = process_data(data)

In [5]:
tags_encoded = [[tag2idx[tag] for tag in sent] for sent in labels]

# Pad sequences
from keras.preprocessing.sequence import pad_sequences
tags_padded = pad_sequences(tags_encoded, maxlen=MAX_LEN, dtype="long", padding="post", truncating="post")

# Split data into train and test sets
train_inputs, test_inputs, train_tags, test_tags = train_test_split(sentences, tags_padded, random_state=42, test_size=0.1)


2024-06-02 12:43:32.795242: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-02 12:43:32.798286: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-02 12:43:32.834905: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
train_input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in train_inputs],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
test_input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in test_inputs],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")


In [7]:
train_attention_masks = [[float(i != 0) for i in ii] for ii in train_input_ids]
test_attention_masks = [[float(i != 0) for i in ii] for ii in test_input_ids]


In [8]:
# Convert data to tensors
train_inputs = torch.tensor(train_input_ids)
train_tags = torch.tensor(train_tags)
train_masks = torch.tensor(train_attention_masks)

test_inputs = torch.tensor(test_input_ids)
test_tags = torch.tensor(test_tags)
test_masks = torch.tensor(test_attention_masks)

# Create DataLoader for train set
train_data = TensorDataset(train_inputs, train_masks, train_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

In [9]:
# Create DataLoader for test set
test_data = TensorDataset(test_inputs, test_masks, test_tags)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)


In [10]:
# Load pre-trained BERT model
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(tag2idx))

# Define optimizer
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)

# Maximum sequence length
MAX_LEN = 128

# Define batch size
bs = 16

# Define epochs
epochs = 1 # kept it lower so that it can run fast with min resource requirement (Caused Kernal to crash) 
import time

# Train the model
for epoch in range(epochs):
    model.train()
    total_loss = 0
    start_time = time.time()
    for step, batch in enumerate(train_dataloader):
        model.zero_grad()
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "labels": batch[2]
        }
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        if step % 100 == 0 and step != 0:
            print(f"Epoch {epoch + 1}, Batch {step}, Loss: {total_loss / step:.4f}, Time: {time.time() - start_time:.2f}s")
        if time.time() - start_time > 600:  # Stop training if exceeding 10 minutes
            print("Training stopped due to time limit.")
            break
    print(f"Epoch {epoch + 1} completed. Total training time: {time.time() - start_time:.2f}s")




Training stopped due to time limit.
Epoch 1 completed. Total training time: 607.38s


In [14]:
# Evaluate the model
model.eval()


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [15]:
predictions, true_labels = [], []
for batch in test_dataloader:
    inputs = {
        "input_ids": batch[0],
        "attention_mask": batch[1]
    }
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    logits = torch.argmax(logits, dim=2)
    logits = logits.detach().cpu().numpy()
    label_ids = batch[2].cpu().numpy()
    predictions.extend([list(p) for p in logits])
    true_labels.extend(label_ids)

In [4]:
# Flatten predictions and true labels
pred_tags = [tags_vals[p_i] for p, l in zip(predictions, true_labels)
                             for p_i, l_i in zip(p, l) if tags_vals[l_i] != "PAD"]
valid_tags = [tags_vals[l_i] for l in true_labels
                              for l_i in l if tags_vals[l_i] != "PAD"]

# Generate classification report
print(classification_report(valid_tags, pred_tags))

In [3]:
# Sample string
sample_string = "Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country."

# Tokenize the sample string
tokenized_input = tokenizer.encode(sample_string, add_special_tokens=True, return_tensors="pt")

# Pass the tokenized input through the model
with torch.no_grad():
    output = model(tokenized_input)

# Get the predicted tags
predicted_tags = torch.argmax(output.logits, dim=2)[0].tolist()

# Decode the predicted tags
predicted_tags_decoded = [tags_vals[tag] for tag in predicted_tags]

# Print the sample string and predicted tags
print("Sample String:", sample_string)
print("Predicted Tags:", predicted_tags_decoded)
