#Loading the dataset

Please run the cells one by one instead of all at once

In [1]:
import pandas as pd

train_dataset_github_link = "https://raw.githubusercontent.com/SadiulArefin/SemEval-2023-English-Named-Entity-Recognition/main/en-train.csv"

data = pd.read_csv(train_dataset_github_link)

In [2]:
data.head(30)

Unnamed: 0,Sentence#,Word,Word.1,Tag,Unnamed: 4
0,,,,,
1,-1.0,# id 309f5b26-951e-472b-948e-47632249862b,#,id,309f5b26-951e-472b-948e-47632249862b
2,0.0,robert _ _ B-OtherPER,robert,B-OtherPER,
3,0.0,gottschalk _ _ I-OtherPER,gottschalk,I-OtherPER,
4,0.0,1939 _ _ O,1939,O,
5,0.0,academy _ _ B-VisualWork,academy,B-VisualWork,
6,0.0,award _ _ I-VisualWork,award,I-VisualWork,
7,0.0,winner _ _ O,winner,O,
8,0.0,and _ _ O,and,O,
9,0.0,founder _ _ O,founder,O,


#Data Preprocessing

In [3]:
data.drop('Word', inplace=True, axis=1)

In [4]:
data.drop('Unnamed: 4', inplace=True, axis=1)

In [5]:
data = data.dropna(subset=['Tag'])

In [6]:
data = data[data['Tag'] != 'id']

In [7]:
data['Sentence#'] = data['Sentence#'].astype(int)
data.head(30)

Unnamed: 0,Sentence#,Word.1,Tag
2,0,robert,B-OtherPER
3,0,gottschalk,I-OtherPER
4,0,1939,O
5,0,academy,B-VisualWork
6,0,award,I-VisualWork
7,0,winner,O
8,0,and,O
9,0,founder,O
10,0,of,O
11,0,panavision,B-ORG


#Loading the libraries


In [8]:
!pip install simpletransformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpletransformers
  Downloading simpletransformers-0.63.11-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.7/250.7 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
Collecting transformers>=4.6.0 (from simpletransformers)
  Downloading transformers-4.29.0-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m117.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets (from simpletransformers)
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  P

In [9]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [10]:
from simpletransformers.ner import NERModel,NERArgs

In [11]:
data["Sentence#"] = LabelEncoder().fit_transform(data["Sentence#"])

In [12]:
data.rename(columns={"Sentence#":"sentence_id","Word.1":"words","Tag":"labels"}, inplace =True)

In [13]:
data["labels"] = data["labels"].str.upper()

In [14]:
data.head(30)

Unnamed: 0,sentence_id,words,labels
2,0,robert,B-OTHERPER
3,0,gottschalk,I-OTHERPER
4,0,1939,O
5,0,academy,B-VISUALWORK
6,0,award,I-VISUALWORK
7,0,winner,O
8,0,and,O
9,0,founder,O
10,0,of,O
11,0,panavision,B-ORG


In [15]:
labels = data["labels"].unique().tolist()
labels

['B-OTHERPER',
 'I-OTHERPER',
 'O',
 'B-VISUALWORK',
 'I-VISUALWORK',
 'B-ORG',
 'B-ARTIST',
 'I-ARTIST',
 'B-HUMANSETTLEMENT',
 'B-WRITTENWORK',
 'B-SOFTWARE',
 'I-SOFTWARE',
 'I-WRITTENWORK',
 'B-POLITICIAN',
 'I-POLITICIAN',
 'B-ATHLETE',
 'I-ATHLETE',
 'B-MUSICALWORK',
 'I-MUSICALWORK',
 'I-HUMANSETTLEMENT',
 'B-FACILITY',
 'I-FACILITY',
 'B-SCIENTIST',
 'I-SCIENTIST',
 'B-CLERIC',
 'I-CLERIC',
 'I-ORG',
 'B-SPORTSGRP',
 'B-MUSICALGRP',
 'I-MUSICALGRP',
 'B-SPORTSMANAGER',
 'I-SPORTSMANAGER',
 'B-PUBLICCORP',
 'I-PUBLICCORP',
 'B-OTHERPROD',
 'B-MEDICALPROCEDURE',
 'I-MEDICALPROCEDURE',
 'B-ARTWORK',
 'I-ARTWORK',
 'B-FOOD',
 'I-FOOD',
 'B-STATION',
 'I-STATION',
 'I-OTHERPROD',
 'B-CARMANUFACTURER',
 'B-OTHERLOC',
 'I-OTHERLOC',
 'B-PRIVATECORP',
 'I-SPORTSGRP',
 'B-DISEASE',
 'B-VEHICLE',
 'I-VEHICLE',
 'I-PRIVATECORP',
 'B-MEDICATION/VACCINE',
 'B-SYMPTOM',
 'I-MEDICATION/VACCINE',
 'I-DISEASE',
 'B-ANATOMICALSTRUCTURE',
 'I-ANATOMICALSTRUCTURE',
 'I-SYMPTOM',
 'B-AEROSPACEMANUF

#Training the model

In [16]:
args = NERArgs()
args.num_train_epochs = 1
args.learning_rate = 1e-4
args.overwrite_output_dir =True
args.train_batch_size = 32
args.eval_batch_size = 32
#args.use_cuda = False

In [17]:
model = NERModel('bert', 'bert-large-uncased',labels=labels,args =args)

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [18]:
X= data[["sentence_id","words"]]
Y =data["labels"]

In [19]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size =0.2)

In [20]:
#building up train data and eval data
train_data = pd.DataFrame({"sentence_id":x_train["sentence_id"],"words":x_train["words"],"labels":y_train})
test_data = pd.DataFrame({"sentence_id":x_test["sentence_id"],"words":x_test["words"],"labels":y_test})

In [21]:
model.train_model(train_data, eval_data = test_data,acc=accuracy_score)

  return [


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/525 [00:00<?, ?it/s]



(525, 0.7326019595918202)

#Model evaluation

In [22]:
test_dataset_github_link = "https://raw.githubusercontent.com/SadiulArefin/SemEval-2023-English-Named-Entity-Recognition/main/en-test.csv"

test = pd.read_csv(test_dataset_github_link)

In [23]:
test.drop('Unnamed: 4', inplace=True, axis=1)

In [24]:
test = test.dropna(subset=['Tag'])
test = test[test['Tag'] != 'id']
test.drop('dummy', inplace=True, axis=1)

In [25]:
test["Tag"] = test["Tag"].str.upper()

In [26]:
test.head(30)

Unnamed: 0,Sentence#,Word,Tag
1,0,eli,B-OTHERPER
2,0,lilly,I-OTHERPER
3,0,founder,O
4,0,president,O
5,0,of,O
6,0,pharmaceutical,O
7,0,company,O
8,0,eli,B-PUBLICCORP
9,0,lilly,I-PUBLICCORP
10,0,and,I-PUBLICCORP


In [27]:
test_sentences = test.groupby('Sentence#')['Word'].apply(lambda x: ' '.join(x)).reset_index(name='Sentence')

In [28]:
test_sentences.head(30)

Unnamed: 0,Sentence#,Sentence
0,0,eli lilly founder president of pharmaceutical ...
1,1,christoph haberland designed a new marble pulp...
2,2,he was succeeded as chancellor by sir frank ki...
3,3,it was described by edward meyrick in 1915 .
4,4,having suffered depredation of the more movabl...
5,5,it was described by charles joseph gahan in 18...
6,6,two important voices who applied incommensurab...
7,7,the adoption of the ( of charles v ) in 1532 m...
8,8,from 1995 to 2011 deal hudson was the magazine...
9,9,he inspired medical student alexander rich to ...


Making sentences 

In [29]:
sent_list = test_sentences['Sentence'].tolist()

In [30]:
print(sent_list)

['eli lilly founder president of pharmaceutical company eli lilly and company', 'christoph haberland designed a new marble pulpit for the church which was built in italy in 1793 .', 'he was succeeded as chancellor by sir frank kitto .', 'it was described by edward meyrick in 1915 .', 'having suffered depredation of the more movable stones of the site the monument was excavated by w j hemp in 1928 – 29 .', 'it was described by charles joseph gahan in 1894 .', 'two important voices who applied incommensurability to historical and philosophical notions of science in the 1960s are thomas kuhn and paul feyerabend .', 'the adoption of the ( of charles v ) in 1532 made inquisitional procedures empirical law .', 'from 1995 to 2011 deal hudson was the magazine s publisher .', 'he inspired medical student alexander rich to pursue an academic career .', 'it stars tomokazu sugita daisuke sakaguchi rie kugimiya among others .', 'the main event featured thales leites taking on jesse taylor in a midd

Predicting the results from the sentences

In [31]:
prediction, model_output = model.predict(sent_list)

  0%|          | 0/2 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/28 [00:00<?, ?it/s]

In [32]:
prediction[3]

[{'it': 'O'},
 {'was': 'O'},
 {'described': 'O'},
 {'by': 'O'},
 {'edward': 'B-OTHERPER'},
 {'meyrick': 'I-OTHERPER'},
 {'in': 'O'},
 {'1915': 'O'},
 {'.': 'O'}]

Processing the results for evaluation

In [33]:
words = []
tags = []

for sentence in prediction:
    for word_tag in sentence:
        word, tag = next(iter(word_tag.items()))
        words.append(word)
        tags.append(tag)

In [34]:
prediction_column = pd.DataFrame({'word': words, 'tag': tags})

In [35]:
prediction_column.head(30)

Unnamed: 0,word,tag
0,eli,B-SCIENTIST
1,lilly,I-PUBLICCORP
2,founder,O
3,president,O
4,of,O
5,pharmaceutical,I-PUBLICCORP
6,company,I-PUBLICCORP
7,eli,B-SCIENTIST
8,lilly,I-PUBLICCORP
9,and,O


In [36]:
test.head(30)

Unnamed: 0,Sentence#,Word,Tag
1,0,eli,B-OTHERPER
2,0,lilly,I-OTHERPER
3,0,founder,O
4,0,president,O
5,0,of,O
6,0,pharmaceutical,O
7,0,company,O
8,0,eli,B-PUBLICCORP
9,0,lilly,I-PUBLICCORP
10,0,and,I-PUBLICCORP


In [37]:
gold = test["Tag"].tolist()
print(gold)

['B-OTHERPER', 'I-OTHERPER', 'O', 'O', 'O', 'O', 'O', 'B-PUBLICCORP', 'I-PUBLICCORP', 'I-PUBLICCORP', 'I-PUBLICCORP', 'B-OTHERPER', 'I-OTHERPER', 'O', 'O', 'O', 'O', 'B-OTHERPROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-HUMANSETTLEMENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OTHERPER', 'I-OTHERPER', 'I-OTHERPER', 'O', 'O', 'O', 'O', 'O', 'B-OTHERPER', 'I-OTHERPER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OTHERPER', 'I-OTHERPER', 'I-OTHERPER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OTHERPER', 'I-OTHERPER', 'I-OTHERPER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-SCIENTIST', 'I-SCIENTIST', 'O', 'B-OTHERPER', 'I-OTHERPER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OTHERPER', 'I-OTHERPER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OTHERPER', 'I-OTHERPER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OTHERPER', 'I-OTHERPER', 'O

In [38]:
predict = prediction_column["tag"].tolist()
print(predict)

['B-SCIENTIST', 'I-PUBLICCORP', 'O', 'O', 'O', 'I-PUBLICCORP', 'I-PUBLICCORP', 'B-SCIENTIST', 'I-PUBLICCORP', 'O', 'I-PUBLICCORP', 'B-OTHERPER', 'I-OTHERPER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-HUMANSETTLEMENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-POLITICIAN', 'B-POLITICIAN', 'I-POLITICIAN', 'O', 'O', 'O', 'O', 'O', 'B-OTHERPER', 'I-OTHERPER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OTHERPER', 'I-OTHERPER', 'I-OTHERPER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OTHERPER', 'I-OTHERPER', 'I-OTHERPER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OTHERPER', 'I-OTHERPER', 'O', 'B-OTHERPER', 'I-OTHERPER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-POLITICIAN', 'I-POLITICIAN', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OTHERPER', 'I-OTHERPER', 'O', 

#Final Result

In [39]:
from sklearn.metrics import f1_score

# Calculate the F1 score
f1 = f1_score(gold, predict, average='micro')

# Print the F1 score
print("F1 score:", f1)

F1 score: 0.8876379193875253
