# Importing the library

In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder

# Reading .txt file and converting to dataframe

In [2]:
def dataframe_for_encoding(file_name):
    # Read the data from the file
    with open(file_name, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Initialize lists to store tokens and labels
    tokens = []
    labels = []

    # Process the lines
    for line in lines:
        line = line.strip()
        if line:
            token, label = line.rsplit(' ', 1)
            tokens.append(token)
            labels.append(label)
        else:
            tokens.append('')
            labels.append('')

    # Create a DataFrame
    df = pd.DataFrame({'Token': tokens, 'Label': labels})
    return df

# Display the DataFrame
df = dataframe_for_encoding('/content/drive/MyDrive/NER_data/DanfeNER-train-bio.txt')
df.head()

Unnamed: 0,Token,Label
0,-DOCSTART-,O
1,,
2,जाँदा,O
3,जाँदै,O
4,",",O


# Inserting sentence id for identifing the particular sentence.

In [3]:
# Initialize sentence_id variable
sentence_id = 1

# Iterate through the DataFrame
for index, row in df.iterrows():
    if row['Token'] == '':
        # Assign sentence_id before the empty value
        df.at[index-1, 'SentenceID'] = f"sentence{int(sentence_id)}"
        sentence_id += 1
        # Delete the row with the empty value
        df = df.drop(index)
    else:
        df.at[index, 'SentenceID'] = f"sentence{int(sentence_id)}"

# Assign sentence_id for the last sentence (if applicable)
df.at[df.index[-1], 'SentenceID'] = f"sentence{int(sentence_id)}"

# Reset index after removing rows
df = df.reset_index(drop=True)

# Print the modified DataFrame
print(df)

             Token  Label    SentenceID
0       -DOCSTART-      O     sentence1
1            जाँदा      O     sentence2
2            जाँदै      O     sentence2
3                ,      O     sentence2
4       बाह्रबुंदे      O     sentence2
...            ...    ...           ...
115338       नम्बर      O  sentence6847
115339     टिप्नुस      O  sentence6847
115340   ९८७८९१२३४  PHONE  sentence6847
115341          हो      O  sentence6847
115342           ।      O  sentence6848

[115343 rows x 3 columns]


In [4]:
# Encoding the sentence_id using LabelEncoder class
df['SentenceID'] = LabelEncoder().fit_transform(df['SentenceID'])
df['Label'].unique()

array(['O', 'B-PER', 'B-ORG', 'B-DATE', 'I-DATE', 'I-ORG', 'I-PER',
       'B-LOC', 'I-LOC', 'B-EVENT', 'I-EVENT', 'PHONE', 'USERNAME'],
      dtype=object)

In [5]:
df2 = pd.read_csv("/content/drive/MyDrive/NER_data/NER.csv")

# Corrected the lambda function
df2['sentence_id'] = df2.apply(lambda x: 6481 + x['sentence_id'], axis=1)

df2.rename(columns={"sentence_id": "SentenceID", "words": "Token", "labels": "Label"}, inplace=True)

# Concatanate df and df2
df=pd.concat([df,df2], axis=0)
df.reset_index(inplace=True)


In [6]:
df.to_csv("/content/drive/MyDrive/NER_data/total_ner_data.csv")