In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification

# Load the dataset
data = pd.read_csv('/content/Big_Dataset.csv')

# Fail early with a clear message if an expected column is missing.
# (The earlier rename of 'Langid' -> 'Langid' was a no-op and has been dropped.)
missing_columns = {'Sentences', 'Langid'} - set(data.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

# Define features and labels.
# Rows are dropped JOINTLY: calling dropna() on each column separately would
# leave X and y with different rows whenever only one of the two values is
# missing, breaking the sentence/label pairing. `data` itself is not modified.
labelled = data.dropna(subset=['Sentences', 'Langid'])
X = labelled['Sentences']  # Sentences/Texts
y = labelled['Langid']  # Language labels

# Split into train and validation sets (70/30, stratified by language)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Create a mapping of unique language codes to integer labels
# (ids follow the order in which the codes first appear in the dataset)
label_map = {label: idx for idx, label in enumerate(y.unique())}
id2label = {idx: label for label, idx in label_map.items()}

# Convert labels to numerical format
y_train_encoded = y_train.map(label_map)
y_test_encoded = y_test.map(label_map)

# Load the pretrained multilingual BERT model.
# The mapping is stored in the model config so that a checkpoint written with
# save_pretrained() carries its own id <-> language-code table.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(label_map),
    id2label=id2label,
    label2id=label_map,
)

# Print label mapping for reference
print("Label Mapping:", label_map)

# Check the size of the training and test datasets
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Label Mapping: {'te': 0, 'kn': 1, 'ml': 2, 'ta': 3, 'hi': 4, 'mr': 5, 'gu': 6, 'bn': 7, 'pa': 8, 'ur': 9, 'or': 10, 'sd': 11}
Training set size: 4200
Test set size: 1800


In [13]:
from transformers import BertTokenizer, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch

In [14]:
# 1. Tokenization
# Load the tokenizer of the 'bert-base-multilingual-cased' checkpoint, the same
# checkpoint name the classification model is initialised from.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')


In [15]:
# Encode the train and test sentences as PyTorch tensors, padding each split
# and truncating every sentence to at most 128 tokens.
train_encodings = tokenizer(
    X_train.tolist(),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)
test_encodings = tokenizer(
    X_test.tolist(),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)

In [16]:
# 2. Prepare DataLoader
# Wrap the integer-encoded labels of both splits in tensors
train_labels, test_labels = (
    torch.tensor(encoded.values)
    for encoded in (y_train_encoded, y_test_encoded)
)

In [17]:
# Bundle input ids, attention masks and labels into one dataset per split
train_dataset, test_dataset = (
    TensorDataset(encodings['input_ids'], encodings['attention_mask'], split_labels)
    for encodings, split_labels in (
        (train_encodings, train_labels),
        (test_encodings, test_labels),
    )
)

In [18]:
# Batch both splits in groups of 32; only the training data is shuffled
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
)

In [19]:
# 3. Set up the optimizer
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# and has been removed from recent releases. eps and weight_decay are pinned to
# the old transformers defaults (1e-6 and 0.0) to stay close to the previous
# update rule; torch's own defaults are 1e-8 and 0.01.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0
)




In [20]:
# 4. Training loop
# Pick the GPU when one is visible, otherwise fall back to the CPU, and move
# the model there.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [21]:
# Number of full passes over the training data made by the training loop
epochs = 30


In [22]:
from tqdm import tqdm  # progress bar around the batch iterator

for epoch in range(epochs):
    # Training mode for this pass over the data
    model.train()
    batch_losses = []

    progress = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}", unit="batch")
    for batch in progress:
        # Each batch is (input_ids, attention_mask, labels); move all to the device
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Passing labels makes the model compute the classification loss itself
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Back-propagate, then apply the parameter update
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Mean of the per-batch losses over the epoch
    total_train_loss = sum(batch_losses)
    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs} - Training loss: {avg_train_loss}")


Epoch 1/30: 100%|██████████| 132/132 [01:27<00:00,  1.52batch/s]


Epoch 1/30 - Training loss: 1.7043093284874251


Epoch 2/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 2/30 - Training loss: 0.5891606523232027


Epoch 3/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 3/30 - Training loss: 0.29541441566790594


Epoch 4/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 4/30 - Training loss: 0.14587096638525976


Epoch 5/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 5/30 - Training loss: 0.08352493717702049


Epoch 6/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 6/30 - Training loss: 0.05724335122486633


Epoch 7/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 7/30 - Training loss: 0.0354004738513719


Epoch 8/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 8/30 - Training loss: 0.02395020620403529


Epoch 9/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 9/30 - Training loss: 0.01393175316325417


Epoch 10/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 10/30 - Training loss: 0.0233291472223672


Epoch 11/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 11/30 - Training loss: 0.020818091960708527


Epoch 12/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 12/30 - Training loss: 0.04281103343942739


Epoch 13/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 13/30 - Training loss: 0.023400667966625682


Epoch 14/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 14/30 - Training loss: 0.018407123706613977


Epoch 15/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 15/30 - Training loss: 0.041174949132696245


Epoch 16/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 16/30 - Training loss: 0.01970097564238434


Epoch 17/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 17/30 - Training loss: 0.00407971565216554


Epoch 18/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 18/30 - Training loss: 0.006021793819716991


Epoch 19/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 19/30 - Training loss: 0.013299077020774625


Epoch 20/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 20/30 - Training loss: 0.019434632384218276


Epoch 21/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 21/30 - Training loss: 0.028604894033350953


Epoch 22/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 22/30 - Training loss: 0.016259735455440186


Epoch 23/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 23/30 - Training loss: 0.01760870157423514


Epoch 24/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 24/30 - Training loss: 0.002726288998294904


Epoch 25/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 25/30 - Training loss: 0.0016001095072803737


Epoch 26/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 26/30 - Training loss: 0.0013961246965636471


Epoch 27/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 27/30 - Training loss: 0.001547411817944411


Epoch 28/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 28/30 - Training loss: 0.001085918612344275


Epoch 29/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]


Epoch 29/30 - Training loss: 0.0010135490059231718


Epoch 30/30: 100%|██████████| 132/132 [01:26<00:00,  1.53batch/s]

Epoch 30/30 - Training loss: 0.001576722610590161





In [43]:
# 5. Save the model
# Writes the model to ./language_model; later cells reload it from that
# directory with from_pretrained().
model.save_pretrained("./language_model")

In [44]:
# Save the tokenizer files into the same directory as the model
tokenizer.save_pretrained("./language_model")

('./language_model/tokenizer_config.json',
 './language_model/special_tokens_map.json',
 './language_model/vocab.txt',
 './language_model/added_tokens.json')

In [45]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the trained model and tokenizer
model_path = "./language_model"  # Path where the model is saved
model = BertForSequenceClassification.from_pretrained(model_path)
# Load the tokenizer from the directory it was saved to, so model and tokenizer
# always come from the same artefact instead of re-fetching the base checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_path)

# Move model to device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set model to evaluation mode

# Function to predict language code
def predict_language(sentence):
    """Return the language code predicted for ``sentence``.

    The sentence is tokenized (truncated to 128 tokens) with the notebook-level
    ``tokenizer``, run through the notebook-level ``model`` on ``device`` with
    gradient tracking disabled, and the highest-scoring class id is translated
    back to its language code through the notebook-level ``label_map``.
    """
    encoded = tokenizer(sentence, padding=True, truncation=True, max_length=128, return_tensors="pt")

    with torch.no_grad():
        result = model(
            encoded["input_ids"].to(device),
            attention_mask=encoded["attention_mask"].to(device),
        )

    # Index of the largest logit for the (single) input row
    best_id = result.logits.argmax(dim=1).item()

    # label_map goes code -> id, so collect the code(s) that own this id
    matching_codes = [code for code, class_id in label_map.items() if class_id == best_id]
    return matching_codes[0]

# Run the classifier over every test sentence, one sentence at a time
predicted_languages = list(map(predict_language, X_test))

# Table of sentence, true code and predicted code, one row per test sentence
output_columns = {
    'Sentence': X_test.values,
    'Actual Language Code': y_test.values,
    'Predicted Language Code': predicted_languages,
}
output_df = pd.DataFrame(output_columns)

# Write the table to CSV without the index column
output_file = "predicted_language_codes.csv"
output_df.to_csv(output_file, index=False)

print(f"Predictions saved to {output_file}")


Predictions saved to predicted_language_codes.csv


In [27]:
# Sanity check: (rows, columns) of the loaded dataset
data.shape

(6000, 2)

In [28]:
# Calculate the accuracy score: the share of test sentences whose predicted
# language code equals the actual code in y_test.

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, predicted_languages)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 91.56%


In [48]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Path to the saved model and tokenizer.
# Relative, to match the save_pretrained("./language_model") calls that wrote
# the files, instead of an absolute path that only exists on one machine.
model_pathe = './language_model'
tokenizer_pathe = './language_model'

# Load the model and tokenizer from the saved directory
model = BertForSequenceClassification.from_pretrained(model_pathe)
tokenizer = BertTokenizer.from_pretrained(tokenizer_pathe)

In [49]:
import torch.nn.functional as F

In [58]:
# df1=pd.read_csv("/content/testing data.csv")
# languages = ['te', 'kn', 'ml', 'ta', 'hi', 'mr', 'gu', 'bn', 'pa', 'ur', 'or', 'sd']

In [59]:


# # Create lists to store the predicted language codes and confidence scores
# predicted_labels = []
# confidence_scores = []

# # Loop through each sentence and perform inference
# for sentence in df1['Sentence']:
#     # Tokenize the sentence
#     inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True, max_length=512)

#     # Perform inference (turn off gradients for inference)
#     with torch.no_grad():
#         outputs = model(**inputs)

#     # Get the logits (raw predictions)
#     logits = outputs.logits

#     # Apply softmax to get the probabilities (confidence scores)
#     probabilities = F.softmax(logits, dim=-1)

#     # Get the predicted class (index of max probability)
#     predicted_class = torch.argmax(logits, dim=-1)

#     # Get the confidence score (probability of the predicted class)
#     confidence_score = probabilities[0, predicted_class].item()

#     # Map the predicted class index to the corresponding language code
#     predicted_language = languages[predicted_class.item()]

#     # Append the predictions and confidence score to the lists
#     predicted_labels.append(predicted_language)
#     confidence_scores.append(confidence_score)

# # Add the predictions to the dataframe
# df1['predicted_language_code'] = predicted_labels
# df1['confidence_score'] = confidence_scores

# # Display the updated dataframe
# print(df1)


In [60]:
# df1.to_csv('testing_results.csv', index=False)


In [1]:
input_text= "Jadon vi kujh anakiasiya vaparda hai, hara koi anukula hona di koshish karada hai, apaniya yojanava nu anukula banaunda hai, nave hunara sikhada hai, ate apani yatra jari rakhada hai, iha vishwasa karade hoye ki tabadali nave mauke lia sakadi hai."
# Tokenize the input text.
# NOTE: `tokenizer` must already exist in this kernel (run the cell that loads
# it first); on a fresh kernel this cell raises NameError.
# Truncate to the same 128-token limit used for the training data so that a
# long input can never exceed the model's 512 position embeddings.
inputs = tokenizer(input_text, truncation=True, max_length=128, return_tensors="pt")

NameError: name 'tokenizer' is not defined

In [132]:
# Perform inference (turn off gradients for inference)
with torch.no_grad():
    outputs = model(**inputs)

# Get the logits (raw predictions), one row per input sentence
logits = outputs.logits

# Apply softmax to get the probabilities (confidence scores)
probabilities = F.softmax(logits, dim=-1)

# Class index -> language code. The order must be identical to the label
# mapping used at training time (the "Label Mapping" printed when the data was
# prepared); a list in any other order would silently report wrong languages.
languages = ['te', 'kn', 'ml', 'ta', 'hi', 'mr', 'gu', 'bn', 'pa', 'ur', 'or', 'sd']
if logits.shape[-1] != len(languages):
    raise ValueError(
        f"Model outputs {logits.shape[-1]} classes but {len(languages)} "
        "language codes are listed; the label list is out of sync with the model."
    )

# Index of the highest-scoring class for the single input sentence
predicted_class = torch.argmax(logits, dim=-1)
# Get the confidence score (the probability of the predicted class)
confidence_score = probabilities[0, predicted_class].item()

# Map the predicted class index to its language code
predicted_language = languages[predicted_class.item()]

print(f"Predicted language: {predicted_language}")
print(f"Confidence score: {confidence_score:.4f}")

Predicted language: pa
Confidence score: 0.9996
