In [3]:
import pandas as pd
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

# Function to preprocess the English Input Text

In [4]:
import re

def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    # Lowercase
    text = text.lower()
    
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove non-printable characters
    text = re.sub(r'[^\x20-\x7E]', '', text)

    # Optional: Remove special symbols (e.g., HTML tags, currency symbols)
    text = re.sub(r'<.*?>', '', text)  # remove HTML
    text = re.sub(r'[^\w\s.,;:!?\'"-]', '', text)  # allow punctuation

    return text

In [5]:
df = pd.read_excel("./Data/insurance_policies.xlsx")
df["Policy_Text_EN_Clean"] = df["Policy_Text_EN"].apply(preprocess_text)
texts = df["Policy_Text_EN_Clean"].tolist()

# For complete translations, implement the method:
1. Splits long policy texts into sentences.
2. Translates each sentence individually (to avoid token overflow).
3. Reassembles the translated sentences into one coherent result per policy.

In [7]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Harish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)

In [13]:
def translate_policy_mbart(policy_text, target_lang):
    tokenizer.src_lang = "en_XX"
    sentences = sent_tokenize(policy_text)
    translated_sentences = []

    for sent in sentences:
        encoded = tokenizer(sent, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        generated = model.generate(**encoded, forced_bos_token_id=tokenizer.lang_code_to_id[target_lang], max_length=512)
        translated = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
        translated_sentences.append(translated)

    return ' '.join(translated_sentences)

In [14]:
df["Policy_Text_FR_mBART"] = df["Policy_Text_EN_Clean"].apply(lambda x: translate_policy_mbart(x, "fr_XX"))
df["Policy_Text_ES_mBART"] = df["Policy_Text_EN_Clean"].apply(lambda x: translate_policy_mbart(x, "es_XX"))

In [15]:
df.to_excel("./Data/translated_insurance_policies.xlsx", index=False)

# Summarize Preprocessed Insurance Policy Text

In [43]:
# Load Excel File
#df = pd.read_excel("./Data/translated_insurance_policies.xlsx")

# Load summarization model (BART)
summarizer_name = "facebook/bart-large-cnn"
summarizer_tokenizer = BartTokenizer.from_pretrained(summarizer_name)
summarizer_model = BartForConditionalGeneration.from_pretrained(summarizer_name).to(device)

# Load translation model (mBART)
translator_name = "facebook/mbart-large-50-many-to-many-mmt"
translator_tokenizer = MBart50TokenizerFast.from_pretrained(translator_name)
translator_model = MBartForConditionalGeneration.from_pretrained(translator_name).to(device)

In [44]:
# Summarization function
def summarize_long_text(text):
    if not isinstance(text, str) or text.strip() == "":
        return ""
    inputs = summarizer_tokenizer(text, return_tensors="pt", truncation=True, max_length=1024).to(device)
    summary_ids = summarizer_model.generate(
        inputs["input_ids"],
        num_beams=4,
        max_length= 520,
        min_length=150,
        length_penalty=1.0,
        early_stopping=True
    )
    return summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [45]:
# Translation function
def translate(text, target_lang="fr_XX"):
    if not isinstance(text, str) or text.strip() == "":
        return ""
    translator_tokenizer.src_lang = "en_XX"
    encoded = translator_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    generated_tokens = translator_model.generate(
        **encoded,
        forced_bos_token_id=translator_tokenizer.lang_code_to_id[target_lang],
        max_length=512
    )
    return translator_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

In [46]:
# Apply summarization and translations
summaries_en = []
summaries_fr = []
summaries_es = []

for text in df["Policy_Text_EN_Clean"]:
    summary = summarize_long_text(text)
    summaries_en.append(summary)
    summaries_fr.append(translate(summary, target_lang="fr_XX"))
    summaries_es.append(translate(summary, target_lang="es_XX"))

# Add columns to DataFrame
df["Policy_Summary_EN"] = summaries_en
df["Policy_Summary_FR"] = summaries_fr
df["Policy_Summary_ES"] = summaries_es

# Save to new Excel file
output_file = "translated_insurance_policies_with_summaries.xlsx"
df.to_excel(output_file, index=False)

# Sentence Alignment Analysis:

### Load Data

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_excel("C:/Users/Harish/Desktop/GUVI/FinalProject_2/Data/translated_insurance_policies_with_summaries.xlsx")

In [3]:
df.head(3)

Unnamed: 0,Policy_ID,Policy_Name,Policy_Text_EN,Policy_Text_EN_Clean,Policy_Text_FR_mBART,Policy_Text_ES_mBART,Policy_Summary_EN,Policy_Summary_FR,Policy_Summary_ES
0,P001,Bike Insurance,Bike insurance is the ultimate safety net for ...,bike insurance is the ultimate safety net for ...,L'assurance-bicyclette est le filet de sécurit...,la seguridad de la moto es la máxima seguridad...,Bike insurance provides coverage against natur...,L'assurance-bicyclette offre une protection co...,El seguro de bicicleta ofrece cobertura contra...
1,P002,Car Insurance,"Car insurance, also known as auto or motor ins...","car insurance, also known as auto or motor ins...","L'assurance automobile, également connue sous ...","el seguro de coche, también conocido como segu...",Car insurance is a financial safety net that p...,L'assurance automobile est un réseau de sécuri...,El seguro de coche es una red de seguridad fin...
2,P003,Health Insurance,"Health insurance, also known as medical insura...","health insurance, also known as medical insura...","L'assurance maladie, également connue sous le ...","seguro médico, también conocido como seguro mé...","health insurance, also known as medical insura...","assurance maladie, également connue sous le no...","seguro médico, también conocido como seguro mé..."


In [5]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModel.from_pretrained("bert-base-multilingual-cased", output_hidden_states=True)

def get_sentence_embedding(text, tokenizer, model, pooling='mean'):
    """
    Get sentence-level embedding using different pooling strategies
    """
    inputs = tokenizer(
        text,
        return_tensors='pt',
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding=True
    )
    
    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs.last_hidden_state.squeeze(0)
        
        # Remove [CLS] and [SEP] tokens for mean pooling
        if pooling == 'mean':
            # Exclude [CLS] (first) and [SEP] (last) tokens
            token_embeddings = hidden_states[1:-1]
            sentence_embedding = torch.mean(token_embeddings, dim=0)
        elif pooling == 'cls':
            # Use [CLS] token embedding
            sentence_embedding = hidden_states[0]
        elif pooling == 'max':
            # Max pooling over token embeddings
            token_embeddings = hidden_states[1:-1]
            sentence_embedding = torch.max(token_embeddings, dim=0)[0]
    
    return sentence_embedding.numpy()

def calculate_similarity_scores(df, src_col='Policy_Text_EN_Clean', tgt_col='Policy_Text_FR_mBART'):
    """
    Calculate various similarity scores for each translation pair
    """
    similarity_results = []
    
    print("Calculating similarity scores for each translation...")
    
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        sent_src = str(row[src_col])
        sent_tgt = str(row[tgt_col])
        
        # Skip if either text is empty or NaN
        if pd.isna(sent_src) or pd.isna(sent_tgt) or sent_src.strip() == '' or sent_tgt.strip() == '':
            continue
        
        # Get sentence embeddings using different pooling methods
        src_emb_mean = get_sentence_embedding(sent_src, tokenizer, model, pooling='mean')
        tgt_emb_mean = get_sentence_embedding(sent_tgt, tokenizer, model, pooling='mean')
        
        src_emb_cls = get_sentence_embedding(sent_src, tokenizer, model, pooling='cls')
        tgt_emb_cls = get_sentence_embedding(sent_tgt, tokenizer, model, pooling='cls')
        
        # Calculate cosine similarities
        cos_sim_mean = cosine_similarity([src_emb_mean], [tgt_emb_mean])[0][0]
        cos_sim_cls = cosine_similarity([src_emb_cls], [tgt_emb_cls])[0][0]
        
        # Calculate token-level alignment score (your original approach)
        token_alignment_score = calculate_token_alignment_score(sent_src, sent_tgt, tokenizer, model)
        
        # Store results
        result = {
            'index': idx,
            'source_text': sent_src[:100] + '...' if len(sent_src) > 100 else sent_src,
            'target_text': sent_tgt[:100] + '...' if len(sent_tgt) > 100 else sent_tgt,
            'cosine_similarity_mean_pooling': cos_sim_mean,
            'cosine_similarity_cls_pooling': cos_sim_cls,
            'token_alignment_score': token_alignment_score,
            'source_length': len(sent_src.split()),
            'target_length': len(sent_tgt.split())
        }
        
        similarity_results.append(result)
    
    return pd.DataFrame(similarity_results)

def calculate_token_alignment_score(sent_src, sent_tgt, tokenizer, model):
    """
    Calculate token-level alignment score (average of best matches)
    """
    try:
        # Tokenize separately
        src_inputs = tokenizer(
            sent_src,
            return_tensors='pt',
            add_special_tokens=True,
            max_length=512,
            truncation=True
        )
        
        tgt_inputs = tokenizer(
            sent_tgt,
            return_tensors='pt',
            add_special_tokens=True,
            max_length=512,
            truncation=True
        )
        
        # Get embeddings
        with torch.no_grad():
            src_outputs = model(**src_inputs)
            tgt_outputs = model(**tgt_inputs)
        
        # Get token embeddings and remove [CLS] and [SEP]
        src_emb = src_outputs.last_hidden_state.squeeze(0)[1:-1]
        tgt_emb = tgt_outputs.last_hidden_state.squeeze(0)[1:-1]
        
        if src_emb.size(0) == 0 or tgt_emb.size(0) == 0:
            return 0.0
        
        # Compute cosine similarity matrix
        similarity_matrix = torch.nn.functional.cosine_similarity(
            src_emb.unsqueeze(1),
            tgt_emb.unsqueeze(0),
            dim=-1
        )
        
        # Get best alignment score for each source token
        best_scores = torch.max(similarity_matrix, dim=1)[0]
        avg_alignment_score = torch.mean(best_scores).item()
        
        return avg_alignment_score
        
    except Exception as e:
        print(f"Error calculating token alignment: {e}")
        return 0.0

def analyze_similarity_results(results_df):
    """
    Analyze and visualize similarity results
    """
    print("\n" + "="*50)
    print("TRANSLATION SIMILARITY ANALYSIS RESULTS")
    print("="*50)
    
    # Basic statistics
    print(f"\nTotal translations analyzed: {len(results_df)}")
    print(f"\nSimilarity Score Statistics:")
    print(f"{'Metric':<30} {'Mean':<8} {'Std':<8} {'Min':<8} {'Max':<8}")
    print("-" * 62)
    
    metrics = ['cosine_similarity_mean_pooling', 'cosine_similarity_cls_pooling', 'token_alignment_score']
    for metric in metrics:
        mean_val = results_df[metric].mean()
        std_val = results_df[metric].std()
        min_val = results_df[metric].min()
        max_val = results_df[metric].max()
        print(f"{metric:<30} {mean_val:<8.3f} {std_val:<8.3f} {min_val:<8.3f} {max_val:<8.3f}")
    
    # Quality assessment
    print(f"\nQuality Assessment (based on mean pooling cosine similarity):")
    high_quality = len(results_df[results_df['cosine_similarity_mean_pooling'] > 0.8])
    medium_quality = len(results_df[(results_df['cosine_similarity_mean_pooling'] > 0.6) & 
                                   (results_df['cosine_similarity_mean_pooling'] <= 0.8)])
    low_quality = len(results_df[results_df['cosine_similarity_mean_pooling'] <= 0.6])
    
    print(f"High quality translations (>0.8): {high_quality} ({high_quality/len(results_df)*100:.1f}%)")
    print(f"Medium quality translations (0.6-0.8): {medium_quality} ({medium_quality/len(results_df)*100:.1f}%)")
    print(f"Low quality translations (≤0.6): {low_quality} ({low_quality/len(results_df)*100:.1f}%)")
    
    # Show best examples
    print(f"\n" + "="*50)
    print("BEST TRANSLATIONS (Top 5)")
    print("="*50)
    best_translations = results_df.nlargest(5, 'cosine_similarity_mean_pooling')
    for _, row in best_translations.iterrows():
        print(f"\nSimilarity Score: {row['cosine_similarity_mean_pooling']:.3f}")
        print(f"Source: {row['source_text']}")
        print(f"Target: {row['target_text']}")
        print("-" * 40)
    
    return results_df

# Main execution
if __name__ == "__main__":

    print("Starting translation similarity analysis...")
    
    # Calculate similarity scores for all translations
    similarity_results = calculate_similarity_scores(df)
    
    # Analyze results
    analyzed_results = analyze_similarity_results(similarity_results)
    
    # Display sample results
    print(f"\nSample Results:")
    print(similarity_results[['index', 'cosine_similarity_mean_pooling', 
                             'cosine_similarity_cls_pooling', 'token_alignment_score']].head(10))

Starting translation similarity analysis...
Calculating similarity scores for each translation...


100%|██████████| 8/8 [00:51<00:00,  6.46s/it]


TRANSLATION SIMILARITY ANALYSIS RESULTS

Total translations analyzed: 8

Similarity Score Statistics:
Metric                         Mean     Std      Min      Max     
--------------------------------------------------------------
cosine_similarity_mean_pooling 0.836    0.048    0.719    0.862   
cosine_similarity_cls_pooling  0.901    0.103    0.648    0.954   
token_alignment_score          0.597    0.037    0.524    0.644   

Quality Assessment (based on mean pooling cosine similarity):
High quality translations (>0.8): 7 (87.5%)
Medium quality translations (0.6-0.8): 1 (12.5%)
Low quality translations (≤0.6): 0 (0.0%)

BEST TRANSLATIONS (Top 5)

Similarity Score: 0.862
Source: bike insurance is the ultimate safety net for your two-wheeler. be it a simple moped or a superbike,...
Target: L'assurance-bicyclette est le filet de sécurité ultime pour votre bicyclette. qu'il s'agisse d'un si...
----------------------------------------

Similarity Score: 0.858
Source: home insurance is 




# Model Fine-Tuning

### Train Hugging Face Transformer model mBART using this dataset.

### Train and Save French Translator model mBART

In [20]:
from transformers import MBartForConditionalGeneration, MBartTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

# Step 1: Load data
df = pd.read_excel("C:/Users/Harish/Desktop/GUVI/FinalProject_2/Data/translated_insurance_policies_with_summaries.xlsx")
df = df[['Policy_Text_EN_Clean', 'Policy_Text_FR_mBART']]
df.columns = ['src_text', 'tgt_text']
dataset = Dataset.from_pandas(df)

# Step 2: Load model and tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBartTokenizer.from_pretrained(model_name)
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "fr_XX"

# Step 3: Preprocess the data
def preprocess(data):
    return tokenizer(
        data['src_text'],
        text_target=data['tgt_text'],
        max_length=512,
        padding="max_length",
        truncation=True
    )

tokenized_dataset = dataset.map(preprocess, batched=True)

# Step 4: Load model
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Step 5: Define training args
training_args = TrainingArguments(
    output_dir="./mbart-finetuned",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_dir="./logs",
    save_strategy="no",  # no intermediate checkpoints
    report_to="none"
)

# Step 6: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

# Step 7: Train
trainer.train()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.
Map: 100%|██████████| 8/8 [00:00<00:00, 57.75 examples/s]
  trainer = Trainer(


Step,Training Loss


TrainOutput(global_step=4, training_loss=2.76182222366333, metrics={'train_runtime': 474.9411, 'train_samples_per_second': 0.017, 'train_steps_per_second': 0.008, 'total_flos': 8668519071744.0, 'train_loss': 2.76182222366333, 'epoch': 1.0})

In [21]:
save_path = "C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/frenchtranslator/"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)



('C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/frenchtranslator/tokenizer_config.json',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/frenchtranslator/special_tokens_map.json',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/frenchtranslator/sentencepiece.bpe.model',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/frenchtranslator/added_tokens.json')

### Train and Save Spanish Translator model - mBART

In [22]:
# Step 1: Load data
df = pd.read_excel("C:/Users/Harish/Desktop/GUVI/FinalProject_2/Data/translated_insurance_policies_with_summaries.xlsx")
df = df[['Policy_Text_EN_Clean', 'Policy_Text_ES_mBART']]
df.columns = ['src_text', 'tgt_text']
dataset = Dataset.from_pandas(df)

# Step 2: Load model and tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBartTokenizer.from_pretrained(model_name)
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "es_XX"

# Step 3: Preprocess the data
def preprocess(data):
    return tokenizer(
        data['src_text'],
        text_target=data['tgt_text'],
        max_length=512,
        padding="max_length",
        truncation=True
    )

tokenized_dataset = dataset.map(preprocess, batched=True)

# Step 4: Load model
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Step 5: Define training args
training_args = TrainingArguments(
    output_dir="./mbart-finetuned",
    num_train_epochs=1,
    logging_dir="./logs",
    save_strategy="no",  # no intermediate checkpoints
    report_to="none"
)

# Step 6: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

# Step 7: Train
trainer.train()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.
Map: 100%|██████████| 8/8 [00:00<00:00, 38.08 examples/s]
  trainer = Trainer(


Step,Training Loss


TrainOutput(global_step=1, training_loss=5.286947250366211, metrics={'train_runtime': 338.9393, 'train_samples_per_second': 0.024, 'train_steps_per_second': 0.003, 'total_flos': 8668519071744.0, 'train_loss': 5.286947250366211, 'epoch': 1.0})

In [23]:
save_path = "C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/spanishtranslator/"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)



('C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/spanishtranslator/tokenizer_config.json',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/spanishtranslator/special_tokens_map.json',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/spanishtranslator/sentencepiece.bpe.model',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/spanishtranslator/added_tokens.json')

### Train and Save English Language Summarizer

In [25]:
df = pd.read_excel("C:/Users/Harish/Desktop/GUVI/FinalProject_2/Data/translated_insurance_policies_with_summaries.xlsx")
df.head(1)

Unnamed: 0,Policy_ID,Policy_Name,Policy_Text_EN,Policy_Text_EN_Clean,Policy_Text_FR_mBART,Policy_Text_ES_mBART,Policy_Summary_EN,Policy_Summary_FR,Policy_Summary_ES
0,P001,Bike Insurance,Bike insurance is the ultimate safety net for ...,bike insurance is the ultimate safety net for ...,L'assurance-bicyclette est le filet de sécurit...,la seguridad de la moto es la máxima seguridad...,Bike insurance provides coverage against natur...,L'assurance-bicyclette offre une protection co...,El seguro de bicicleta ofrece cobertura contra...


In [26]:
df = df[['Policy_Text_EN_Clean', 'Policy_Summary_EN']]
df.columns = ['src_text', 'tgt_text']
dataset = Dataset.from_pandas(df)

In [27]:
# Load the model
from transformers import BartForConditionalGeneration, BartTokenizer

model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

In [28]:
# Tokenize the data
def preprocess(data):
    return tokenizer(
        data['src_text'],
        text_target=data['tgt_text'],
        padding="max_length",
        truncation=True,
        max_length=512
    )

In [29]:
tokenized_dataset = dataset.map(preprocess, batched=True)

Map: 100%|██████████| 8/8 [00:00<00:00, 30.50 examples/s]


In [30]:
# Define training args
training_args = TrainingArguments(
    output_dir="./bart-summary-model",
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="no",
    report_to="none"
)

# Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Save model and tokenizer for Streamlit app
save_path = "C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/summarizer/"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

  trainer = Trainer(


Step,Training Loss




('C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/summarizer/tokenizer_config.json',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/summarizer/special_tokens_map.json',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/summarizer/vocab.json',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/summarizer/merges.txt',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/summarizer/added_tokens.json')

### Train and Save French Translator model - Helsinki-NLP

In [31]:
from transformers import MarianMTModel, MarianTokenizer

# Load model and tokenizer
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [32]:
from tqdm import tqdm
df = pd.read_excel("C:/Users/Harish/Desktop/GUVI/FinalProject_2/Data/translated_insurance_policies_with_summaries.xlsx")
df.head(1)

Unnamed: 0,Policy_ID,Policy_Name,Policy_Text_EN,Policy_Text_EN_Clean,Policy_Text_FR_mBART,Policy_Text_ES_mBART,Policy_Summary_EN,Policy_Summary_FR,Policy_Summary_ES
0,P001,Bike Insurance,Bike insurance is the ultimate safety net for ...,bike insurance is the ultimate safety net for ...,L'assurance-bicyclette est le filet de sécurit...,la seguridad de la moto es la máxima seguridad...,Bike insurance provides coverage against natur...,L'assurance-bicyclette offre une protection co...,El seguro de bicicleta ofrece cobertura contra...


In [33]:
# Use only the columns for training translation
df = df[["Policy_Text_EN_Clean", "Policy_Text_FR_mBART"]]
df = df.rename(columns={
    "Policy_Text_EN_Clean": "src_text",
    "Policy_Text_FR_mBART": "tgt_text"
})

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

In [34]:
# Preprocess function
def preprocess_function(examples):
    inputs = tokenizer(examples["src_text"], max_length=256, padding="max_length", truncation=True)
    with tokenizer.as_target_tokenizer():
        targets = tokenizer(examples["tgt_text"], max_length=256, padding="max_length", truncation=True)
    inputs["labels"] = targets["input_ids"]
    return inputs

# Tokenize the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 8/8 [00:00<00:00, 39.17 examples/s]


In [36]:
training_args = TrainingArguments(
    output_dir="./marian-en-fr-finetuned",
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

trainer.train()

  trainer = Trainer(


Step,Training Loss




TrainOutput(global_step=3, training_loss=0.8785576820373535, metrics={'train_runtime': 38.5505, 'train_samples_per_second': 0.623, 'train_steps_per_second': 0.078, 'total_flos': 1627121516544.0, 'train_loss': 0.8785576820373535, 'epoch': 3.0})

In [37]:
# Save model and tokenizer 
save_path = "C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/frenchtranslator/"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/frenchtranslator/tokenizer_config.json',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/frenchtranslator/special_tokens_map.json',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/frenchtranslator/vocab.json',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/frenchtranslator/source.spm',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/frenchtranslator/target.spm',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/frenchtranslator/added_tokens.json')

### Train and Save the Spanish Translator - Helsinki-NLP

In [42]:
df = pd.read_excel("C:/Users/Harish/Desktop/GUVI/FinalProject_2/Data/translated_insurance_policies_with_summaries.xlsx")
df.head(1)

Unnamed: 0,Policy_ID,Policy_Name,Policy_Text_EN,Policy_Text_EN_Clean,Policy_Text_FR_mBART,Policy_Text_ES_mBART,Policy_Summary_EN,Policy_Summary_FR,Policy_Summary_ES
0,P001,Bike Insurance,Bike insurance is the ultimate safety net for ...,bike insurance is the ultimate safety net for ...,L'assurance-bicyclette est le filet de sécurit...,la seguridad de la moto es la máxima seguridad...,Bike insurance provides coverage against natur...,L'assurance-bicyclette offre une protection co...,El seguro de bicicleta ofrece cobertura contra...


In [39]:
from transformers import MarianMTModel, MarianTokenizer

# Load model and tokenizer
model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [43]:
# Use only the columns for training translation
df = df[["Policy_Text_EN_Clean", "Policy_Text_ES_mBART"]]
df = df.rename(columns={
    "Policy_Text_EN_Clean": "src_text",
    "Policy_Text_ES_mBART": "tgt_text"
})

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Preprocess function
def preprocess_function(data):
    inputs = tokenizer(data["src_text"], max_length=256, padding="max_length", truncation=True)
    with tokenizer.as_target_tokenizer():
        targets = tokenizer(data["tgt_text"], max_length=256, padding="max_length", truncation=True)
    inputs["labels"] = targets["input_ids"]
    return inputs

# Tokenize the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir="./marian-en-es-finetuned",
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

trainer.train()

Map: 100%|██████████| 8/8 [00:00<00:00, 98.45 examples/s]
  trainer = Trainer(


Step,Training Loss




TrainOutput(global_step=3, training_loss=0.7281883557637533, metrics={'train_runtime': 35.1041, 'train_samples_per_second': 0.684, 'train_steps_per_second': 0.085, 'total_flos': 1627121516544.0, 'train_loss': 0.7281883557637533, 'epoch': 3.0})

In [44]:
# Save model and tokenizer 
save_path = "C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/spanishtranslator/"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/spanishtranslator/tokenizer_config.json',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/spanishtranslator/special_tokens_map.json',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/spanishtranslator/vocab.json',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/spanishtranslator/source.spm',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/spanishtranslator/target.spm',
 'C:/Users/Harish/Desktop/GUVI/FinalProject_2/artifacts/spanishtranslator/added_tokens.json')