In [None]:
import pandas as pd
import time
import matplotlib.pyplot as plt
from scipy import stats
from nltk.metrics.scores import f_measure
import numpy as np
import torchmetrics as torchm
import nltk 
from sklearn import linear_model
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Shared scoring objects: these names are used by the scoring loop and by sets() in the cells below
wer = torchm.WordErrorRate()  # Word error rate metric object; called later with target=/preds= text
cer = torchm.CharErrorRate()  # Character error rate metric object; called later with target=/preds= text
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')  # Regexp tokenizer whose tokens are the matches of \w+ (runs of word characters)

# Vocabulary extraction: distinct lowercase word tokens of a reference and a prediction
def sets(refs, preds):
    """Return the sets of distinct lowercase tokens found in two strings.

    Each string is lowercased and split with the notebook-level ``tokenizer``;
    repeated tokens collapse because the results are collected into sets.

    Args:
        refs: Reference text (a string).
        preds: Prediction text (a string).

    Returns:
        A ``(refs_set, preds_set)`` tuple holding the unique tokens of the
        reference and of the prediction, in that order.
    """
    # The reference is tokenized first, then the prediction
    refs_set = set(tokenizer.tokenize(refs.lower()))
    preds_set = set(tokenizer.tokenize(preds.lower()))
    return refs_set, preds_set


In [None]:
# Ask the user for the location of the training file, then parse it as a
# tab-separated table.
training_path = input('Introduce the translation training data: ')
df = pd.read_csv(training_path, sep='\t')

# Keep only the columns used downstream, in this order: the text pair
# ('source', 'translation') and the two per-evaluator score columns.
selected_columns = [
    'source',
    'translation',
    'evaluator1 scores',
    'evaluator2 scores',
]
df = df[selected_columns]

In [None]:
# Per-row metric scores, expressed as percentages and appended in the row order of df
wer_list, cer_list, f_list = [], [], []

# Number of source/translation pairs processed so far
n = 0

# Score every (source, translation) pair of the training frame
for src, tgt in zip(df['source'], df['translation']):
    # perf_counter is a monotonic clock, so the elapsed time printed below
    # cannot be distorted by system clock adjustments
    start = time.perf_counter()

    # Progress output: character lengths of the two texts
    print(len(src), len(tgt))

    # Word error rate with the source as target and the translation as prediction
    wer_score = wer(target=src, preds=tgt).item() * 100
    wer_list.append(wer_score)

    # Character error rate over the same pair
    cer_score = cer(target=src, preds=tgt).item() * 100
    cer_list.append(cer_score)

    # F-measure over the sets of unique lowercase tokens of both texts
    r_s, p_s = sets(src, tgt)
    raw_f = f_measure(r_s, p_s)
    # nltk's f_measure returns None when either set is empty (for example a
    # text made only of punctuation); multiplying None by 100 would raise a
    # TypeError, so such pairs are scored as 0.0 (no measurable overlap)
    f_score = 0.0 if raw_f is None else raw_f * 100
    f_list.append(f_score)

    n += 1

    # Seconds spent on this pair
    print(time.perf_counter() - start)


In [None]:
# Attach the three per-row metric lists to the frame as new columns
# (assigned left to right: 'wer', then 'cer', then 'f_score')
df['wer'], df['cer'], df['f_score'] = wer_list, cer_list, f_list

In [None]:
def _translation_features(frame):
    """Compute the 'wer', 'cer' and 'f_score' features for every row of ``frame``.

    Uses the notebook-level ``wer``, ``cer``, ``sets`` and ``f_measure`` names,
    with the same target/preds roles and percentage scaling as the training
    scoring loop.

    Args:
        frame: DataFrame with 'source' and 'translation' text columns.

    Returns:
        DataFrame indexed like ``frame`` with columns 'wer', 'cer', 'f_score'.
    """
    rows = []
    for src, tgt in zip(frame['source'], frame['translation']):
        r_s, p_s = sets(src, tgt)
        raw_f = f_measure(r_s, p_s)
        rows.append({
            'wer': wer(target=src, preds=tgt).item() * 100,
            'cer': cer(target=src, preds=tgt).item() * 100,
            # f_measure returns None when either token set is empty; score it as 0.0
            'f_score': 0.0 if raw_f is None else raw_f * 100,
        })
    return pd.DataFrame(rows, columns=['wer', 'cer', 'f_score'], index=frame.index)


def _evaluator_target(frame):
    """Return the human score series used as regression target.

    A ready-made 'evaluator scores' column is used when present. Otherwise the
    two per-evaluator columns are averaged row-wise.
    NOTE(review): averaging the two evaluators is an assumption about the
    intended target - confirm against the experiment design.
    """
    if 'evaluator scores' in frame.columns:
        return frame['evaluator scores']
    return frame[['evaluator1 scores', 'evaluator2 scores']].mean(axis=1)


# Training features and target. The training frame only carries
# 'evaluator1 scores' and 'evaluator2 scores', so indexing a non-existent
# 'evaluator scores' column directly would raise a KeyError.
x = df[['wer', 'cer', 'f_score']]
y = _evaluator_target(df)

# Keep the fitted feature expander so the test data goes through exactly the
# same degree-2 expansion (no bias column) as the training data
poly = PolynomialFeatures(degree=2, include_bias=False)
x_ = poly.fit_transform(x)

# Ordinary least squares on the expanded features
regr = linear_model.LinearRegression()
regr.fit(x_, y)

# predict() needs a feature matrix, not a file path: load the test file and
# compute its features first.
# NOTE(review): assumes the test file has the same tab-separated layout and
# column names as the training file - confirm.
test_df = pd.read_csv(input('Introduce the translation test data: '), sep='\t')
x_test = _translation_features(test_df)
predictions = regr.predict(poly.transform(x_test))

# Correlate the predictions with the human scores of the same (test) rows;
# comparing against the training scores would mix unrelated samples
pearson_corr = stats.pearsonr(_evaluator_target(test_df), predictions)[0]

print('Pearson corr: ', pearson_corr)
