In [1]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from nltk.translate.meteor_score import meteor_score
import nltk
from nltk.tokenize import word_tokenize
from rouge_score import rouge_scorer

# Ensure the required NLTK data is available before any scoring happens.
#   punkt / punkt_tab: tokenizer models loaded by word_tokenize (newer NLTK
#                      releases look up 'punkt_tab' rather than 'punkt').
#   wordnet:           corpus meteor_score uses for synonym matching; when it
#                      is missing the scorer raises LookupError at call time.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

def calculate_meteor(reference, candidate):
    """Return the METEOR score of ``candidate`` against one ``reference``.

    Both strings are lower-cased and split with ``word_tokenize`` before
    being handed to ``meteor_score``; the reference is wrapped in a list
    because the scorer takes a list of references.
    """
    reference_tokens, candidate_tokens = (
        word_tokenize(text.lower()) for text in (reference, candidate)
    )
    references = [reference_tokens]
    return meteor_score(references, candidate_tokens)

def filter_data_by_meteor(df, threshold=0.5, score_fn=None):
    """Keep the rows whose back-translations stay close to the originals.

    For every row, ``original_positive`` is scored against
    ``back_translate_positive`` and ``original_negative`` against
    ``back_translate_negative``. A row is kept only when both scores are at
    least ``threshold``.

    Rows with a missing value in any of the four text columns are dropped,
    since there is no text to validate. Rows whose scoring raises are
    reported on stdout and dropped as well.

    Args:
        df: DataFrame containing the four text columns named above.
        threshold: Minimum score (inclusive) that both pairs must reach.
        score_fn: Optional ``callable(reference, candidate) -> float`` used
            to score a pair of strings. Defaults to ``calculate_meteor``.

    Returns:
        The kept rows of ``df`` with their original index labels, columns
        and dtypes. The columns are preserved even when no row is kept.
    """
    if score_fn is None:
        score_fn = calculate_meteor

    column_pairs = (
        ('original_positive', 'back_translate_positive'),
        ('original_negative', 'back_translate_negative'),
    )

    kept_positions = []
    for position, (index, row) in enumerate(df.iterrows()):
        try:
            keep = True
            for original_col, back_col in column_pairs:
                original, back_translated = row[original_col], row[back_col]
                # str(NaN) is the literal word "nan"; scoring it would treat
                # a missing text as if it were a real sentence.
                if pd.isna(original) or pd.isna(back_translated):
                    keep = False
                    break
                score = score_fn(str(original), str(back_translated))
                # Written as "not >=" so a NaN score also rejects the row.
                if not score >= threshold:
                    keep = False
                    break
        except Exception as e:
            # Best-effort: one bad row must not abort the whole pass.
            print(f"Error processing row {index}: {e}")
            continue
        if keep:
            kept_positions.append(position)

    # Positional selection keeps columns/dtypes intact (also for an empty
    # result) and is safe when the index contains duplicate labels.
    return df.iloc[kept_positions]

def calculate_average_rouge(df, pos_col, neg_col):
    """Average ROUGE-1/2/L F-measures between two text columns of ``df``.

    Each row's ``pos_col`` text is passed to the scorer as its first
    argument and the ``neg_col`` text as its second; both are stringified
    and lower-cased first. Stemming is enabled in the scorer.

    Returns:
        ``(avg_rouge1, avg_rouge2, avg_rougeL, row_count)``. All three
        averages are ``0`` when ``df`` has no rows.
    """
    metrics = ('rouge1', 'rouge2', 'rougeL')
    scorer = rouge_scorer.RougeScorer(list(metrics), use_stemmer=True)
    row_count = len(df)

    # Running F-measure sums, one per metric.
    sums = dict.fromkeys(metrics, 0)
    for _, record in df.iterrows():
        first_text = str(record[pos_col]).lower()
        second_text = str(record[neg_col]).lower()
        result = scorer.score(first_text, second_text)
        for metric in metrics:
            sums[metric] += result[metric].fmeasure

    if row_count == 0:
        # Nothing to average; avoid dividing by zero.
        return 0, 0, 0, row_count

    return (
        sums['rouge1'] / row_count,
        sums['rouge2'] / row_count,
        sums['rougeL'] / row_count,
        row_count,
    )

# Read the four source CSVs (two Amazon files, two Yelp files).
df_amazon_0, df_amazon_1, df_yelp_0, df_yelp_1 = map(
    pd.read_csv,
    (
        './dataset/amazon-0.csv',
        './dataset/amazon-1.csv',
        './dataset/yelp-0.csv',
        './dataset/yelp-1.csv',
    ),
)

# Only the Amazon frames are passed through the METEOR back-translation
# filter; both must clear the same threshold.
threshold = 0.65
filtered_amazon_0, filtered_amazon_1 = (
    filter_data_by_meteor(frame, threshold)
    for frame in (df_amazon_0, df_amazon_1)
)

# Report how many Amazon rows survived the filter.
for label, kept, source in (
    ('amazon-0', filtered_amazon_0, df_amazon_0),
    ('amazon-1', filtered_amazon_1, df_amazon_1),
):
    print(f"With threshold {threshold}, {len(kept)} out of {len(source)} rows remaining from {label}")

# Average ROUGE per dataset. The "-1" files are scored with the positive
# column first and the negative column second; the "-0" files use the
# opposite column order.
amazon_1_stats = calculate_average_rouge(filtered_amazon_1, 'positive_sentiment', 'negative_sentiment')
yelp_1_stats = calculate_average_rouge(df_yelp_1, 'positive_sentiment', 'negative_sentiment')
amazon_0_stats = calculate_average_rouge(filtered_amazon_0, 'negative_sentiment', 'positive_sentiment')
yelp_0_stats = calculate_average_rouge(df_yelp_0, 'negative_sentiment', 'positive_sentiment')

# Unpack into the per-dataset names so they stay available for inspection,
# e.g. print(f"Average ROUGE-1: {avg_amazon_1_rouge1}").
avg_amazon_1_rouge1, avg_amazon_1_rouge2, avg_amazon_1_rougeL, amazon_1_count = amazon_1_stats
avg_yelp_1_rouge1, avg_yelp_1_rouge2, avg_yelp_1_rougeL, yelp_1_count = yelp_1_stats
avg_amazon_0_rouge1, avg_amazon_0_rouge2, avg_amazon_0_rougeL, amazon_0_count = amazon_0_stats
avg_yelp_0_rouge1, avg_yelp_0_rouge2, avg_yelp_0_rougeL, yelp_0_count = yelp_0_stats

# Overall averages, weighting each dataset's average by its row count.
rouge_by_dataset = (amazon_1_stats, yelp_1_stats, amazon_0_stats, yelp_0_stats)
total_rows = sum(stats[3] for stats in rouge_by_dataset)

pooled_averages = []
for metric_position in range(3):  # 0: ROUGE-1, 1: ROUGE-2, 2: ROUGE-L
    weighted_sum = None
    for stats in rouge_by_dataset:
        term = stats[metric_position] * stats[3]
        weighted_sum = term if weighted_sum is None else weighted_sum + term
    pooled_averages.append(weighted_sum / total_rows)

total_rouge1, total_rouge2, total_rougeL = pooled_averages

# Print the pooled results.
print("Overall Changes:")
for metric_label, metric_value in (
    ("ROUGE-1", total_rouge1),
    ("ROUGE-2", total_rouge2),
    ("ROUGE-L", total_rougeL),
):
    print(f"Average {metric_label}: {metric_value}")

# Stack the filtered Amazon rows with the Yelp rows, shuffle them with a
# fixed seed and renumber the index from zero.
combined_df = (
    pd.concat([filtered_amazon_0, filtered_amazon_1, df_yelp_0, df_yelp_1])
    .sample(frac=1, random_state=127)
    .reset_index(drop=True)
)

# Three chained seeded splits. As fractions of the combined data they give
# roughly 65% fine-tuning train, 10% fine-tuning test, 15% RL data and
# 10% RL evaluation.
ft_train_data, temp_data = train_test_split(
    combined_df, random_state=127, test_size=0.35
)
ft_test_data, temp_data = train_test_split(
    temp_data, random_state=127, test_size=0.7142
)
rl_data, rl_eval_data = train_test_split(
    temp_data, random_state=127, test_size=0.4
)

# Report split sizes. The count is doubled, presumably because every source
# row later becomes two prompt/response records (one per direction).
for split_name, split_frame in (
    ('ft_train_data', ft_train_data),
    ('ft_test_data', ft_test_data),
    ('rl_data', rl_data),
    ('rl_eval_data', rl_eval_data),
):
    print(f"{split_name}: {len(split_frame)*2} rows")

# Define function to create data pairs
def create_data_pairs(df):
    """Build prompt/response training records from sentiment pairs.

    Every row of ``df`` yields two ``{'text': ...}`` dicts, in this order:
    negative sentence as input with the positive sentence as response, then
    positive sentence as input with the negative sentence as response.

    Args:
        df: DataFrame with ``negative_sentiment`` and ``positive_sentiment``
            columns.

    Returns:
        A list with two records per row, in row order.
    """
    template = (
        '### USER:\n'
        'Ubah sentimen dari kalimat awal berikut menjadi sentimen sebaliknya\n'
        'Kalimat Awal: {}\n'
        'Kalimat Baru: \n\n'
        '### RESPONSE:\n'
        '{}<|endoftext|>'
    )

    records = []
    for _, record in df.iterrows():
        negative = record['negative_sentiment']
        positive = record['positive_sentiment']
        records.extend((
            {'text': template.format(negative, positive)},
            {'text': template.format(positive, negative)},
        ))
    return records

# Create and save the prompt/response records for each split.
# The encoding is set explicitly: ensure_ascii=False writes non-ASCII
# characters as-is, so relying on the platform default encoding (e.g. a
# Windows code page) can raise UnicodeEncodeError or yield non-UTF-8 JSON.
for output_path, split_frame in (
    ('./json/ft_train.json', ft_train_data),
    ('./json/ft_test.json', ft_test_data),
    ('./json/rl_data.json', rl_data),
    ('./json/rl_eval.json', rl_eval_data),
):
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(create_data_pairs(split_frame), f, ensure_ascii=False, indent=4)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adzka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


With threshold 0.65, 306 out of 500 rows remaining from amazon-0
With threshold 0.65, 332 out of 500 rows remaining from amazon-1
Overall Changes:
Average ROUGE-1: 0.5974905369281439
Average ROUGE-2: 0.3955380846034492
Average ROUGE-L: 0.5856648696924207
ft_train_data: 2128 rows
ft_test_data: 328 rows
rl_data: 492 rows
rl_eval_data: 328 rows
