In [24]:
!pip install transformers



In [28]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/992.0 kB ? eta -:--:--
   ---------------------------------------- 992.0/992.0 kB 5.8 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0


In [1]:
import pandas as pd
import numpy as numpy

In [2]:
# Load dataset
df = pd.read_csv(r"data\data_trustpilot.csv") 
df.head()

Unnamed: 0,rating,location,username,number_reviews,verification,repeat_reviewer,repeat_reviewer_encoded,company,text,text_processed,...,date_posted,local_date_posted,month_local,local_hour,time_of_day,day_of_week_posted,day_type,days_between_experience_and_post,review_time,review_time_encoded
0,5,CA,Rob Crane,2,Redirected,repeat,1,Flashbay,The company rep I worked with made my transact...,company rep worked made transaction smooth qui...,...,2024-10-23 04:17:44,2024-10-22,10,21,Evening,1,Business Day,129,late_review,0
1,5,US,Pat Anderson,1,Verified,one-time,0,Flashbay,I highly recommend using Flashbay. Immediately...,highly recommend using flashbay immediately or...,...,2024-10-16 19:34:05,2024-10-16,10,12,Business Hours,2,Business Day,0,quick_review,1
2,5,CZ,Margarita Orlova,1,Verified,one-time,0,Flashbay,I had the pleasure of working with Shelby Gibs...,pleasure working shelby gibson large order nee...,...,2024-10-17 10:27:44,2024-10-17,10,10,Business Hours,3,Business Day,7,late_review,0
3,5,US,Paola Rivas,1,Verified,one-time,0,Flashbay,I had a fantastic experience with Brian Truong...,fantastic experience brian truong attentive tr...,...,2024-10-21 22:38:50,2024-10-21,10,15,Business Hours,0,Business Day,0,quick_review,1
4,5,CA,Fiona Mckelvey Keenan,3,Not Verified,repeat,1,Flashbay,My number-one go-to for computer accessories. ...,numberone goto computer accessories rachel sup...,...,2024-10-23 04:09:05,2024-10-22,10,21,Evening,1,Business Day,103,late_review,0


In [3]:
df = df[["rating", "text", "text_processed"]]
df.head()

Unnamed: 0,rating,text,text_processed
0,5,The company rep I worked with made my transact...,company rep worked made transaction smooth qui...
1,5,I highly recommend using Flashbay. Immediately...,highly recommend using flashbay immediately or...
2,5,I had the pleasure of working with Shelby Gibs...,pleasure working shelby gibson large order nee...
3,5,I had a fantastic experience with Brian Truong...,fantastic experience brian truong attentive tr...
4,5,My number-one go-to for computer accessories. ...,numberone goto computer accessories rachel sup...


In [4]:
df["rating"].value_counts()

rating
5    31372
1    16152
4     6290
3     6272
2     4324
Name: count, dtype: int64

In [4]:
# Separate the classes based on ratings
rating_1 = df[df['rating'] == 1]
rating_2 = df[df['rating'] == 2]
rating_3 = df[df['rating'] == 3]
rating_4 = df[df['rating'] == 4]
rating_5 = df[df['rating'] == 5]

# Downsample by factor 1000
rating_1_downsampled = rating_1.sample(n=32, random_state=42)
rating_2_downsampled = rating_2.sample(n=4, random_state=42)
rating_3_downsampled = rating_3.sample(n=6, random_state=42)
rating_4_downsampled = rating_4.sample(n=6, random_state=42)
rating_5_downsampled = rating_5.sample(n=16, random_state=42)

# Combine back the dataset
df_mini = pd.concat([rating_1_downsampled, rating_2_downsampled, rating_3_downsampled, rating_4_downsampled, rating_5_downsampled])

# Shuffle the dataset
df_mini = df_mini.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the new class distribution
print(df_mini['rating'].value_counts())

rating
1    32
5    16
4     6
3     6
2     4
Name: count, dtype: int64


In [None]:
# random RandomOverSampler

from imblearn.over_sampling import RandomOverSampler

# Separate the features and target
X = df_mini[['text', 'text_processed']]  
y = df_mini['rating']  

# RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Combine back into a DataFrame
df_balanced = pd.concat([X_resampled, y_resampled], axis=1)

In [12]:
print(df_balanced['rating'].value_counts())

rating
5    32
1    32
4    32
3    32
2    32
Name: count, dtype: int64


In [14]:
df_balanced[df_balanced['rating'] == 2].head(20)

Unnamed: 0,text,text_processed,rating
14,Still continue to have issues with syncing.,still continue issues syncing,2
21,"Support was very kind, but steel series polici...",support kind steel series policies replacement...,2
38,I’m tired of Quicken going up every year. I’ve...,im tired quicken going every year ive using pr...,2
50,I accidentally ordered the wrong item. I reali...,accidentally ordered wrong item realized immed...,2
64,I’m tired of Quicken going up every year. I’ve...,im tired quicken going every year ive using pr...,2
65,I accidentally ordered the wrong item. I reali...,accidentally ordered wrong item realized immed...,2
66,Still continue to have issues with syncing.,still continue issues syncing,2
67,I’m tired of Quicken going up every year. I’ve...,im tired quicken going every year ive using pr...,2
68,I’m tired of Quicken going up every year. I’ve...,im tired quicken going every year ive using pr...,2
69,I accidentally ordered the wrong item. I reali...,accidentally ordered wrong item realized immed...,2


In [7]:
# manual oversampling with lood

majority_size = df_mini['rating'].value_counts().max()

# Separate by class
balanced_df = pd.DataFrame()  
for rating in df_mini['rating'].unique():
    class_df = df_mini[df_mini['rating'] == rating]
    upsampled_class = class_df.sample(n=majority_size, replace=True, random_state=42)
    balanced_df = pd.concat([balanced_df, upsampled_class])

balanced_df = balanced_df.sample(frac=1).reset_index(drop=True) 
print(df_balanced['rating'].value_counts())

NameError: name 'df_balanced' is not defined

In [16]:
df_balanced[df_balanced['rating'] == 2].head(20)

Unnamed: 0,text,text_processed,rating
14,Still continue to have issues with syncing.,still continue issues syncing,2
21,"Support was very kind, but steel series polici...",support kind steel series policies replacement...,2
38,I’m tired of Quicken going up every year. I’ve...,im tired quicken going every year ive using pr...,2
50,I accidentally ordered the wrong item. I reali...,accidentally ordered wrong item realized immed...,2
64,I’m tired of Quicken going up every year. I’ve...,im tired quicken going every year ive using pr...,2
65,I accidentally ordered the wrong item. I reali...,accidentally ordered wrong item realized immed...,2
66,Still continue to have issues with syncing.,still continue issues syncing,2
67,I’m tired of Quicken going up every year. I’ve...,im tired quicken going every year ive using pr...,2
68,I’m tired of Quicken going up every year. I’ve...,im tired quicken going every year ive using pr...,2
69,I accidentally ordered the wrong item. I reali...,accidentally ordered wrong item realized immed...,2


In [9]:
#combining with synonyme
from nltk.corpus import wordnet
import random

import nltk
nltk.download('wordnet')

def augment_text(text):
    words = text.split()
    new_words = []
    for word in words:
        synonyms = wordnet.synsets(word)
        if synonyms and random.random() > 0.7:  # 30% chance of replacement
            new_word = synonyms[0].lemmas()[0].name()
            new_words.append(new_word)
        else:
            new_words.append(word)
    return ' '.join(new_words)

# Augment minority classes
balanced_df = pd.DataFrame()
for rating in df_mini['rating'].unique():
    class_df = df_mini[df_mini['rating'] == rating]
    majority_size = df_mini['rating'].value_counts().max()
    augmented = class_df.copy()
    augmented['text'] = augmented['text'].apply(augment_text)
    upsampled_class = pd.concat([class_df, augmented]).sample(n=majority_size, replace=True)
    balanced_df = pd.concat([balanced_df, upsampled_class])
    
print(balanced_df['rating'].value_counts())

rating
5    32
1    32
4    32
3    32
2    32
Name: count, dtype: int64


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
balanced_df[balanced_df['rating'] == 2].head(30)

Unnamed: 0,rating,text,text_processed
38,2,I’m tired of Quicken going up every year. I’ve...,im tired quicken going every year ive using pr...
14,2,Still continue to have issues with syncing.,still continue issues syncing
14,2,Still continue to have issues with syncing.,still continue issues syncing
38,2,I’m tired of Quicken going up every year. I’ve...,im tired quicken going every year ive using pr...
50,2,I accidentally ordered the wrong item. I reali...,accidentally ordered wrong item realized immed...
50,2,I accidentally ordered the wrong item. I reali...,accidentally ordered wrong item realized immed...
14,2,Still continue to have issues with syncing.,still continue issues syncing
21,2,"Support was very kind, but steel series polici...",support kind steel series policies replacement...
14,2,Still continue to have issues with syncing.,still continue issues syncing
14,2,Still continue to have issues with syncing.,still continue issues syncing


In [11]:
# paraphrasing with T5

from transformers import T5ForConditionalGeneration, T5Tokenizer
import sentencepiece

# Load the T5 model and tokenizer
model_name = "t5-small"  
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def paraphrase_text(text, max_length=128, num_return_sequences=1):
    # Prepare the text for paraphrasing
    input_text = f"paraphrase: {text} </s>"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)

    # Generate paraphrased output
    outputs = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        do_sample=True,
        top_k=120,
        top_p=0.95,
        early_stopping=True,
    )

    # Decode the generated texts
    paraphrased_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return paraphrased_texts

df_mini['Paraphrased_Text'] = df_mini['text'].apply(paraphrase_text)
df_mini[df_mini['rating'] == 2].head(30)


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Unnamed: 0,rating,text,text_processed,Paraphrased_Text
14,2,Still continue to have issues with syncing.,still continue issues syncing,"[Paraphrase: I hear a few minutes later, but a..."
21,2,"Support was very kind, but steel series polici...",support kind steel series policies replacement...,"[: Support was very kind, but supporting was v..."
38,2,I’m tired of Quicken going up every year. I’ve...,im tired quicken going every year ive using pr...,[phrase: I’m tired of Quicken going up every y...
50,2,I accidentally ordered the wrong item. I reali...,accidentally ordered wrong item realized immed...,[phrase: I accidentally ordered the wrong item...


In [12]:
df_mini_2 = df_mini[df_mini['rating'] == 2]
df_mini_2.head()
if 'Paraphrased_Text' in df_mini_2.columns:
    for index, text in df_mini_2['text'].items():
        print("Original Text: ", text)
        for i, paraphrase in enumerate(df_mini_2.at[index, 'Paraphrased_Text'], 1):
                print("Paraphrased Text: ", paraphrase)
                print("\n")
        
 
          

Original Text:  Still continue to have issues with syncing.
Paraphrased Text:  Paraphrase: I hear a few minutes later, but a loud, simple, rhythmic voice could change.


Original Text:  Support was very kind, but steel series policies for replacement are harsh and ecologically unacceptable: either you have to destroy the device yourself or SteelSeries will destroy it - no repairs. Unless SteelSeries changes to become more sustainable and environmentally friendly or will be forced to change by an EU-law which will hopefully come soon, SteelSeries lost at least one customer. I wish I researched about those replacement policies sooner - there are a lot of confused customers with malfunctioning products they had to destroy themselves.
Paraphrased Text:  : Support was very kind, but supporting was very kind, but steel series policies are harsh and ecologically unacceptable. either you have to destroy the device yourself or SteelSeries will destroy it - no repairs.


Original Text:  I’m tire

In [None]:
#paraphrasing with pegasus
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import sentencepiece

# Load the Pegasus model and tokenizer
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

def paraphrase_text_pegasus(text, max_length=500, num_return_sequences=1):
    # Tokenize input
    input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True)

    # Generate paraphrased output
    outputs = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        do_sample=True,
        top_k=120,
        top_p=0.95,
        early_stopping=True,
    )

    # Decode the generated texts
    paraphrased_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return paraphrased_texts

df_mini['Paraphrased_Text_P'] = df_mini['text'].apply(paraphrase_text_pegasus)
df_mini[df_mini['rating'] == 2].head()



  from .autonotebook import tqdm as notebook_tqdm


In [8]:
df_mini_3 = df_mini[df_mini['rating'] == 2]
df_mini_3.head()
if 'Paraphrased_Text_P' in df_mini_3.columns:
    for index, text in df_mini_3['text'].items():
        print("Original Text: ", text)
        for i, paraphrase in enumerate(df_mini_3.at[index, 'Paraphrased_Text_P'], 1):
                print("Paraphrased Text: ", paraphrase)
                print("\n")

Original Text:  Still continue to have issues with syncing.
Paraphrased Text:  I've been working on a solution to make it easier to sync files between Mac and iOS devices.


Original Text:  Support was very kind, but steel series policies for replacement are harsh and ecologically unacceptable: either you have to destroy the device yourself or SteelSeries will destroy it - no repairs. Unless SteelSeries changes to become more sustainable and environmentally friendly or will be forced to change by an EU-law which will hopefully come soon, SteelSeries lost at least one customer. I wish I researched about those replacement policies sooner - there are a lot of confused customers with malfunctioning products they had to destroy themselves.
Paraphrased Text:  A few months ago I had to destroy a mobile phone I had bought from SteelSeries because it had a faulty battery.


Original Text:  I’m tired of Quicken going up every year. I’ve been using this product for probably 30 years. I only use i

In [13]:
!pip install openai


Collecting openai
  Downloading openai-1.55.1-py3-none-any.whl.metadata (24 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.8.0-cp312-none-win_amd64.whl.metadata (5.3 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.10.2-py3-none-any.whl.metadata (170 kB)
Collecting annotated-types>=0.6.0 (from pydantic<3,>=1.9.0->openai)
  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.27.1 (from pydantic<3,>=1.9.0->openai)
  Downloading pydantic_core-2.27.1-cp312-none-win_amd64.whl.metadata (6.7 kB)
Collecting typing-extensions<5,>=4.11 (from openai)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Downloading openai-1.55.1-py3-none-any.whl (389 kB)
Downloading distro-1.9.0-py3-none-any.whl (20 kB)
Downloading jiter-0.8.0-cp312-none-win_amd64.whl (206 kB)
Downloading pydantic-2.10.2-

In [None]:
import openai

openai.api_kye = xxx

def generate_paraphrases(text):
    
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that creates three unique paraphrases of a given text."},
                {"role": "user", "content": f"Please create 3 paraphrases of the following text: \"{text}\""}
            ]
        )
        return response['choices'][0]['message']['content']
    except Exception as e:
        return f"Error: {e}"

# Iterate through the dataframe and generate paraphrases for each text
for index, row in df_mini.iterrows():
    original_text = row['text']
    paraphrased_text = generate_paraphrases(original_text)
    
    print(f"Original Text: {original_text}")
    print(f"Paraphrased Texts:\n{paraphrased_text}")
    
    


In [None]:
# example output Chat GPT:
# Original Text: Still continue to have issues with syncing.
# Paraphrased Texts: 
# Still experiencing problems with synchronization.
# Continuing to face syncing issues.
# Ongoing issues with syncing persist.