# 1. Read Dataset

In [None]:
import pandas as pd
import os

def process_row(row):
    """
    Collapse an over-long CSV row down to exactly four fields.

    Rows with more than 4 elements keep their first two elements and their last element; everything in between is
    joined with a single space into one field. Shorter rows are handed back untouched.

    Parameters:
    - row (list of str): The fields of one CSV line.

    Returns:
    - list of str: A new 4-element list [first, second, merged middle, last] when the row had more than 4 elements;
      otherwise the very same `row` object that was passed in.
    """
    # Guard clause: nothing to merge when the row already fits.
    if len(row) <= 4:
        return row

    leading = row[:2]
    trailing = row[-1]
    # Everything between the second and the last field becomes one space-separated string.
    collapsed = ' '.join(row[2:-1])
    return [*leading, collapsed, trailing]

def create_dataframe_from_csv(file_path):
    """
    Reads a CSV file from the given file path and creates a pandas DataFrame. Each line is split on commas, every
    field is stripped of surrounding whitespace and double quotes, and the row is passed through `process_row` so
    that rows with more than 4 elements have their middle elements merged.

    Blank (whitespace-only) lines are skipped. If the file contains no non-blank lines at all, an empty DataFrame
    is returned instead of raising IndexError.

    Parameters:
    - file_path (str): The path to the CSV file to be read.

    Returns:
    - pd.DataFrame: A pandas DataFrame created from the processed CSV data. The first non-blank row of the CSV is
      used as column headers. Empty when the file has no non-blank lines.
    """
    data = []
    # 'utf-8-sig' decodes plain UTF-8 identically but drops a leading BOM, which would otherwise
    # end up glued to the first header name.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            if not line.strip():
                # A blank line would otherwise become a bogus one-field row.
                continue
            row = [item.strip().strip('"') for item in line.split(',')]
            data.append(process_row(row))

    if not data:
        # Nothing to use as a header row.
        return pd.DataFrame()

    headers, *df_data = data
    return pd.DataFrame(df_data, columns=headers)

def preprocess_dataframe(df):
    """
    Preprocesses the given DataFrame by dropping the 'id' column and removing duplicates based on 'app' and 'content' columns.

    The input DataFrame is left untouched; a new DataFrame is returned. For each duplicated ('app', 'content')
    pair the first occurrence is kept, and surviving rows keep their original index labels.

    Parameters:
    - df (pd.DataFrame): The input DataFrame to preprocess. Must contain 'id', 'app' and 'content' columns.

    Returns:
    - pd.DataFrame: The preprocessed DataFrame, with the 'id' column dropped and duplicates removed.

    Raises:
    - KeyError: If 'id', 'app' or 'content' is missing from `df`.
    """
    # Non-inplace calls so the caller's frame is not modified as a side effect.
    without_id = df.drop(columns=['id'])
    return without_id.drop_duplicates(subset=['app', 'content'])


# Locations of the two raw Google Play review exports
base_dir = '../dataset/phase 1/'
file_path_1 = os.path.join(base_dir, 'google_app_reviews_1.csv')
file_path_2 = os.path.join(base_dir, 'google_app_reviews_2.csv')

# Load each export and clean it (drop 'id', de-duplicate) in one step
df_1, df_2 = (
    preprocess_dataframe(create_dataframe_from_csv(csv_path))
    for csv_path in (file_path_1, file_path_2)
)

# Stack both cleaned frames under a fresh 0..n-1 index
df = pd.concat([df_1, df_2], ignore_index=True)


# 2. User Review Translation
## 2.1. Google Play Store

In [74]:
import pandas as pd
import time
import warnings
from transformers import MarianMTModel, MarianTokenizer

def _translate_text(text, tokenizer, model):
    """
    Translates a single string with an already-loaded MarianMT tokenizer/model pair.

    Input is truncated to 512 tokens. Any exception raised while tokenizing, generating or decoding is printed and
    the literal string 'Translation failed' is returned, so one bad review does not abort the whole run.
    """
    try:
        tokens = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        translated = model.generate(**tokens)
        return tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error in translation: {e}")
        return 'Translation failed'


def translate_content_and_save(df, app_names, model_name, output_path, batch_size=10):
    """
    Translates the content of specific rows in a DataFrame where the 'app' column matches any of the given app_names,
    using the specified MarianMT model. It then saves the updated DataFrame to a CSV file at the specified output_path.

    Parameters:
    - df (pd.DataFrame): The input DataFrame containing the data to be translated. Needs 'app' and 'content' columns.
    - app_names (list of str): The names of the apps for which the 'content' column will be translated. A single
      string is treated as a one-element list; repeated names are processed once.
    - model_name (str): The MarianMT model name to be used for translation.
    - output_path (str): The file path where the updated DataFrame should be saved as a CSV. Missing parent
      directories are created.
    - batch_size (int, optional): The number of sentences to translate before printing progress. Defaults to 10.

    Returns:
    - pd.DataFrame: The updated DataFrame: all rows of apps not in app_names first, followed by the translated rows
      of each requested app, under a fresh 0..n-1 index. The same frame is written to output_path.

    Raises:
    - Whatever `from_pretrained` raises if the model or tokenizer cannot be loaded. Loading happens once, up front,
      so a bad model_name fails immediately instead of replacing every review with 'Translation failed'.

    The tokenizer and model are loaded at most once per call, and only if at least one row matches app_names.
    A review whose translation raises is replaced by the string 'Translation failed'.
    Note: this call installs a process-wide `warnings.filterwarnings('ignore')`.
    """
    import os  # local import so this cell does not depend on an earlier cell's imports

    warnings.filterwarnings('ignore')

    # Normalise app_names: a bare string would otherwise be iterated character by character,
    # and a repeated name would append the same app's rows twice.
    if isinstance(app_names, str):
        app_names = [app_names]
    app_names = list(dict.fromkeys(app_names))

    tokenizer = None
    model = None
    translated_dfs = []

    for app_name in app_names:
        # Filter DataFrame for the specified app
        df_app_specific = df[df.app == app_name]
        print(df_app_specific.head())
        if df_app_specific.empty:
            # Nothing to translate for this app; skip without touching the model.
            continue

        # Load lazily and only once: loading is by far the most expensive step and is
        # identical for every review.
        if model is None:
            tokenizer = MarianTokenizer.from_pretrained(model_name)
            model = MarianMTModel.from_pretrained(model_name)

        # Perform translation
        start_time = time.time()
        translated_text = []
        for index, content in enumerate(df_app_specific.content):
            translated_text.append(_translate_text(content, tokenizer, model))
            if (index + 1) % batch_size == 0:
                duration = time.time() - start_time
                print(f"Successfully translated {index + 1} sentences for app '{app_name}' in {duration:.2f} seconds")

        # Update DataFrame with translated content (positional assignment on a fresh index)
        df_app_specific = df_app_specific.reset_index(drop=True)
        df_app_specific['content'] = translated_text

        translated_dfs.append(df_app_specific)

    # Concatenate translated DataFrames with the rest of the original DataFrame
    df_non_translated = df[~df.app.isin(app_names)]
    df_updated = pd.concat([df_non_translated] + translated_dfs, ignore_index=True)

    # Save to CSV
    print(df_updated.head())
    output_dir = os.path.dirname(output_path)
    if output_dir:
        # Avoid losing a long translation run to a missing output folder.
        os.makedirs(output_dir, exist_ok=True)
    df_updated.to_csv(output_path, index=False)
    print(f"Dataframe has been saved into {output_path}!")
    return df_updated


### 2.1.1. Polish to English

App: Alarm112

In [75]:
# Settings for this pass: output folder, target app, MarianMT checkpoint and CSV destination
save_dir = "../dataset/phase 2/"
app_name = ["Alarm112"]
model_name = "Helsinki-NLP/opus-mt-pl-en"
output_path = os.path.join(save_dir, "df_translated_google_play.csv")

# First pass: starts from the freshly loaded Google Play frame `df`
df_updated = translate_content_and_save(
    df=df,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

            app                                     content score
81566  Alarm112                                     Idealny     5
81567  Alarm112                  Kolejne wyłudzanie danych?     1
81568  Alarm112             Dla mnie nienormalna aplikacja.     2
81569  Alarm112  Przydatna potrzebna funkcjonalna. Polecam.     4
81570  Alarm112                                  Wporządku.     5
              app                                            content score
0  Disaster Alert                                           Good App     5
1  Disaster Alert  Working as a Public Health Nurse I get to resp...     5
2  Disaster Alert  Nice to have before traveling to unknown terri...     5
3  Disaster Alert  I like! I'm trying to find anything about tsun...     5
4  Disaster Alert                                       Not accurate     1
Dataframe has been saved into ../dataset/raw data/df_translated_google_play.csv!


### 2.1.2. Dutch to English

App: 112 BE

In [57]:
# Settings for this pass: target app, MarianMT checkpoint and CSV destination
app_name = ["112 BE"]
model_name = "Helsinki-NLP/opus-mt-nl-en"
output_path = os.path.join(save_dir, "df_translated_google_play.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

          app                                            content score
82349  112 BE                                            Handig.     5
82350  112 BE  Vandaag een oproep via de app geplaatst naar 1...     1
82351  112 BE  Voor politiediensten zie ik de meerwaarde eerl...     3
82352  112 BE                               Niets zo gemakkelijk     5
82353  112 BE  Beste is er ergens een optie om de app op het ...     3
          app                                            content score
85717  112 BE                                             Handy.     5
85718  112 BE  Today I put a call through the app to 101. I c...     1
85719  112 BE  For police I honestly do not see the added val...     3
85720  112 BE                                   Nothing so easy.     5
85721  112 BE  Best there is somewhere an option to get the a...     3

Dataframe has been saved into ../dataset/raw data/df_translated_google_play.csv!


### 2.1.3. German to English

App: DEC112 2.0

In [58]:
# Settings for this pass: target app, MarianMT checkpoint and CSV destination
app_name = ["DEC112 2.0"]
model_name = "Helsinki-NLP/opus-mt-de-en"
output_path = os.path.join(save_dir, "df_translated_google_play.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

              app                                            content score
82336  DEC112 2.0                                          Genial.❤️     5
82337  DEC112 2.0  Schon die Registrierung verläuft fehlerhaft.Ei...     4
82338  DEC112 2.0  Die App funktioniert nur online. Wenn mal kein...     3
82339  DEC112 2.0  Super Umsetzung! Ist die App auch für nicht-ge...     5
82340  DEC112 2.0   Endlich eine zweckmässige und intelligente App 👍     5
              app                                            content score
85709  DEC112 2.0                                           Awesome.     5
85710  DEC112 2.0  The registration is already wrong.A confirmati...     4
85711  DEC112 2.0  The app only works online. If there is no net ...     3
85712  DEC112 2.0  Great implementation! Is the app also intended...     5
85713  DEC112 2.0               Finally a useful and intelligent app     5

Dataframe has been saved into ../dataset/raw data/df_translated_google_play.csv!


### 2.1.4. Georgian to English

App: 112 Georgia

In [59]:
# Settings for this pass: target app, MarianMT checkpoint and CSV destination
# NOTE(review): the output recorded under this cell shows English-language input reviews turning into
# unrelated text after translation - confirm these reviews are really Georgian before running this pass.
app_name = ["112 Georgia"]
model_name = "Helsinki-NLP/opus-mt-ka-en"
output_path = os.path.join(save_dir, "df_translated_google_play.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

               app                                            content score
82568  112 Georgia  I used this app to call the police a couple of...     3
82569  112 Georgia  Got stuck after entering personal info and add...     3
82570  112 Georgia  After the recent update I had to re-register f...     5
82571  112 Georgia                    Good idea but have't tasted yet     3
82572  112 Georgia  Its funny a country with an anthem about freed...     1
               app                                            content score
85679  112 Georgia  Icedtea Flasheschendordinatorstor (traditional...     3
85680  112 Georgia  Go to Kontact or to-do of the Free Software Fo...     3
85681  112 Georgia               Attached Identity Identity Identity.     5
85682  112 Georgia                            Google Browser's Module     3
85683  112 Georgia                                    I-Search Backup     1

Dataframe has been saved into ../dataset/raw data/df_translated_google_play.csv!


### 2.1.5. Spanish to English

App: SAC Argentina

In [60]:
# Settings for this pass: target app, MarianMT checkpoint and CSV destination
app_name = ["SAC Argentina"]
model_name = "Helsinki-NLP/opus-mt-es-en"
output_path = os.path.join(save_dir, "df_translated_google_play.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

                 app                                            content score
82568  SAC Argentina             No deja que se actualice y no funciona     1
82569  SAC Argentina  Funciona muy mal no la recomiendo pérdida de p...     1
82570  SAC Argentina                                              Buena     5
82571  SAC Argentina                                             Genial     5
82572  SAC Argentina  Te llaman rápido si la activas y te consultan ...     2
                 app                                            content score
85637  SAC Argentina      It doesn't let it update and it doesn't work.     1
85638  SAC Argentina  It works very badly I do not recommend losing ...     1
85639  SAC Argentina                                              Good.     5
85640  SAC Argentina                                             Great.     5
85641  SAC Argentina  They call you fast if you activate it and they...     2

Dataframe has been saved into ../dataset/raw data/df_translated

### 2.1.6. Bulgarian to English

App: help is nearby!

In [61]:
# Settings for this pass: target app, MarianMT checkpoint and CSV destination
app_name = ["help is nearby!"]
model_name = "Helsinki-NLP/opus-mt-bg-en"
output_path = os.path.join(save_dir, "df_translated_google_play.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

                   app                                            content  \
82568  help is nearby!  нали знаете че може да има хора с натрапници в...   
82569  help is nearby!  6Текстът който копирате автоматично ще се пока...   
82570  help is nearby!                      СУПЕР ОФЕРТА ЗА ВИДЕОЧАТ ПАК.   
82571  help is nearby!  ДАДОХ ДОСТАП НА ВСЯКАДА . А ТО ПРОДАЛЖАЕА ДА М...   
82572  help is nearby!  Щом има приложение то трябва да е достъпно за ...   

      score  
82568     1  
82569     2  
82570     5  
82571     1  
82572     1  
                   app                                            content  \
85618  help is nearby!  you know that there may be people with intrude...   
85619  help is nearby!  6The text you copy will automatically display ...   
85620  help is nearby!       THE SUPER OF THE VIDEOCHATE PAC SUPER OFFER.   
85621  help is nearby!  I've given this to everyone, and I've given it...   
85622  help is nearby!  If there is an app, it should be available t

### 2.1.7. Czech to English

App: Zachranka

In [63]:
# Settings for this pass: target app, MarianMT checkpoint and CSV destination
app_name = ["Zachranka"]
model_name = "Helsinki-NLP/opus-mt-cs-en"
output_path = os.path.join(save_dir, "df_translated_google_play.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

             app                                            content score
82568  Zachranka  Nepodporuje tématické ikony. Vývoj aplikace se...     2
82569  Zachranka  Aplikace se nebyla schopná zapnout ve chvíli k...     1
82570  Zachranka  Dobrý den lze přidat dítě do aplikaci? Plánuje...     4
82571  Zachranka  Poslední update se nepovedl. Chodí mi notifika...     1
82572  Zachranka                 Dos not work without registration.     1
             app                                            content score
85536  Zachranka  It does not support thematic icons. The develo...     2
85537  Zachranka  The app wasn't able to turn on when I needed t...     1
85538  Zachranka  Hello can you add a child to the app? Are you ...     4
85539  Zachranka  Last update failed. I have a Hungarian notific...     1
85540  Zachranka                 Dos not work without registration.     1

Dataframe has been saved into ../dataset/raw data/df_translated_google_play.csv!


### 2.1.8. Hungarian to English

App: EletMento

In [64]:
# Settings for this pass: target app, MarianMT checkpoint and CSV destination
app_name = ["EletMento"]
model_name = "Helsinki-NLP/opus-mt-hu-en"
output_path = os.path.join(save_dir, "df_translated_google_play.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

             app                                            content score
82568  EletMento                                             Kiváló     5
82569  EletMento  Élesben még szerencsére nem kellett használni....     5
82570  EletMento                                             Könnyű     5
82571  EletMento  Jó ötlet szerencsére kipróbálnom még nem kelle...     5
82572  EletMento  Nem lenne rossz az app. Bár én még riasztást n...     5
             app                                            content score
84770  EletMento                                         Excellent.     5
84771  EletMento  Luckily, I didn't have to use it in the first ...     5
84772  EletMento                                         It's easy.     5
84773  EletMento  It's a good idea I didn't have to try it yet! ...     5
84774  EletMento  The app wouldn't be bad, although I haven't be...     5

Dataframe has been saved into ../dataset/raw data/df_translated_google_play.csv!


### 2.1.9. Indonesian to English

App: Jakarta Aman

In [65]:
# Settings for this pass: target app, MarianMT checkpoint and CSV destination
app_name = ["Jakarta Aman"]
model_name = "Helsinki-NLP/opus-mt-id-en"
output_path = os.path.join(save_dir, "df_translated_google_play.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

                app                                            content score
82568  Jakarta Aman  Nomor telpon sampe 13 digit ga bisa ya? Padaha...     1
82569  Jakarta Aman  Selama ini belum bisa memanfaatkan dan memperg...     1
82570  Jakarta Aman  Jakarta Amannya aplikasinya Error ya? Tolong d...     1
82571  Jakarta Aman  Sudah install tetapi Tidak bisa daftar kapan d...     2
82572  Jakarta Aman  masi banyak bug butuh pengembangan lebih serta...     5
                app                                            content score
84596  Jakarta Aman  A phone number of 13 digits won't work, but no...     1
84597  Jakarta Aman  We haven't been able to use and use applicatio...     1
84598  Jakarta Aman  Don't worry. I'm charming because I've tried d...     1
84599  Jakarta Aman  Already installed but can't list when is it fi...     2
84600  Jakarta Aman  A lot of bugs need more development and a high...     5

Dataframe has been saved into ../dataset/raw data/df_translated_google_play

### 2.1.10. Japanese to English

App: Yurekuru Call and National evacuation center guide

In [76]:
# Settings for this pass: two target apps share one MarianMT checkpoint and CSV destination
app_name = ["Yurekuru Call", "National evacuation center guide"]
model_name = "Helsinki-NLP/opus-mt-ja-en"
output_path = os.path.join(save_dir, "df_translated_google_play.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

                 app                                            content score
84040  Yurekuru Call                                               ゴミゲー     1
84041  Yurekuru Call  「緊急地震速報」対応が早い！！お陰様で、凄く助かってます。｢ ゆれくるコール ｣お薦めします...     5
84042  Yurekuru Call                             揺れてるのに鳴らない テレビのほうが速報速い     1
84043  Yurekuru Call                                                 退会     2
84044  Yurekuru Call     津波の通知を、段階ごと(津波注意報～大津波警報)もしくは地域ごとに設定できるよう改善願います     2
                                    app              content score
85658  National evacuation center guide          触ってないのに勝手に…     1
85659  National evacuation center guide  分からないですが、宜しくお願いします。     4
85660  National evacuation center guide  避難所がわかって助かるあぷりです助かる     5
85661  National evacuation center guide               起動しない。     1
85662  National evacuation center guide         作動環境を教えてください     3
              app                                            content score
0  Disaster Alert                                      

### 2.1.11. Mongolian to English

App: Anhaar

In [77]:
# Settings for this pass: target app, MarianMT checkpoint and CSV destination
# (the checkpoint name suggests a multilingual-to-English model rather than a Mongolian-specific one)
app_name = ["Anhaar"]
model_name = "Helsinki-NLP/opus-mt-mul-en"
output_path = os.path.join(save_dir, "df_translated_google_play.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

          app                                    content score
84040  Anhaar                    Гэрэл нь ажлахгүй байна     4
84041  Anhaar        LG загваруудад гэрэл асахгүй байна.     3
84042  Anhaar                                    So cool     5
84043  Anhaar  Үнэхээр хэрэгтэй апп байна. Баярлалаа 👏👏👏     5
84044  Anhaar                            QR code bdag u?     4


tokenizer_config.json: 100%|█████████████████████████████████████████████████| 44.0/44.0 [00:00<00:00, 8.85kB/s]
source.spm: 100%|████████████████████████████████████████████████████████████| 707k/707k [00:00<00:00, 1.99MB/s]
target.spm: 100%|████████████████████████████████████████████████████████████| 791k/791k [00:00<00:00, 3.98MB/s]
vocab.json: 100%|██████████████████████████████████████████████████████████| 1.42M/1.42M [00:01<00:00, 1.39MB/s]
config.json: 100%|██████████████████████████████████████████████████████████| 1.40k/1.40k [00:00<00:00, 603kB/s]
pytorch_model.bin: 100%|█████████████████████████████████████████████████████| 310M/310M [01:32<00:00, 3.37MB/s]
generation_config.json: 100%|██████████████████████████████████████████████████| 293/293 [00:00<00:00, 65.0kB/s]


              app                                            content score
0  Disaster Alert                                           Good App     5
1  Disaster Alert  Working as a Public Health Nurse I get to resp...     5
2  Disaster Alert  Nice to have before traveling to unknown terri...     5
3  Disaster Alert  I like! I'm trying to find anything about tsun...     5
4  Disaster Alert                                       Not accurate     1
Dataframe has been saved into ../dataset/raw data/df_translated_google_play.csv!


## 2.2. Apple App Store

In [78]:
# Load the Apple App Store reviews; this rebinds `df`, replacing the Google Play frame
file_path = os.path.join(save_dir, "apple_app_reviews.csv")
df = pd.read_csv(filepath_or_buffer=file_path)

### 2.2.1. Polish to English

App: Alarm112

In [None]:
# Settings for this pass: target app, MarianMT checkpoint and CSV destination
app_name = ["Alarm112"]
model_name = "Helsinki-NLP/opus-mt-pl-en"
output_path = os.path.join(save_dir, "df_translated_app_store.csv")

# First App Store pass: starts from the freshly loaded frame `df`
df_updated = translate_content_and_save(
    df=df,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

### 2.2.2. Dutch to English

App: 112 BE

In [None]:
# Settings for this pass: target app, MarianMT checkpoint and CSV destination
app_name = ["112 BE"]
model_name = "Helsinki-NLP/opus-mt-nl-en"
output_path = os.path.join(save_dir, "df_translated_app_store.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

### 2.2.3. Bulgarian to English

App: 112 Bulgaria

In [None]:
# Settings for this pass: target app, MarianMT checkpoint and CSV destination
app_name = ["112 Bulgaria"]
model_name = "Helsinki-NLP/opus-mt-bg-en"
output_path = os.path.join(save_dir, "df_translated_app_store.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

### 2.2.4. Japanese to English

App: PREP

In [None]:
# Settings for this pass: target app, MarianMT checkpoint and CSV destination
app_name = ["PREP"]
model_name = "Helsinki-NLP/opus-mt-ja-en"
output_path = os.path.join(save_dir, "df_translated_app_store.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

### 2.2.5. Norwegian to English

App: Hjelp 113

#### 2.2.5.1. Norwegian to Dutch

In [None]:
# Settings for stage 1 of a two-stage translation: Norwegian -> Dutch for the target app
app_name = ["Hjelp 113"]
model_name = "Helsinki-NLP/opus-mt-no-nl"
output_path = os.path.join(save_dir, "df_translated_app_store.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

#### 2.2.5.2. Dutch to English

In [None]:
# Settings for stage 2 of a two-stage translation: the same app's reviews, now Dutch -> English
app_name = ["Hjelp 113"]
model_name = "Helsinki-NLP/opus-mt-nl-en"
output_path = os.path.join(save_dir, "df_translated_app_store.csv")

# Feeds the output of the Norwegian -> Dutch stage back through the translator
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

### 2.2.6. Swedish to English

App: SOS Alarm

In [None]:
# Settings for this pass: target app, MarianMT checkpoint and CSV destination
app_name = ["SOS Alarm"]
model_name = "Helsinki-NLP/opus-mt-sv-en"
output_path = os.path.join(save_dir, "df_translated_app_store.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

### 2.2.7. Swiss Languages (German, French) to English

App: Alertswiss

#### 2.2.7.1. German to English

In [None]:
# Settings for this pass: German -> English for the target app
app_name = ["Alertswiss"]
model_name = "Helsinki-NLP/opus-mt-de-en"
output_path = os.path.join(save_dir, "df_translated_app_store.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

#### 2.2.7.2. French to English

In [None]:
# Settings for this pass: French -> English for the target app
# NOTE(review): every 'Alertswiss' row is selected again here, so this model receives whatever the
# German -> English pass produced for those rows - confirm a second pass over the same rows is intended.
app_name = ["Alertswiss"]
model_name = "Helsinki-NLP/opus-mt-fr-en"
output_path = os.path.join(save_dir, "df_translated_app_store.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

### 2.2.8. Turkish to English

App: Afad Acil Cagri

In [None]:
# Settings for this pass: target app, MarianMT checkpoint and CSV destination
app_name = ["Afad Acil Cagri"]
model_name = "Helsinki-NLP/opus-mt-tr-en"
output_path = os.path.join(save_dir, "df_translated_app_store.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)

### 2.2.9. Spanish to English

App: Earthquake and My Earthquake Alerts & Feed

In [None]:
# Settings for this pass: two target apps share one MarianMT checkpoint and CSV destination
app_name = ["Earthquake", "My Earthquake Alerts & Feed"]
model_name = "Helsinki-NLP/opus-mt-es-en"
output_path = os.path.join(save_dir, "df_translated_app_store.csv")

# Translate the matching reviews, building on the frame returned by the previous pass
df_updated = translate_content_and_save(
    df=df_updated,
    app_names=app_name,
    model_name=model_name,
    output_path=output_path,
)