In [1]:
import json
import os
from googletrans import Translator
from typing import Dict
import pandas as pd
import time
import importlib
#import googletrans #import the module here, so that it can be reloaded.
#importlib.reload(googletrans)

# Shared googletrans client used by get_default() and get_translated();
# both functions rebind this global to a fresh instance after repeated failures.
translator = Translator()


In [2]:
# Input locations (relative paths, resolved against the notebook's working directory).
# JSON file whose top-level keys are used as app ids further down.
matching_file = "../../data/dataset/all_matching.json"
# Text files read line by line, one app id per line.
ios_top_apps = "../../data/dataset/ios/ios_popular.txt"
ios_random_apps = "../../data/dataset/ios/ios_random.txt"

# Newline-delimited JSON file with one result record per line.
all_results =  "../../data/plist_results/2024_01_20_all.njson"


In [12]:
def read_njson_file(path):
    """Load a newline-delimited JSON file into a list of decoded records.

    Args:
        path: Path of the file; every non-blank line must hold one JSON document.

    Returns:
        list: The decoded documents, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Read as UTF-8 explicitly so the result does not depend on the platform's locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        # Stream the file line by line instead of materialising it with readlines().
        for line in f:
            # Blank lines (typically a trailing one) are not records and would make json.loads raise.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records
        

In [13]:
def get_id_from_app_string(app:str):
    """Extract the app id from an IPA path shaped like ``<dir>/<app_id>_<rest>.ipa``.

    Args:
        app (str): Path or file name of the IPA.

    Returns:
        str: The part of the file name before the first underscore, with a
        trailing ``.ipa`` extension removed. Empty string for an empty input.
    """
    filename = os.path.basename(app)
    # Remove the extension only when it is the suffix; a blanket replace would
    # also eat ".ipa" inside the id itself (e.g. "com.ipad.tool").
    if filename.endswith(".ipa"):
        filename = filename[:-len(".ipa")]
    # Underscores may also occur within the app name, so only the text before
    # the FIRST underscore is taken as the id.
    return filename.split("_", 1)[0]

def parse_app(app_json: Dict):
    """Collect the NSLocalNetworkUsageDescription texts of one result record.

    Every key of ``app_json`` other than ``"app"`` and ``"error"`` is treated
    as a file path mapped to a dict of plist entries. The text found in a file
    named ``Info.plist`` is stored under the language ``"default"``; for any
    other file the language is the name of its parent directory with
    ``.lproj`` removed.

    Args:
        app_json (Dict): One decoded result record.

    Returns:
        dict: ``{"app_id": ...}`` plus a ``"languages"`` dict (language -> text)
        when at least one description was found.
    """
    parsed = {"app_id": get_id_from_app_string(app_json.get("app", ""))}
    descriptions = {}

    for plist_path, entries in app_json.items():
        # "app" and "error" carry metadata, not plist contents.
        if plist_path in ("app", "error"):
            continue

        found = [text for name, text in entries.items()
                 if name == "NSLocalNetworkUsageDescription"]
        if not found:
            continue

        if os.path.basename(plist_path) == "Info.plist":
            language = "default"
        else:
            parent_dir = os.path.basename(os.path.dirname(plist_path))
            language = parent_dir.replace(".lproj", "")

        # A file without a parent directory yields an empty language and is ignored.
        if language:
            descriptions[language] = found[-1]

    if descriptions:
        parsed["languages"] = descriptions

    return parsed
        
    
def parse_dataset(dataset, app_list=None):
    """Parse the records of ``dataset`` whose app id is listed in ``app_list``.

    Only the first record of each app id is parsed; later duplicates are skipped.

    Args:
        dataset: Iterable of decoded result records (dicts with an ``"app"`` path).
        app_list: Iterable of app ids to keep (list, dict keys view, generator, ...).
            Defaults to no ids, which yields an empty result.

    Returns:
        list: One ``parse_app`` result per selected app, in dataset order.
    """
    # Build the lookup set once: O(1) membership per record, and one-shot
    # iterables are not exhausted by repeated ``in`` checks.
    wanted_ids = set(app_list) if app_list is not None else set()
    result = []
    added_apps = set()
    for app in dataset:
        app_id = get_id_from_app_string(app.get("app", ""))
        if app_id in wanted_ids and app_id not in added_apps:
            result.append(parse_app(app))
            added_apps.add(app_id)
    return result
    

In [14]:
def get_german_language(languages):
    """Pick the German text out of a language -> text mapping.

    Lookup order: the exact codes ``de``, ``de-DE``, ``de-AT``, ``de-CH``;
    then the ``"default"`` text if ``get_default`` reports it as ``"de"``;
    then the first key starting with ``"de-"``.

    Args:
        languages: Mapping of language code to text.

    Returns:
        The selected text, or ``""`` when no German text is available.
    """
    for code in ("de", "de-DE", "de-AT", "de-CH"):
        if code in languages:
            return languages[code]

    # Language detection is only consulted when no well-known German code is present.
    if get_default(languages) == "de":
        return languages["default"]

    regional_texts = (text for code, text in languages.items() if code.startswith("de-"))
    return next(regional_texts, "")

def get_english_language(languages):
    """Pick the English text out of a language -> text mapping.

    Lookup order: the exact codes ``en``, ``en-US``, ``en-GB``, ``en-AU``;
    then the ``"default"`` text if ``get_default`` reports it as ``"en"``;
    then the first key starting with ``"en-"``.

    Args:
        languages: Mapping of language code to text.

    Returns:
        The selected text, or ``""`` when no English text is available.
    """
    known_codes = [code for code in ("en", "en-US", "en-GB", "en-AU") if code in languages]
    if known_codes:
        return languages[known_codes[0]]

    # Language detection is only consulted when no well-known English code is present.
    if get_default(languages) == "en":
        return languages["default"]

    for code in languages:
        if code.startswith("en-"):
            return languages[code]

    return ""


def get_default(languages, max_attempts=5, retry_delay=5):
    """Detect the language of the ``"default"`` text via the shared translator.

    Args:
        languages: Mapping of language code to text.
        max_attempts: How many times detection is tried before giving up.
        retry_delay: Seconds to wait after a failed attempt.

    Returns:
        The ``lang`` attribute of the detection result, or ``None`` when there
        is no ``"default"`` entry or every attempt failed.
    """
    global translator
    # Nothing to detect: return right away instead of looping and rebuilding the client.
    if "default" not in languages:
        return None

    for _ in range(max_attempts):
        try:
            return translator.detect(languages["default"]).lang
        except Exception:
            # Treat any ordinary failure as transient and retry after a pause.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
            # SystemExit through, so a long run can still be stopped.
            time.sleep(retry_delay)

    # All attempts failed: start over with a fresh client for subsequent calls.
    translator = Translator()
    return None
    
    
def _translate_to_english(text, max_attempts, retry_delay):
    """Translate ``text`` to English with retries; return the translation or None."""
    global translator
    for _ in range(max_attempts):
        try:
            translated = translator.translate(text, dest='en', src='auto')
            if translated and translated.text:
                return translated.text
        except Exception:
            # Any ordinary failure is treated as transient; without this a single
            # unexpected error would abort the whole batch. KeyboardInterrupt and
            # SystemExit are not caught, so the run can still be stopped.
            time.sleep(retry_delay)
    # Every attempt failed or came back empty: use a fresh client from now on.
    translator = Translator()
    return None


def get_translated(languages, max_attempts=5, retry_delay=5):
    """Return an English translation of one of the texts in ``languages``.

    The ``"default"`` text is tried first. If that fails, every text is tried
    in mapping order (including ``"default"`` once more, now with the freshly
    created client) until one translation succeeds.

    Args:
        languages: Mapping of language code to text.
        max_attempts: Attempts per text before moving on to the next one.
        retry_delay: Seconds to wait after a failed attempt.

    Returns:
        The first non-empty translated text, or ``None`` if nothing could be
        translated.
    """
    candidates = []
    if "default" in languages:
        candidates.append(languages["default"])
    candidates.extend(languages.values())

    for text in candidates:
        english = _translate_to_english(text, max_attempts, retry_delay)
        # The helper only returns non-empty text or None.
        if english is not None:
            return english
    return None
        

    
    
def create_raw_table(dataset):
    """Build column-wise table data from parsed apps.

    Apps without a ``"languages"`` entry are skipped. For every remaining app
    one value is appended to each column: its id, its German text, its English
    text and an English translation.

    Args:
        dataset: Iterable of parsed app dicts (``"app_id"`` and optionally ``"languages"``).

    Returns:
        dict: Columns ``"app_id"``, ``"de"``, ``"en"`` and ``"translate"``, each a
        list with one item per included app.
    """
    app_ids, german, english, translated = [], [], [], []
    for entry in dataset:
        if "languages" not in entry:
            continue
        app_ids.append(entry["app_id"])
        german.append(get_german_language(entry["languages"]))
        english.append(get_english_language(entry["languages"]))
        translated.append(get_translated(entry["languages"]))
    return {"app_id": app_ids, "de": german, "en": english, "translate": translated}

    
def get_dataset_ids_from_file(path: str):
    """
    Reads app IDs from a text file containing one ID per line.

    Surrounding whitespace is stripped and blank lines are skipped, so the
    result never contains an empty ID (an empty ID would otherwise match
    records whose app path is missing).

    Args:
        path (str): The path of the ID list file.

    Returns:
        List[str]: The app IDs in file order.
    """
    with open(path, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [app_id for app_id in stripped_lines if app_id]


In [15]:
def create_table(raw_data):
    """Wrap column-wise table data in a pandas DataFrame.

    Args:
        raw_data: Data accepted by the ``pd.DataFrame`` constructor, e.g. a dict
            mapping column names to equally long lists.

    Returns:
        pd.DataFrame: The resulting table.
    """
    table = pd.DataFrame(data=raw_data)
    return table

In [8]:
# Load the matching JSON file; its top-level keys are used as app ids in a later cell.
with open(matching_file, "r") as f:
    matching = json.load(f)

In [9]:
# App id selections: two read from the id list files, one taken from the keys of the matching JSON.
ios_random_ids = get_dataset_ids_from_file(ios_random_apps)
ios_top_ids = get_dataset_ids_from_file(ios_top_apps)
# NOTE: this is a dict keys view, not a list.
ios_matching_ids = matching.keys()

In [None]:
# Pipeline for the popular + random apps: read the raw results, keep the selected apps,
# pick/translate their texts, and write the table as a semicolon-separated CSV.
# NOTE: create_raw_table() calls the translation service per app, so this cell can take long.
permission = read_njson_file(all_results)
dataset = parse_dataset(permission,ios_top_ids + ios_random_ids)
translated_dataset = create_raw_table(dataset)
pd_df = create_table(translated_dataset)
pd_df.to_csv("../../data/plist_results/all.csv",index=False, sep=";")

In [None]:
# Same pipeline, restricted to the app ids taken from the matching file; writes matching.csv.
# NOTE: rebinds permission, dataset, translated_dataset and pd_df from any earlier run.
permission = read_njson_file(all_results)
dataset = parse_dataset(permission,ios_matching_ids)
translated_dataset = create_raw_table(dataset)
pd_df = create_table(translated_dataset)
pd_df.to_csv("../../data/plist_results/matching.csv",index=False, sep=";")