In [1]:
import pandas as pd
import re
import os
import fasttext
import progressbar
import json
import gzip
import shutil
from nltk.tokenize import word_tokenize
fasttext.FastText.eprint = lambda x: None # avoid Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.

In [2]:
path_parent = os.path.dirname(os.getcwd())
pretrained_fasttext_path = os.path.join(path_parent, 'src/models/lid.176.bin')
model = fasttext.load_model(pretrained_fasttext_path)

In [39]:
text = "Das$ ist ein de2utscher 2text wt fe2lern"

In [14]:
text

'This$ is a may2be an english 2text'

In [40]:
model.predict(text)

(('__label__de',), array([0.99296874]))

In [28]:
cleaned_data_path

'/work-ceph/bizer-tp2021/data_integration_using_deep_learning/src/data/product/LocalBusiness_top100/cleaned_new_threshold'

In [41]:
def remove_punctuation(token_vector):
    return token_vector.apply(lambda token_list: [word for word in token_list if word not in string.punctuation])

In [9]:
def remove_with_fasttext():
    """
    reads all files from cleaned data path and removes non-english products from the data tables
    :return:
    """
    pretrained_fasttext_path = os.path.join(path_parent, 'src/models/lid.176.bin')
    model = fasttext.load_model(pretrained_fasttext_path)

    # list all top 100 product files
    files = [file for file in os.listdir(data_path) if file.endswith('.json.gz')]

    removed_rows_dict = {}

    count_files = 0

    for file in files:
        print(file)
        df = pd.read_json(os.path.join(data_path, '{}'.format(file)), compression='gzip', lines=True)
        if df.shape[0] > 20: # for top100 & min3
        #if df.shape[0] > 0: # for rest only
            df['concat'] = ''

            for j in range(df.shape[1]):  # iterate over columns
                df['concat'] = df['concat'] + df.iloc[:,j].astype('str')
                

            # iterrate over rows and save row_ids of english products
            english_products = []
            non_english_products = []
            count = 0
            with progressbar.ProgressBar(max_value=df.shape[0]) as bar:
                for i in range(df.shape[0]):  # iterate over rows
                    row_id = int(df['row_id'][i])
                    cell = df['concat'][i]
                    cell_new = re.sub('[^A-Za-z_\s]', ' ', cell)
                    cell_pred = model.predict([cell])
                    cell_new_pred = model.predict([cell_new])
                    print(cell_new)
                    print("no_cleaning", cell_pred)
                    print("cleaning", cell_new_pred)
                    if cell_pred == [['__label__en']]:
                        english_products.append(row_id)
                    else:
                        non_english_products.append(row_id)
                    count += 1
                    bar.update(count)

In [10]:
data_path = os.path.join(path_parent, 'src/data/LocalBusiness/LocalBusiness_top100')

In [11]:
remove_with_fasttext()

LocalBusiness_sparkasse.de_September2020.json.gz


  0% (40 of 12312) |                     | Elapsed Time: 0:00:00 ETA:   0:00:30

 Kreissparkasse Bautzen   Geldautomat Weigsdorf K blitz  postalcode             addresslocality    Weigsdorf K blitz    streetaddress    Kaufhallenweg  a               kundenservice ksk bautzen de  latitude                longitude               Th                Mo                Sa                Su                Tu                We                Fr              Nonehttps   www sparkasse de geldautomaten w kreissparkasse bautzen geldautomat weigsdorf koeblitz        html Kreissparkasse Bautzen   Geldautomat Weigsdorf K blitz  postalcode             addresslocality    Weigsdorf K blitz    streetaddress    Kaufhallenweg  a               kundenservice ksk bautzen de  latitude                longitude               Th                Mo                Sa                Su                Tu                We                Fr              Nonehttps   www sparkasse de geldautomaten w kreissparkasse bautzen geldautomat weigsdorf koeblitz        html
no_cleaning ([['__label__de']], [array(

  0% (110 of 12312) |                    | Elapsed Time: 0:00:00 ETA:   0:00:34

cleaning ([['__label__de']], [array([0.41738257], dtype=float32)])
  Kreissparkasse Freudenstadt   Geldautomat Salzstetten  addresslocality    Waldachtal    postalcode             streetaddress    Biergasse                  info ksk fds de  latitude                  longitude                 Th                Mo                Sa                Su                Tu                We                Fr              Nonehttps   www sparkasse de geldautomaten w kreissparkasse freudenstadt geldautomat salzstetten       html  Kreissparkasse Freudenstadt   Geldautomat Salzstetten  addresslocality    Waldachtal    postalcode             streetaddress    Biergasse                  info ksk fds de  latitude                  longitude                 Th                Mo                Sa                Su                Tu                We                Fr              Nonehttps   www sparkasse de geldautomaten w kreissparkasse freudenstadt geldautomat salzstetten       html
no_cleaning ([['__

  1% (191 of 12312) |                    | Elapsed Time: 0:00:00 ETA:   0:00:32


cleaning ([['__label__de']], [array([0.262403], dtype=float32)])
   Sparkasse Hannover   SB Center Mardorf  postalcode             streetaddress    Mardorfer Stra e       addresslocality    Neustadt               info sparkasse hannover de  latitude                  longitude                NoneNonehttps   www sparkasse de filialen n sparkasse hannover sb center mardorf        html   Sparkasse Hannover   SB Center Mardorf  postalcode             streetaddress    Mardorfer Stra e       addresslocality    Neustadt               info sparkasse hannover de  latitude                  longitude                NoneNonehttps   www sparkasse de filialen n sparkasse hannover sb center mardorf        html
no_cleaning ([['__label__en']], [array([0.27216914], dtype=float32)])
cleaning ([['__label__fr']], [array([0.28652114], dtype=float32)])
   Kreissparkasse Birkenfeld   Geldautomat Weierbach  addresslocality    Idar Oberstein    postalcode             streetaddress    Weierbacher Stra e         

  2% (267 of 12312) |                    | Elapsed Time: 0:00:00 ETA:   0:00:32

([['__label__de']], [array([0.23265083], dtype=float32)])
cleaning ([['__label__de']], [array([0.39135674], dtype=float32)])
   Sparkasse Gummersbach   Geldautomat Drabenderh her Stra e  streetaddress    Drabenderh her Stra e       addresslocality    Wiehl    postalcode                      info sparkasse gm de  latitude                  longitude                  Fr                Su                We                Th                Mo                Tu                Sa              Nonehttps   www sparkasse de geldautomaten w sparkasse gummersbach geldautomat drabenderhoeher strasse       html   Sparkasse Gummersbach   Geldautomat Drabenderh her Stra e  streetaddress    Drabenderh her Stra e       addresslocality    Wiehl    postalcode                      info sparkasse gm de  latitude                  longitude                  Fr                Su                We                Th                Mo                Tu                Sa              Nonehttps   www sparkasse de g

  2% (338 of 12312) |                    | Elapsed Time: 0:00:00 ETA:   0:00:32

cleaning ([['__label__de']], [array([0.4196948], dtype=float32)])
   Sparkasse K lnBonn   Geldautomat Auerberg  postalcode             streetaddress    K lnstra e        addresslocality    Bonn                  kontakt sparkasse koelnbonn de  longitude               latitude               Fr                Su                We                Th                Mo                Tu                Sa              Nonehttps   www sparkasse de geldautomaten b sparkasse koelnbonn geldautomat auerberg       html   Sparkasse K lnBonn   Geldautomat Auerberg  postalcode             streetaddress    K lnstra e        addresslocality    Bonn                  kontakt sparkasse koelnbonn de  longitude               latitude               Fr                Su                We                Th                Mo                Tu                Sa              Nonehttps   www sparkasse de geldautomaten b sparkasse koelnbonn geldautomat auerberg       html
no_cleaning ([['__label__en']], [array([0.217

  3% (418 of 12312) |                    | Elapsed Time: 0:00:01 ETA:   0:00:32


no_cleaning ([['__label__de']], [array([0.33778393], dtype=float32)])
cleaning ([['__label__de']], [array([0.55510634], dtype=float32)])
   Frankfurter Sparkasse   Geldautomat Enkheim  Bergen Enkheim   postalcode             addresslocality    Frankfurt Bergen Enkheim    streetaddress    Triebstr                     info frankfurter sparkasse de  latitude                  longitude                 Fr                Su                We                Th                Mo                Tu                Sa              Nonehttps   www sparkasse de geldautomaten f frankfurter sparkasse geldautomat enkheim bergen enkheim       html   Frankfurter Sparkasse   Geldautomat Enkheim  Bergen Enkheim   postalcode             addresslocality    Frankfurt Bergen Enkheim    streetaddress    Triebstr                     info frankfurter sparkasse de  latitude                  longitude                 Fr                Su                We                Th                Mo                Tu      

  4% (536 of 12312) |                    | Elapsed Time: 0:00:01 ETA:   0:00:31


no_cleaning ([['__label__en']], [array([0.21200392], dtype=float32)])
cleaning ([['__label__de']], [array([0.26979718], dtype=float32)])
   Kreissparkasse Kaiserslautern   Geldautomat Hauptstuhl  postalcode             addresslocality    Hauptstuhl    streetaddress    Kaiserstra e                  posteingang kskkaiserslautern de  latitude                  longitude                Th                Mo                Sa                Su                Tu                We                Fr              Nonehttps   www sparkasse de geldautomaten h kreissparkasse kaiserslautern geldautomat hauptstuhl        html   Kreissparkasse Kaiserslautern   Geldautomat Hauptstuhl  postalcode             addresslocality    Hauptstuhl    streetaddress    Kaiserstra e                  posteingang kskkaiserslautern de  latitude                  longitude                Th                Mo                Sa                Su                Tu                We                Fr              Nonehttps  




   Sparkasse Saarbr cken   Geldautomat Bildstock  postalcode             addresslocality    Friedrichsthal    streetaddress    Illinger Stra e  a               service sparkasse saarbruecken de  longitude               latitude               Sa                Tu                Fr                Su                We                Mo                Th              Nonehttps   www sparkasse de geldautomaten f sparkasse saarbruecken geldautomat bildstock        html   Sparkasse Saarbr cken   Geldautomat Bildstock  postalcode             addresslocality    Friedrichsthal    streetaddress    Illinger Stra e  a               service sparkasse saarbruecken de  longitude               latitude               Sa                Tu                Fr                Su                We                Mo                Th              Nonehttps   www sparkasse de geldautomaten f sparkasse saarbruecken geldautomat bildstock        html
no_cleaning ([['__label__de']], [array([0.1917593], dtype=float32

KeyboardInterrupt: 