#**Imports and Spark Installation**#

In [1]:
!pip install pyspark



In [2]:
import pyspark
import requests
from bs4 import BeautifulSoup
import csv
from tqdm import tqdm
import time
import pandas as pd
import re
import hashlib
import numpy as np
import itertools

#**Load Data**#

In [3]:
url = 'https://www.kijiji.it/offerte-di-lavoro/offerta/informatica-e-web/'

In [4]:
def get_data_from_url(url, num_pages):

  new_url = url

  with open('/tmp/output.tsv', 'wt') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['Title', 'Description', 'Location', 'Publication Date', 'URL'])

    for i in tqdm(range(1, num_pages+1)):
      if i > 1:
        new_url = url + "?p=" + str(i)

      r = requests.get(new_url)
      soup = BeautifulSoup(r.content)
      g_data = soup.find_all("div", {"class": "item-content"})
      for item in g_data:
        tsv_writer.writerow([item.contents[1].find_all("a", {"class": "cta"})[0].text.strip(),
                             item.contents[3].text, 
                             item.contents[7].text, item.contents[9].text,
                             item.contents[1].find_all("a", {"class": "cta"})[0].get('href')])
      time.sleep(0.75)

#152 is the number of pages with job announcements on the kijiji web page in the Informatica/Grafica/Web sector     
get_data_from_url(url, 152)

100%|██████████| 152/152 [05:22<00:00,  2.12s/it]


In [5]:
tsv_file = open("/tmp/output.tsv")
read_tsv = csv.reader(tsv_file, delimiter="\t")

for i, row in enumerate(read_tsv):
  print(row)
  if i == 5:
    break


['Title', 'Description', 'Location', 'Publication Date', 'URL']
['Editor Premiere - videografico-a After Effects', 'Editor video con con approfondita e documentata esperienza si ricerca per posizione interna alla struttura; si richiede 1) conoscenza approfondita di Premiere e After Effects, basilare conoscenza di Davinci, residenza entro 15 km da Novara\nSe interessati e in possesso di tutti i requisiti inviare un curriculum a Job AT giuseppegalliano.it e link a proprio show reel.', 'Novara', '31 agosto, 19:02', 'https://www.kijiji.it/annunci/offerta/messina-annunci-novara-di-sicilia/editor-premiere-videografico-a-after-effects/159299183']
['Programmatori junior - neolaureati/neodiplomati', "Visioture - Digital Technology Company, ricerca:\n\nPROGRAMMATORE JUNIOR - neodiplomati/neolaureati \n\nHai appena terminato gli studi e sei laureato (magistrale o triennale) in Scienze (Informatica, Matematica, Fisica, Statistica, Geologia), Ingegneria o Economia e hai acquisito competenze di info

In [6]:
df = pd.read_csv('/tmp/output.tsv', delimiter = '\t')
df.count()

Title               3189
Description         3189
Location            3189
Publication Date    3189
URL                 3189
dtype: int64

In [7]:
df.head()

Unnamed: 0,Title,Description,Location,Publication Date,URL
0,Editor Premiere - videografico-a After Effects,Editor video con con approfondita e documentat...,Novara,"31 agosto, 19:02",https://www.kijiji.it/annunci/offerta/messina-...
1,Programmatori junior - neolaureati/neodiplomati,"Visioture - Digital Technology Company, ricerc...",Flaminio / Parioli / Pinciano,"Ieri, 22:12",https://www.kijiji.it/annunci/offerta/roma-ann...
2,Responsabile Marketing,"Rossato Group Srl, per rafforzamento e struttu...",Sermoneta,"Ieri, 17:57",https://www.kijiji.it/annunci/offerta/latina-a...
3,Lavoro Smart working,AZIENDA ITALIANA CERCASI PERSONALE in tutta It...,Torrenova,"Ieri, 17:48",https://www.kijiji.it/annunci/offerta/messina-...
4,Tecnico Reti Di Telecomunicazione,Tecnico Reti Di Telecomunicazione\nCercasi tec...,Cosenza,"Ieri, 14:53",https://www.kijiji.it/annunci/offerta/cosenza-...


In [8]:
df_with_description = df[df['Description'] != '\n\nClicca sul link sottostante "sito web" per inviarci la tua candidatura.']
df_with_description.count()

Title               1728
Description         1728
Location            1728
Publication Date    1728
URL                 1728
dtype: int64

Now we peform column preprocessing (using part of the preprocessing function, created for the Problem 1) in order to provide

**properly separated** file to Spark.

In [9]:
def column_preprocessing(col):
  col = col.apply(lambda x: x.lower()) #1.1
  col = col.apply(lambda x: re.sub('\n', ' ', x)) #1.2.6
  col = col.apply(lambda x: re.sub(' +', ' ', x)) #1.2.7
  return col

In [10]:
np.savetxt(r'/tmp/description.txt', column_preprocessing(df_with_description['Description']).values, fmt='%s')

In [11]:
sc = pyspark.SparkContext('local[*]') # Create a Spark context

In [12]:
txt = sc.textFile('/tmp/description.txt') # Create an RDD

# Check that file has been loaded properly
print(txt.count())

1728


In [13]:
txt.take(5)

['editor video con con approfondita e documentata esperienza si ricerca per posizione interna alla struttura; si richiede 1) conoscenza approfondita di premiere e after effects, basilare conoscenza di davinci, residenza entro 15 km da novara se interessati e in possesso di tutti i requisiti inviare un curriculum a job at giuseppegalliano.it e link a proprio show reel.',
 "visioture - digital technology company, ricerca: programmatore junior - neodiplomati/neolaureati hai appena terminato gli studi e sei laureato (magistrale o triennale) in scienze (informatica, matematica, fisica, statistica, geologia), ingegneria o economia e hai acquisito competenze di informatica? ti piace imparare e sperimentare cose diverse, valutare, creare e gestire le applicazioni it? visioture sta selezionando nuove risorse motivate da inserire in uno stage formativo di due mesi con successiva certificazione salesforce propedeutica all'assunzione ed il placement in consulenza su nostri primari system integrato

In [14]:
txt = sc.textFile('/tmp/description.txt').zipWithIndex()
txt.take(10)

[('editor video con con approfondita e documentata esperienza si ricerca per posizione interna alla struttura; si richiede 1) conoscenza approfondita di premiere e after effects, basilare conoscenza di davinci, residenza entro 15 km da novara se interessati e in possesso di tutti i requisiti inviare un curriculum a job at giuseppegalliano.it e link a proprio show reel.',
  0),
 ("visioture - digital technology company, ricerca: programmatore junior - neodiplomati/neolaureati hai appena terminato gli studi e sei laureato (magistrale o triennale) in scienze (informatica, matematica, fisica, statistica, geologia), ingegneria o economia e hai acquisito competenze di informatica? ti piace imparare e sperimentare cose diverse, valutare, creare e gestire le applicazioni it? visioture sta selezionando nuove risorse motivate da inserire in uno stage formativo di due mesi con successiva certificazione salesforce propedeutica all'assunzione ed il placement in consulenza su nostri primari system i

# **Hash Functions and Global Variables**

In [15]:
def hashFamily(i):
  resultSize = 8
  maxLen = 20 
  salt = str(i).zfill(maxLen)[-maxLen:]
  def hashMember(x):
    return int(hashlib.sha1(x.encode('utf-8') + salt.encode('utf-8')).hexdigest()[-resultSize:], 16)
  return hashMember

In [16]:
#global variables: set the same as in Problem 2
k = 10
t = 100
b = 10
r = int(t/b)
s = 0.8

# **Shingling**

With a power of Spark we can easily represent all the dataset as a dataset of couples `(index, shingle)`, where `index` $-$ is an **index of the document**, to which this shingle belongs, so we can always get the entire $k$-shingle set, corresponding to a particular document.  

In [17]:
start_time_pipeline = time.time()
shingles = txt.flatMap(lambda line: [(line[1], shingle) for shingle in set([line[0][i:i+k] for i in range(0, len(line[0]) - k + 1)])])

shingles.take(10)

[(0, 'so di tutt'),
 (0, 'fondita di'),
 (0, 'enza si ri'),
 (0, 'a si ricer'),
 (0, 'asilare co'),
 (0, 'ata esperi'),
 (0, ' link a pr'),
 (0, ' e after e'),
 (0, 'e after ef'),
 (0, 'ondita e d')]

Here we perfom a mapping to change the key-value paris to a new set, in which the **value is a hash** of a given $k$-shingle.

In [18]:
h = hashFamily(1)
hashed_shingles = shingles.map(lambda line: (line[0], h(line[1])))

hashed_shingles.take(10)

[(0, 3144206161),
 (0, 2402504238),
 (0, 176612626),
 (0, 699448986),
 (0, 4018840556),
 (0, 4083956041),
 (0, 3703487612),
 (0, 946062341),
 (0, 3532600866),
 (0, 768497043)]

# **Minwise Hashing**

We start from the input in the form of $k$-shingle sets. 


First, we transform `(shingle, index)` couples into `((index, hash_function_index), hash)` couples, where

>`index` $-$ again an index of the document in the dataset, 

>`hash_function_index` $-$ an index of a family member from the hash functions family,

>`hash` $-$ a hash of a given shingle after application of the current family member. 

Therefore, key-value `((index, hash_function_index), hash)` couple with the same `(index, hash_function_index)` key exists as many times as the number of shingles in the $k$-shingles set of the document with `index`.

In [19]:
hashed_family_shingles = shingles.flatMap(lambda line: [((line[0], i), h(line[1])) for (i, h) in [(i, hashFamily(i)) for i in range(1, t+1)]])

hashed_family_shingles.take(15)

[((0, 1), 3144206161),
 ((0, 2), 3222467061),
 ((0, 3), 2139456305),
 ((0, 4), 505994397),
 ((0, 5), 349000194),
 ((0, 6), 3876889083),
 ((0, 7), 2697778568),
 ((0, 8), 2889832540),
 ((0, 9), 883038693),
 ((0, 10), 1392102110),
 ((0, 11), 3877287351),
 ((0, 12), 3249362571),
 ((0, 13), 2624492449),
 ((0, 14), 372305720),
 ((0, 15), 4279738365)]

Now for each document for the fixed hash family member we compute minimum has (i.e. grouping by key, we compute **minimum hash value over all hashed shingles**). 


As the result, we get `((index, hash_function_index), min_hash)` couples, where

> `hash_function_index` $-$ an index of an element in the signature vector,

>` min_hash` $-$ an element of the signature vector with index `hash_function_index`, 

> `index` $-$ an index of the document, which signature vector containing ` min_hash` with `hash_function_index` index corresponds to. 

Therefore, output that we got is in fact a **signature matrix.**

In [20]:
start_time_sig = time.time()
signatures = hashed_family_shingles.reduceByKey(lambda line1, line2 : min(line1,line2)).sortByKey()
print("Signature matrix has been computed within {} seconds.\n".format(round(time.time()-start_time_sig, 3)))
signatures.take(10)

Signature matrix has been computed within 658.217 seconds.



[((0, 1), 26345623),
 ((0, 2), 38050713),
 ((0, 3), 14601143),
 ((0, 4), 4072488),
 ((0, 5), 12636253),
 ((0, 6), 1374495),
 ((0, 7), 3767497),
 ((0, 8), 30256053),
 ((0, 9), 8388411),
 ((0, 10), 6990399)]

# **LSH**

##**Divide Signature Matrix**

First step is to **divide signature matrix** into $b$ bands with $r$ rows. Using a helper function to define band given index of the vector element, we replace `((index, hash_function_index), min_hash)` tuple with  `((index, band), min_hash)`.

In [21]:
def define_band(sig_vec_idx):
  for band, i in enumerate(range(r, t+r, r)):
    if sig_vec_idx <= i:
      return band+1

In [22]:
start_time_lsh = time.time()
bands = signatures.flatMap(lambda line: [((line[0][0], define_band(line[0][1])), line[1])])

bands.take(10)

[((0, 1), 26345623),
 ((0, 1), 38050713),
 ((0, 1), 14601143),
 ((0, 1), 4072488),
 ((0, 1), 12636253),
 ((0, 1), 1374495),
 ((0, 1), 3767497),
 ((0, 1), 30256053),
 ((0, 1), 8388411),
 ((0, 1), 6990399)]

##**Hash**

Now, what we want to do is to **compute for each band a hash function** that takes vectors of $r$ elements (the portion of one column within that band) and hashes them. But since our hashing function takes a string as an input, we will do it in two steps: 

1) **Cast** type of `min_hash` to string and **concatenate** them over band; 

2) Apply **hashing**.

In [23]:
bands_reduced = bands.reduceByKey(lambda line1, line2 : str(line1)+str(line2)).sortByKey()

bands_reduced.take(10)

[((0, 1),
  '263456233805071314601143407248812636253137449537674973025605383884116990399'),
 ((0, 2),
  '629946411206475150302305133621380208402999448582523671992352611437023719829'),
 ((0, 3),
  '168093511895970623587879729321027355082697751026027421829524384872269983296'),
 ((0, 4),
  '277158961311768128908861147054737870590170337119326317108982445417887395'),
 ((0, 5),
  '110856498959964724311931827677816711910291217759508647243450253949027967445'),
 ((0, 6),
  '146118154624032803348710773470108166851297778564995292382463811770429268774'),
 ((0, 7),
  '2758537614722289283989557150076293604126044563060580888970183610711702635'),
 ((0, 8),
  '3254113175041908211964780832293183625046030649799707256390162426216375618'),
 ((0, 9),
  '1517363110404544151879371554058425649280143005794317217424305444525911612050'),
 ((0, 10),
  '9285571115273381632709833454282665208421148284796126214371928437211911811')]

As the result, we obtain tuples `((index, band), band_hash)` where each document is represented inside each band as just one number `band_hash`.

In [24]:
bands_hashed = bands_reduced.map(lambda line: (line[0], h(line[1])))

bands_hashed.take(10)

[((0, 1), 184215864),
 ((0, 2), 1077249376),
 ((0, 3), 446555541),
 ((0, 4), 2358882946),
 ((0, 5), 1558140311),
 ((0, 6), 128960355),
 ((0, 7), 1380435980),
 ((0, 8), 1405268789),
 ((0, 9), 539037899),
 ((0, 10), 2884072399)]

##**Find Candidates**

Next goal is to find inside each band which documents should become **candidates**, i.e. which documents got the same `hash_band` value. 

Therefore, for each `band` and each `band_hash`, we want to **count**, how many documents have it, and **filter out** all the documents which were hashed uniquely inside each band. Then, documents which were not filtered out, become candidates.

    Step 1. Rearrange key-value pairs in a way appropriate for counting. 

In [25]:
changed_keys = bands_hashed.map(lambda line: ((line[0][1], line[1]), line[0][0]))

changed_keys.take(10)

[((1, 184215864), 0),
 ((2, 1077249376), 0),
 ((3, 446555541), 0),
 ((4, 2358882946), 0),
 ((5, 1558140311), 0),
 ((6, 128960355), 0),
 ((7, 1380435980), 0),
 ((8, 1405268789), 0),
 ((9, 539037899), 0),
 ((10, 2884072399), 0)]

    Step 2. Replace document index with 1 in order to sum up rows by key then.

In [26]:
count_candidates = changed_keys.map(lambda line: (line[0], 1))

count_candidates.take(10)

[((1, 184215864), 1),
 ((2, 1077249376), 1),
 ((3, 446555541), 1),
 ((4, 2358882946), 1),
 ((5, 1558140311), 1),
 ((6, 128960355), 1),
 ((7, 1380435980), 1),
 ((8, 1405268789), 1),
 ((9, 539037899), 1),
 ((10, 2884072399), 1)]

    Step 3. Perform counting.

In [27]:
counted_candidates = count_candidates.reduceByKey(lambda line1, line2: line1+line2)

counted_candidates.take(10)

[((2, 1077249376), 153),
 ((3, 446555541), 153),
 ((4, 2358882946), 153),
 ((5, 1558140311), 153),
 ((9, 539037899), 153),
 ((7, 2126668039), 19),
 ((8, 3283088510), 19),
 ((10, 4239847336), 19),
 ((2, 2910322450), 19),
 ((8, 3233294692), 19)]

    Step 4. Retrieve back indices of documents.

In [28]:
documents_with_counts = changed_keys.join(counted_candidates)

documents_with_counts.take(10)

[((2, 1077249376), (0, 153)),
 ((2, 1077249376), (15, 153)),
 ((2, 1077249376), (23, 153)),
 ((2, 1077249376), (29, 153)),
 ((2, 1077249376), (48, 153)),
 ((2, 1077249376), (64, 153)),
 ((2, 1077249376), (71, 153)),
 ((2, 1077249376), (92, 153)),
 ((2, 1077249376), (113, 153)),
 ((2, 1077249376), (130, 153))]

    Step 5. Filter out non-duplicate documents.

In [29]:
candidates = documents_with_counts.filter(lambda line: line[1][1]>1)

candidates.take(10)

[((2, 1077249376), (0, 153)),
 ((2, 1077249376), (15, 153)),
 ((2, 1077249376), (23, 153)),
 ((2, 1077249376), (29, 153)),
 ((2, 1077249376), (48, 153)),
 ((2, 1077249376), (64, 153)),
 ((2, 1077249376), (71, 153)),
 ((2, 1077249376), (92, 153)),
 ((2, 1077249376), (113, 153)),
 ((2, 1077249376), (130, 153))]

Now we rearrange collection to create mapping between `(band, band_hash)` as key and list of indices of documents which were hashed in `band` to `band_hash`. 

*Note:* Aforementioned list is in fact $(n+1)$-tuple built according to the following logic: element of $(n+1)$-tuple with index $1$ is n itself.

In [30]:
grouped_candidates = candidates.reduceByKey(lambda line1, line2: (line1+line2)[:-1])

grouped_candidates.take(2)

[((2, 1077249376),
  (0,
   153,
   15,
   23,
   29,
   48,
   64,
   71,
   92,
   113,
   130,
   150,
   161,
   182,
   194,
   202,
   210,
   217,
   229,
   238,
   243,
   259,
   263,
   272,
   285,
   300,
   309,
   317,
   338,
   349,
   358,
   365,
   385,
   394,
   397,
   411,
   421,
   428,
   432,
   442,
   458,
   462,
   472,
   476,
   496,
   504,
   507,
   515,
   533,
   539,
   543,
   547,
   565,
   574,
   580,
   589,
   610,
   626,
   630,
   649,
   655,
   661,
   675,
   696,
   706,
   711,
   713,
   734,
   742,
   745,
   760,
   781,
   789,
   792,
   795,
   810,
   819,
   824,
   845,
   853,
   855,
   866,
   887,
   891,
   902,
   915,
   922,
   926,
   938,
   959,
   975,
   980,
   981,
   998,
   1007,
   1008,
   1013,
   1030,
   1034,
   1047,
   1048,
   1049,
   1063,
   1068,
   1076,
   1097,
   1101,
   1102,
   1105,
   1120,
   1121,
   1129,
   1142,
   1145,
   1153,
   1167,
   1173,
   1175,
   1180,
   1181,
   1

The last step in the "find candidates" procedure is to represent them in the form of **set of pairs** of documents indices. 

Iterating over all $(n+1)$-tuples in the collection, we first **remove** length elements from the $(n+1)$-tuple and **convert** it to a list, and then, using `itertools` utility, build all possible unique **couplings of elements** in the list. 

We use `set` data strtucture to keep only unique pairs (the same coupling could appear from the different $(n+1)$-tuples).

In [31]:
candidate_pairs = set()
for _, doc_tuple in tqdm(grouped_candidates.collect()):
  docs = list([doc_tuple[0]])+list(doc_tuple[2:])
  for pair in itertools.combinations(docs, 2):
    candidate_pairs.add(pair)
  

100%|██████████| 786/786 [00:00<00:00, 8822.74it/s]


##**Check Jaccard**

We need signature vectors to compute **estimated Jaccard similarity**. 

They could be found in the signature matrix, cleaned from `band` components (no division into bands is needed anymore once we found the candidates).

In [32]:
signature_vectors = signatures.map(lambda line: (line[0][0], line[1]))
signature_vectors.take(10)

[(0, 26345623),
 (0, 38050713),
 (0, 14601143),
 (0, 4072488),
 (0, 12636253),
 (0, 1374495),
 (0, 3767497),
 (0, 30256053),
 (0, 8388411),
 (0, 6990399)]

Now for convenient access to signature vector of the desired document, we create `dictionary` data structure and save to it documents in the form of `document index`-`signature vector of this document` key-value pairs. Signature vectors are `numpy` arrays.

In [33]:
sig_dict = dict()
sig_grouped = signature_vectors.reduceByKey(lambda line1, line2: str(line1)+'\n'+str(line2))
sig_sets = sig_grouped.map(lambda line: (line[0], np.array(list(map(int, line[1].split('\n'))))))
for (doc, signature) in tqdm(sig_sets.collect()):
  sig_dict[doc] = signature

100%|██████████| 1728/1728 [00:00<00:00, 491107.01it/s]


Finally, we interate over set of candidate pairs, retrieve from the dictionary **signature vectors** of candidates, compute the ratio of coincidence between them and, **comparing the ratio with a given threshold**, keep only those candidate pairs, which are above the threshold.

In [34]:
lsh_duplicates = set()
for (doc1, doc2) in tqdm(candidate_pairs):
  if (sig_dict[doc1]==sig_dict[doc2]).sum()/t >= s:
    lsh_duplicates.add((doc1,doc2))   

print("\n\nLSH has been computed within {} seconds.\n".format(round(time.time()-start_time_lsh, 3)))
print("Entire pipeline has been computed within {} seconds.\n".format(round(time.time()-start_time_pipeline, 3)))

100%|██████████| 18733/18733 [00:00<00:00, 121222.62it/s]



LSH has been computed within 5.925 seconds.

Entire pipeline has been computed within 665.455 seconds.






#**Testing**

In order to perform testing, we need to find the **nearest neighbors by comparing all the shingle sets with each other**. 

Again we need to organize an easy access to the shingle set of the desired document by document index. Therefore, by analogy with signature vectors, we create a `dictionary` with `document index`-`shingle set of this document` key-value pairs.

In [35]:
start_time_dc = time.time()
shingle_dict = dict()
shingles_grouped = shingles.reduceByKey(lambda line1, line2: line1+'\n'+line2)
shingle_sets = shingles_grouped.map(lambda line: (line[0], set(line[1].split('\n'))))
for (doc, shingle_set) in tqdm(shingle_sets.collect()):
  shingle_dict[doc] = shingle_set

100%|██████████| 1728/1728 [00:00<00:00, 620686.59it/s]


Then, knowing the number of documents in the dataset, we can use `itertools` to get **all possible couplings** between documents indices. Then, iterating over pairs of indices, we pop out corresponding **shingle sets from the dictionary**, compute **Jaccard similarity** between them and compare found quantity with the desired threshold. 

**Thresholding** operation defines duplicate documents.

In [36]:
dc_duplicates = set()
all_pairs = itertools.combinations(np.arange(txt.count()), 2)
for (doc1, doc2) in tqdm(all_pairs):
  if len(shingle_dict[doc1].intersection(shingle_dict[doc2])) / len(shingle_dict[doc1].union(shingle_dict[doc2])) >= s:
    dc_duplicates.add((doc1,doc2))

print("\n\nDirect comparison has been computed within {} seconds.\n".format(round(time.time()-start_time_dc, 3))) 

1492128it [03:53, 6385.52it/s]



Direct comparison has been computed within 239.137 seconds.






##**Time**

> Comparison of only candidates with LSH is very fast and doesn't even take `tqdm` second counter to start, meanwhile comparison of all the pairs directly could take a while: 03:53 min.

> Exact computational time of LSH part of algorithm is 5.925 sec, exact time of a direct approach is 239.137 sec (around 4 min).

> However, computation of the signature matrix at the Minwise step with sorting is time consuming: 658.217 sec (around 11 min).

> Overall time of execution of the shingling, minwise hashing, and locality-sensitive hashing is 665.455 sec.



##**Number of duplicates**

In [37]:
print("The number of duplicates found by LSH is equal to ", len(lsh_duplicates), 
      "\nThe number of duplicates found by direct comparison is equal to ", len(dc_duplicates), 
      "\nSize of intersection is equal to ", len(lsh_duplicates.intersection(dc_duplicates)))

The number of duplicates found by LSH is equal to  18449 
The number of duplicates found by direct comparison is equal to  18447 
Size of intersection is equal to  18439


##**Print Examples**

In [38]:
(doc1, doc2) = lsh_duplicates.pop()
for (description, index) in txt.collect():
  if index in [doc1, doc2]:
    print(description)

editor video con con approfondita e documentata esperienza si ricerca per posizione interna alla struttura; si richiede 1) conoscenza approfondita di premiere e after effects, basilare conoscenza di davinci, residenza entro 15 km da novara se interessati e in possesso di tutti i requisiti inviare un curriculum a job at giuseppegalliano.it e link a proprio show reel.
editor video con con approfondita e documentata esperienza si ricerca per posizione interna alla struttura; si richiede 1) conoscenza approfondita di premiere e after effects, basilare conoscenza di davinci, residenza entro 15 km da novara se interessati e in possesso di tutti i requisiti inviare un curriculum a job at giuseppegalliano.it e link a proprio show reel.


In [39]:
(doc1, doc2) = dc_duplicates.pop()
for (description, index) in txt.collect():
  if index in [doc1, doc2]:
    print(description)

editor video con con approfondita e documentata esperienza si ricerca per posizione interna alla struttura; si richiede 1) conoscenza approfondita di premiere e after effects, basilare conoscenza di davinci, residenza entro 15 km da novara se interessati e in possesso di tutti i requisiti inviare un curriculum a job at giuseppegalliano.it e link a proprio show reel.
editor video con con approfondita e documentata esperienza si ricerca per posizione interna alla struttura; si richiede 1) conoscenza approfondita di premiere e after effects, basilare conoscenza di davinci, residenza entro 15 km da novara se interessati e in possesso di tutti i requisiti inviare un curriculum a job at giuseppegalliano.it e link a proprio show reel.
