In [1]:
import pandas as pd
from collections import Counter
import os
from textblob import TextBlob
import regex
import operator

# For bigrams creation
import nltk
from nltk import word_tokenize
from nltk.util import ngrams

The metadata file on [Google Docs](https://docs.google.com/spreadsheets/d/1YXusiepersJ6_XGoUVEE0jfGh5NJs-5Rds2_l5ZbGik/edit?usp=sharing)
    
[Here](http://cypernochkreta.dinstudio.se/text1_100.html) is a informative reference web site about the archeological excavation sites

# 0. Read in the metadata

In [2]:
def strip(text):
    try:
        return text.strip()
    except AttributeError:
        return text
    
cypern_converters = {"Fotonummer":strip,"Postnr":strip,"Nyckelord":strip,"Beskrivning":strip,"Land":strip,"foto":strip,
                     "Region, foto":strip,"Ort, foto":strip,"Geograf namn, alternativ":strip,"Fotodatum":strip,
                    "Personnamn / fotograf":strip, "Personnamn / avbildad":strip, "Sökord":strip,
                    "Händelse / var närvarande vid":strip, "Länk":strip}

cypern = pd.read_excel("excel-export.xls", sheetname="Cypern", converters=cypern_converters)

# 1. Create category/wikidata mapping tables

In [18]:
stopwords = [w.rstrip() for w in open("./stopwords.txt").readlines()]
stopwords

['i',
 'från',
 'och',
 'på',
 'in',
 'med',
 'vid',
 'mot',
 'efter',
 '1',
 'Från',
 'utanför',
 'fråga',
 'till',
 'av',
 '8',
 '14',
 '10',
 '11',
 'No',
 'en',
 'no',
 'nan',
 'rummet',
 'C',
 '18',
 '2',
 'rum',
 'I',
 '7',
 'J',
 '17',
 '16',
 '19',
 '22',
 'under',
 'fr',
 'Före',
 'hos',
 'mellan',
 'Se',
 '15',
 'T',
 '53',
 't',
 'E',
 'före',
 '5',
 '20',
 '4',
 '317',
 '9',
 'G',
 'går',
 'sitt',
 '152',
 '24',
 '501',
 '67',
 '12',
 'h',
 '117',
 'framför',
 'Z',
 '47',
 'också',
 'M',
 '93',
 '154',
 '32',
 '290',
 'över',
 '1927',
 'allt',
 '21',
 'öppnad',
 '25',
 'allo',
 '406',
 'lagad',
 'sista',
 'VI',
 'B',
 's',
 '39',
 'sig',
 'packar',
 'klippt',
 '318',
 'sen',
 'förbi',
 '56',
 'urskiljs',
 'blir',
 '50',
 'del',
 'åt',
 '170',
 '156',
 'T77',
 'sänds',
 'man',
 'kolla',
 'källan',
 'ett',
 'framgrävande',
 'stoppar',
 'bergen',
 'första',
 'Dagen',
 'det',
 'Rör',
 'T61',
 '65',
 'drar',
 'lastat',
 'L8',
 '15899D',
 '292',
 'vänster',
 '42',
 'skick',
 'här

## Column "Nyckelord"

In [4]:
list_of_strings = cypern.Nyckelord.values.astype("str")
chunk_of_strings = "" 

for string in list_of_strings:
    clean_string = regex.sub("[\"\'\.\!\?\:,;]| - ","",string) # remove .'s
    chunk_of_strings += " " + clean_string

chunk_of_strings # separated by ","

nyckelord_list = [phrase.strip() for phrase in chunk_of_strings.split(" ")]
#print(nyckelord_list[:10])

nyckelord_freq = Counter(nyckelord_list)
nyckelord_freq.most_common(20)

[('Svenska', 612),
 ('Cypernexpeditionen', 612),
 ('utgrävning', 59),
 ('utställning', 29),
 ('interiör', 29),
 ('exteriör', 23),
 ('arkeologi', 21),
 ('ritning', 19),
 ('staty', 11),
 ('porträtt', 7),
 ('gruppbild', 5),
 ('bad', 4),
 ('keramik', 3),
 ('barn', 3),
 ('bord', 3),
 ('präst', 2),
 ('kalksten', 2),
 ('tält', 2),
 ('väg', 2),
 ('kamel', 2)]

The mapping tables are pasted onto [Commons](https://commons.wikimedia.org/wiki/Commons:Medelhavsmuseet/batchUploads/Cypern_keywords)

In [5]:
header = "== Keywords from column '''Nyckelord''' (as is, separated by comma) ==\n"
header_row = """{| class="wikitable sortable" style="width: 60%; height: 200px;"
! Nyckelord
! frequency
! category
! wikidata
|-\n"""

data_rows = []
for kw, count in  nyckelord_freq.most_common(50): # original 12 stops at 4 occurances of "bad"
    nyckelord = "| " + kw + "\n"
    
    freq = "| " + str(count) + "\n"
    the_rest = "| \n| \n|-"
        
    row = nyckelord + freq + the_rest
    
    data_rows.append(row)
        
table_ending = "\n|}"
#print(data_rows)
nyckelord_wikitable = header + header_row + "\n".join(data_rows) + table_ending
print(nyckelord_wikitable)

== Keywords from column '''Nyckelord''' (as is, separated by comma) ==
{| class="wikitable sortable" style="width: 60%; height: 200px;"
! Nyckelord
! frequency
! category
! wikidata
|-
| Svenska
| 612
| 
| 
|-
| Cypernexpeditionen
| 612
| 
| 
|-
| utgrävning
| 59
| 
| 
|-
| utställning
| 29
| 
| 
|-
| interiör
| 29
| 
| 
|-
| exteriör
| 23
| 
| 
|-
| arkeologi
| 21
| 
| 
|-
| ritning
| 19
| 
| 
|-
| staty
| 11
| 
| 
|-
| porträtt
| 7
| 
| 
|-
| gruppbild
| 5
| 
| 
|-
| bad
| 4
| 
| 
|-
| keramik
| 3
| 
| 
|-
| barn
| 3
| 
| 
|-
| bord
| 3
| 
| 
|-
| präst
| 2
| 
| 
|-
| kalksten
| 2
| 
| 
|-
| tält
| 2
| 
| 
|-
| väg
| 2
| 
| 
|-
| kamel
| 2
| 
| 
|-
| kista
| 2
| 
| 
|-
| skelett
| 2
| 
| 
|-
| 
| 1
| 
| 
|-
| kranium
| 1
| 
| 
|-
| kärl
| 1
| 
| 
|-
| nan
| 1
| 
| 
|-
| relief
| 1
| 
| 
|-
| träd
| 1
| 
| 
|-
| landskap
| 1
| 
| 
|-
| huvud
| 1
| 
| 
|-
| brynja
| 1
| 
| 
|-
| tjur
| 1
| 
| 
|-
| kamera
| 1
| 
| 
|-
| karta
| 1
| 
| 
|-
| kruka
| 1
| 
| 
|-
| vattenpipa
| 1
| 
| 
|-


## Column "Beskrivning"

In [19]:
####### Unigrams ##############
clean_tokens_list = []
mega_string = ""
list_of_strings = []
for string in cypern.Beskrivning.values.astype("str"):
    mega_string += " " +string
    clean_string = regex.sub("[\"\”\'\.\!\?\:\,;]| - ","",string) # remove .'s
    #print(clean_string)
    tokens = clean_string.split(" ")
    clean_tokens = [word for word in tokens if word not in stopwords]
    #print(clean_tokens)
    for token in clean_tokens:
        clean_tokens_list.append(token)
    clean_string = " ".join(clean_tokens)
    clean_ended_string = clean_string + "." # add .'s again!

    list_of_strings.append(clean_ended_string)
    
#print("clean_tokens_list:\n{}".format(clean_tokens_list))
token_count = Counter(clean_tokens_list)
print("Token count:\n{}".format(token_count.most_common(1000)))

####### Bigrams ##############
chunk_of_strings = "" 
chunk_of_strings += list_of_strings[0]
for string in list_of_strings[1:]:
    chunk_of_strings += " " + string
#print(chunk_of_strings)


### bigrams initial approach
#blob = TextBlob(chunk_of_strings) # initiate empty blob

#bigrams = blob.ngrams(2)
#bigrams_list = []
#for wl in bigrams:
#    bigrams_list.append(" ".join(wl))

#bigrams_count = Counter(bigrams_list)
#print("Bigrams:\n{}".format(bigrams_count.most_common(50)))
#print()

### Bigrams second approach (to avoid bigrams made of end-word + first word next sentence)
token = nltk.word_tokenize(mega_string)
bigrams = ngrams(token,2)
bigrams_counter = Counter(bigrams)

clean_bigram_dict = {}
for each_tuple, freq in bigrams_counter.items():
    #print(each_tuple)
    w1, w2 = each_tuple
    forbidden_chars = set([",","."])
    if w1 in forbidden_chars or w2 in forbidden_chars:
        continue
    elif freq > 3:
        clean_bigram_dict[w1 + " " + w2] = freq
    else:
        continue
#print(type(bigrams_counter))
#print(list(bigrams_counter)[:2])
sorted_clean_bigram_dict = sorted(clean_bigram_dict.items(), key=operator.itemgetter(1), reverse=True)
print("Bigrams:\n{}".format(sorted_clean_bigram_dict))
#for bigram in sorted_clean_bigram_dict:
#    print(bigram)

# single words

Token count:
[('Vouni', 97), ('Stockholm', 55), ('Ajia', 49), ('Soli', 47), ('Irini', 47), ('Nicosia', 40), ('Westholm', 38), ('Marion', 38), ('Alfred', 35), ('Erik', 35), ('Gjerstad', 34), ('Sjöqvist', 32), ('Lapithos', 32), ('Lindros', 28), ('Amathus', 26), ('Idalion', 25), ('John', 23), ('Jakovos', 19), ('Ajios', 19), ('Liljevalchs', 19), ('Mersinaki', 19), ('Konsthall', 19), ('Einar', 19), ('Kition', 18), ('dromos', 18), ('Rum', 15), ('Kythrea', 14), ('Cholades', 13), ('Utsikt', 13), ('utgrävningen', 12), ('grav', 11), ('Volvo', 10), ('Milia', 10), ('Vivi', 10), ('Galini', 9), ('Macheras', 9), ('Oura', 9), ('', 8), ('fynd', 8), ('östra', 8), ('Fylleri', 8), ('Petra', 7), ('Panajiotis', 7), ('Souidos', 7), ('Nitovikla', 6), ('III', 6), ('Limniti', 6), ('arbetsbild', 6), ('Frv', 6), ('Enkomi', 6), ('Tou', 6), ('fyndlagret', 6), ('Ruta', 6), ('huset', 6), ('Rosa', 5), ('bild', 5), ('detalj', 5), ('nordväst', 5), ('Kristos', 5), ('Atterman', 5), ('Grottan', 5), ('Stylli', 5), ('orkestr

In [55]:
header = "== Keywords from column '''Beskrivning''' (två-ordskombinationer) ==\n"
header_row = """{| class="wikitable sortable" style="width: 60%; height: 200px;"
! Två-ordskombination
! frequency
! category
! wikidata
|-\n"""

data_rows = []
for kw, count in sorted_clean_bigram_dict: 
    nyckelord = "| " + kw + "\n"
    
    freq = "| " + str(count) + "\n"
    the_rest = "| \n| \n|-"
        
    row = nyckelord + freq + the_rest
    
    data_rows.append(row)
        
table_ending = "\n|}"
#print(data_rows)
nyckelord_wikitable = header + header_row + "\n".join(data_rows) + table_ending
print(nyckelord_wikitable)

== Keywords from column '''Beskrivning''' (två-ordskombinationer) ==
{| class="wikitable sortable" style="width: 60%; height: 200px;"
! Två-ordskombination
! frequency
! category
! wikidata
|-
| Svenska Cypernexpeditionen
| 581
| 
| 
|-
| Cypernexpeditionen Grav
| 56
| 
| 
|-
| Ajia Irini
| 47
| 
| 
|-
| Alfred Westholm
| 35
| 
| 
|-
| Erik Sjöqvist
| 32
| 
| 
|-
| in situ
| 30
| 
| 
|-
| Vouni Svenska
| 29
| 
| 
|-
| Stockholm Svenska
| 27
| 
| 
|-
| John Lindros
| 21
| 
| 
|-
| Liljevalchs Konsthall
| 19
| 
| 
|-
| Ajios Jakovos
| 19
| 
| 
|-
| på Liljevalchs
| 19
| 
| 
|-
| Utställning på
| 19
| 
| 
|-
| Cypernexpeditionen Teatern
| 18
| 
| 
|-
| Einar Gjerstad
| 17
| 
| 
|-
| Cypernexpeditionen Erik
| 16
| 
| 
|-
| Cypernexpeditionen Utställning
| 15
| 
| 
|-
| Soli Svenska
| 15
| 
| 
|-
| Cypernexpeditionen Arbetsbild
| 14
| 
| 
|-
| Cypernexpeditionen Cholades
| 12
| 
| 
|-
| Cypernexpeditionen Amathus
| 12
| 
| 
|-
| Cypernexpeditionen Lapithos
| 12
| 
| 
|-
| Cypernexpeditionen

In [20]:
header = "== Keywords from column '''Beskrivning''' (word-by-word) ==\n"
header_row = """{| class="wikitable sortable" style="width: 60%; height: 200px;"
! Ord
! frequency
! category
! wikidata
|-\n"""

data_rows = []
for kw, count in token_count.most_common(500): # original 50 stops at 3 occurances of "H"
    nyckelord = "| " + kw + "\n"
    
    freq = "| " + str(count) + "\n"
    the_rest = "| \n| \n|-"
        
    row = nyckelord + freq + the_rest
    
    data_rows.append(row)
        
table_ending = "\n|}"
#print(data_rows)
nyckelord_wikitable = header + header_row + "\n".join(data_rows) + table_ending
print(nyckelord_wikitable)

== Keywords from column '''Beskrivning''' (word-by-word) ==
{| class="wikitable sortable" style="width: 60%; height: 200px;"
! Ord
! frequency
! category
! wikidata
|-
| Vouni
| 97
| 
| 
|-
| Stockholm
| 55
| 
| 
|-
| Ajia
| 49
| 
| 
|-
| Soli
| 47
| 
| 
|-
| Irini
| 47
| 
| 
|-
| Nicosia
| 40
| 
| 
|-
| Westholm
| 38
| 
| 
|-
| Marion
| 38
| 
| 
|-
| Alfred
| 35
| 
| 
|-
| Erik
| 35
| 
| 
|-
| Gjerstad
| 34
| 
| 
|-
| Sjöqvist
| 32
| 
| 
|-
| Lapithos
| 32
| 
| 
|-
| Lindros
| 28
| 
| 
|-
| Amathus
| 26
| 
| 
|-
| Idalion
| 25
| 
| 
|-
| John
| 23
| 
| 
|-
| Jakovos
| 19
| 
| 
|-
| Ajios
| 19
| 
| 
|-
| Liljevalchs
| 19
| 
| 
|-
| Mersinaki
| 19
| 
| 
|-
| Konsthall
| 19
| 
| 
|-
| Einar
| 19
| 
| 
|-
| Kition
| 18
| 
| 
|-
| dromos
| 18
| 
| 
|-
| Rum
| 15
| 
| 
|-
| Kythrea
| 14
| 
| 
|-
| Cholades
| 13
| 
| 
|-
| Utsikt
| 13
| 
| 
|-
| utgrävningen
| 12
| 
| 
|-
| grav
| 11
| 
| 
|-
| Volvo
| 10
| 
| 
|-
| Milia
| 10
| 
| 
|-
| Vivi
| 10
| 
| 
|-
| Galini
| 9
| 
| 
|-
| Macheras
| 

## Merge keywords from columns Nyckelord and Beskrivning

In [9]:
tot_dict = {}
for bigram, count in bigrams_count.most_common(100):
    tot_dict[bigram] = count
for unigram, count in token_count.most_common(100):
    tot_dict[unigram] = count
sorted_tot_dict = sorted(tot_dict.items(), key=operator.itemgetter(1), reverse=True)
sorted_tot_dict

[('Vouni', 97),
 ('Stockholm', 55),
 ('Ajia', 49),
 ('Irini', 47),
 ('Soli', 47),
 ('Ajia Irini', 47),
 ('Nicosia', 40),
 ('Marion', 38),
 ('Westholm', 38),
 ('Erik', 35),
 ('Alfred Westholm', 35),
 ('Alfred', 35),
 ('Gjerstad', 34),
 ('Sjöqvist', 32),
 ('Erik Sjöqvist', 32),
 ('Lapithos', 32),
 ('Lindros', 28),
 ('Amathus', 26),
 ('Idalion', 25),
 ('John', 23),
 ('John Lindros', 21),
 ('Liljevalchs Konsthall', 19),
 ('Liljevalchs', 19),
 ('Mersinaki', 19),
 ('Einar', 19),
 ('Ajios', 19),
 ('Jakovos', 19),
 ('Ajios Jakovos', 19),
 ('Irini Ajia', 19),
 ('Konsthall', 19),
 ('Kition', 18),
 ('dromos', 18),
 ('Einar Gjerstad', 17),
 ('Rum', 15),
 ('Kythrea', 14),
 ('Utsikt', 13),
 ('Cholades', 13),
 ('utgrävningen', 12),
 ('Marion Marion', 12),
 ('Vouni Vouni', 11),
 ('grav', 11),
 ('Vivi', 10),
 ('Volvo', 10),
 ('Milia', 10),
 ('Soli Cholades', 9),
 ('Galini', 9),
 ('Macheras', 9),
 ('Vivi Gjerstad', 9),
 ('Oura', 9),
 ('Stockholm Liljevalchs', 9),
 ('Konsthall Liljevalchs', 9),
 ('', 8),

# 2. Create new filenames for the images

In [10]:
def save_filename_to_filename_file(filname_file, filename):
    """Create a file mapping original filenames and their folders with new
    Commons filenames according to <Task X on Phabricator>"""
    folder = row["Folder"]
    file = row["Filename"]
    # Filename: <Filename_1_clean>_-_DecArch_-_<Folder_#>-<Filename_0_clean>.<ext>
    
    #print("filename: {}".format(filename))
    filenames_file.write("{}|{}|{}\n".format(row["Folder"],row["Filename"],filename))

In [11]:
def get_foldernames_and_filenames(inpath):
    

SyntaxError: unexpected EOF while parsing (<ipython-input-11-04e94e1c82c0>, line 2)

In [None]:
def create_filenames(fold, foldobj):
    

In [None]:
filenames_file = open("./filenames_mapping.csv","w")
filenames_file.write("Folder|Original|Commons\n")

for row_index, row in cypern.iterrows():
    filename = create_filename(row["Folder"], row["Filename"])
    save_filename_to_filename_file(filenames_file, filename)

Infobox mapping is available on [Phabricator](https://phabricator.wikimedia.org/T144485)

# Cypern-samlingen

In [None]:
filenames_dict = {}
for index, row in cypern.iterrows():
    