# Imports

In [52]:
import os
import csv
import bz2
import time
import re
import pandas as pd
import numpy as np
from io import StringIO
import seaborn as sns
import re

# Silence warnings
import warnings
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None

In [2]:
# SPARQL

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkFiles

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Extract bz2

# Main dataset

In [3]:
df_full = spark.read.format("csv").option("delimiter", "\t").load("yfcc100m_dataset.csv")

In [4]:
col_names = ['index', 'data_identifier', 'unknown_1', 'user_identifier',
              'username', 'date', 'computer_time', 'camera', 
              'photo_filename', 'description', 'user_tags', 'external_upload_mean',
              'longitude', 'latitude', 'unknown_2', 'flickr_url', 
              'image_direct_url', 'license_name', 'license_doc', 'flickr_photo_pool',
              'unknown_3', 'image_identifier_2', 'image_identifier_1', 'format', 'unknown_bool']

In [5]:
for i in range(len(col_names)):
    df_full = df_full.withColumnRenamed('_c' + str(i), col_names[i])

In [6]:
df_full.printSchema()

root
 |-- index: string (nullable = true)
 |-- unknown_identifier: string (nullable = true)
 |-- unknown_1: string (nullable = true)
 |-- user_identifier: string (nullable = true)
 |-- username: string (nullable = true)
 |-- date: string (nullable = true)
 |-- computer_time: string (nullable = true)
 |-- camera: string (nullable = true)
 |-- photo_filename: string (nullable = true)
 |-- description: string (nullable = true)
 |-- user_tags: string (nullable = true)
 |-- external_upload_mean: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- unknown_2: string (nullable = true)
 |-- flickr_url: string (nullable = true)
 |-- image_direct_url: string (nullable = true)
 |-- license_name: string (nullable = true)
 |-- license_doc: string (nullable = true)
 |-- flickr_photo_pool: string (nullable = true)
 |-- unknown_3: string (nullable = true)
 |-- image_identifier_2: string (nullable = true)
 |-- image_identifier_1: string (nullabl

In [7]:
df_full.show(5)

+-----+------------------+--------------------+---------------+-------------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+---------+---------+--------------------+--------------------+--------------------+--------------------+-----------------+---------+------------------+------------------+------+------------+
|index|unknown_identifier|           unknown_1|user_identifier|     username|                date|computer_time|              camera|      photo_filename|         description|           user_tags|external_upload_mean| longitude| latitude|unknown_2|          flickr_url|    image_direct_url|        license_name|         license_doc|flickr_photo_pool|unknown_3|image_identifier_2|image_identifier_1|format|unknown_bool|
+-----+------------------+--------------------+---------------+-------------+--------------------+-------------+--------------------+--------------------+--------

In [8]:
n = 10
for i in range(n):
    print(df_full.take(n)[i]['flickr_url'])

http://www.flickr.com/photos/39089491@N00/6985418911/
http://www.flickr.com/photos/55289779@N00/10201275523/
http://www.flickr.com/photos/8134076@N03/2297552664/
http://www.flickr.com/photos/53430201@N03/7289030198/
http://www.flickr.com/photos/12940050@N02/4140939180/
http://www.flickr.com/photos/80547277@N00/9506922316/
http://www.flickr.com/photos/42132616@N04/4436463882/
http://www.flickr.com/photos/78969707@N00/4572998878/
http://www.flickr.com/photos/97468058@N00/9329902958/
http://www.flickr.com/photos/36813788@N00/3174965401/


In [10]:
count = df_full.count()
count

KeyboardInterrupt: 

In [9]:
df.createOrReplaceTempView("dataset")
list_ = spark.sql("SELECT index, user_tags FROM dataset")
list_.show()

+-----+--------------------+
|index|           user_tags|
+-----+--------------------+
|    0|canon,canon+power...|
|    1|                null|
|    2|                null|
|    3|blip,blip12ny,bli...|
|    4|2009,amsterdam,cl...|
|    5|ben,cdc,christchu...|
|    6|                null|
|    7|  diana,matt,wedding|
|    8|bantockpark,fayre...|
|    9|      2009,new,years|
|   10|digikam,e.phelt,k...|
|   11|                null|
|   12|arches+national+p...|
|   13|adelboden,neige,s...|
|   14|                null|
|   15|cech,cs,on+tour,p...|
|   16|                null|
|   17|         appart,chat|
|   18|%D8%A2%D8%AB%D8%A...|
|   19|                null|
+-----+--------------------+
only showing top 20 rows



In [14]:
min_words_list_ = ['minimalis', 'минимализм', '极简主义', 'ミニマリズム', '미니멀리즘']

In [23]:
#filter rows that contain a tag that starts with 'minimalis'
list_ = list_.filter(col('user_tags').rlike('(minimalis)|(минимализм)|(极简主义)|(ミニマリズム)|(미니멀리즘)'))
list_.show()

+------+--------------------+
| index|           user_tags|
+------+--------------------+
| 13558|afternoon,almost+...|
| 34553|0xb69d80,cape+ten...|
| 37075|a.schacht+90mm+2....|
| 65415|ambience,appearan...|
| 67744|abstract,beach,cl...|
| 94518|b%C3%B6,boris,gam...|
|108606|d80,door+light,mi...|
|120211|ambience,appearan...|
|134120|action,american,a...|
|160303|concrete,library,...|
|170802|animation,barefoo...|
|171536|black,black+and+w...|
|178141|china,d700,digita...|
|194567|abstraction,candl...|
|197353|berlin,minimal,mi...|
|197893|architecture,cali...|
|207656|chocolate+and+ste...|
|234840|black+and+white,m...|
|250039| minimalism,swallows|
|264985|18thstreet,a6,arr...|
+------+--------------------+
only showing top 20 rows



In [22]:
list_.count()

4356

In [25]:
df_min_ = df_full.join(list_, ["user_tags"], "inner")

In [27]:
df_min_.show()

+--------------------+--------+------------------+--------------------+---------------+-----------------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+---------+---------+---------+--------------------+--------------------+--------------------+--------------------+-----------------+---------+------------------+------------------+------+------------+--------+
|           user_tags|   index|unknown_identifier|           unknown_1|user_identifier|         username|                date|computer_time|              camera|      photo_filename|         description|external_upload_mean|longitude| latitude|unknown_2|          flickr_url|    image_direct_url|        license_name|         license_doc|flickr_photo_pool|unknown_3|image_identifier_2|image_identifier_1|format|unknown_bool|   index|
+--------------------+--------+------------------+--------------------+---------------+-----------------+--------------------+------

In [29]:
#df_min_.toPandas().to_csv('data/minimalis_tag_dataset.csv')
df_min_.write.format("csv").option("delimiter", "\t").save('data/minimalis_tag_dataset.csv')

# Autotags dataset

In [6]:
df = spark.read.format("csv").option("delimiter", "\t").load("yfcc100m_autotags.csv")

In [7]:
col_names = ['data_identifier', 'autotags']

In [8]:
for i in range(len(col_names)):
    df = df.withColumnRenamed('_c' + str(i), col_names[i])

In [9]:
df.show(1)

+---------------+--------------------+
|data_identifier|            autotags|
+---------------+--------------------+
|     6985418911|atoll:0.5220,bay:...|
+---------------+--------------------+
only showing top 1 row



In [10]:
df.createOrReplaceTempView("dataset")
list_ = spark.sql("SELECT data_identifier, autotags FROM dataset")

In [11]:
list_ = list_.filter(col('autotags').rlike('(minimalis)|(минимализм)|(极简主义)|(ミニマリズム)|(미니멀리즘)'))

In [12]:
list_.show(5)

+---------------+--------------------+
|data_identifier|            autotags|
+---------------+--------------------+
|     6295196910|abstract:0.9150,a...|
|     5392941214|abstract:0.6880,a...|
|     2418950445|abstract:0.9080,a...|
|     2397482849|art:0.5260,curve:...|
|     8274058040|aircraft:0.5340,a...|
+---------------+--------------------+
only showing top 5 rows



In [13]:
df_min_autotags = df_full.join(list_, ["data_identifier"], "inner")

In [16]:
df_min_autotags.write.format("csv").option("delimiter", "\t").save('data/minimalis_autotag_subset.csv')

## Check and merge csv files

In [88]:
def keep_high_proba(df):
    
    df_ = df.copy()
    list_ = []
    for ind, item in df['autotags'].iteritems():
        val = re.findall('(?<=(minimalism:0.))([0-9]+)', item)[0][1][0]
        if int(val) >= 6:
            list_.append(ind)
    
    df_ = df_.iloc[list_]
    return list_

In [34]:
DATA = 'data/minimalis_autotag_subset.csv/'
headers = ['index', 'data_identifier', 'unknown_1', 'user_identifier', 'username', 'date', 
           'computer_time', 'camera', 'photo_filename', 'description', 'user_tags', 'external_upload_mean',
           'longitude', 'latitude', 'unknown_2', 'flickr_url', 'image_direct_url', 'license_name', 
           'license_doc', 'flickr_photo_pool', 'unknown_3', 'image_identifier_2', 'image_identifier_1', 
           'format', 'unknown_bool', 'autotags']

folder = os.listdir(DATA)  

In [43]:
folder_ = []
for file in folder:
    if (file[0] != '.') & (file[0] != '_'):
        folder_.append(file)
folder = folder_

In [91]:
df_autotags = pd.DataFrame({})
for file in folder:
    df_ = pd.read_csv(DATA + file, sep = '\t', names = headers)
    df_autotags = pd.concat([df_autotags, df_])
    df_autotags = df_autotags.drop_duplicates(subset = 'index')
    print('.', end = '') 

........................................................................................................................................................................................................

In [92]:
keep_columns = ['index', 'user_identifier', 'description', 'date', 'longitude', 'latitude',
                'external_upload_mean', 'camera', 'license_name', 'flickr_url', 'image_direct_url',
                'user_tags', 'autotags']

In [93]:
df_autotags = df_autotags[keep_columns]

In [95]:
df_autotags.head(1)

Unnamed: 0,index,user_identifier,description,date,longitude,latitude,external_upload_mean,camera,license_name,flickr_url,image_direct_url,user_tags,autotags
0,10000979206,66172503@N00,,2013-09-30 00:26:09.0,14.499968,46.053249,"foursquare%3Avenue%3D4e2b1e5a315185636a08f0dd,...",,Attribution License,http://www.flickr.com/photos/66172503@N00/1000...,http://farm4.staticflickr.com/3702/10000979206...,"instagram+app,iphoneography,nashville,square,s...","abstract:0.8090,architecture:0.9030,bright:0.6..."


In [97]:
df_tags = pd.read_csv('data/minimalis_tag_dataset.csv').drop(columns = 'Unnamed: 0')
df_tags['autotags'] = np.nan
df_tags = df_tags[keep_columns]

In [99]:
df = pd.concat([df_autotags, df_tags])
df = df.drop_duplicates(subset = 'index')

In [100]:
len(df)

1032133

In [101]:
df.to_csv('data/corpus.csv')