In [1]:
import pandas as pd

In [2]:
import re


def is_valid(val: str):
    pattern = r'[^a-zA-Z\s]'
    pat = re.compile(pattern)
    if pat.search(val):
        return True
    return False


a = [
    is_valid("hello"),
    is_valid("hi2"),
    is_valid("/this isfun"),
    is_valid("hel_lo"),
    is_valid("gre`n"),
]

a

[False, True, True, True, True]

In [3]:
def is_too_long(val: str, max: int = 20):
    return len(val) > max

In [4]:
import unicodedata


def is_ascii(val: str):
    for c in val:
        try:
            n = unicodedata.name(c).startswith("LATIN")
            if n:
                return True
        except ValueError:
            return False
    return False


b = [
    is_ascii("hello"),
    is_ascii("hi2"),
    is_ascii("نيقة")
]
b

[True, True, False]

In [6]:
def is_id_valid(id: str):
    return len(id) == 11


is_id_valid("-UrmKM-sTq0")

True

In [28]:
# remove rows with non ascii characters
# remove numerical rows
# remove long names
category_df = pd.read_csv('../data/collections.tsv000', sep='\t', header=0)
category_df.shape

(2396849, 4)

In [29]:
category_df['collection_title'] = category_df['collection_title'].astype(str)
category_df = category_df[~category_df['collection_title'].apply(is_too_long, args=(20,))]
category_df = category_df[~category_df['collection_title'].apply(is_valid)]
category_df['collection_title'] = category_df['collection_title'].str.lower()
category_df = category_df[category_df['collection_title'].apply(is_ascii)]
# category_df = category_df.groupby("photo_id")['collection_title'].apply(','.join).reset_index()
category_df = category_df.groupby(['photo_id', 'collection_title']).size().reset_index(name='count')
category_df.shape


(1280989, 3)

In [30]:
first_most_used_titles = category_df.groupby('photo_id').apply(
    lambda x: x.sort_values('count', ascending=False).head(1)).reset_index(drop=True)
category_df = category_df.merge(first_most_used_titles, on='photo_id', suffixes=('', '_first_most_used'))
category_df = category_df.drop(columns=['collection_title_first_most_used'])
category_df = category_df[['collection_title', 'photo_id']]
category_df.shape

(1280989, 2)

In [31]:
category_df.sample(10)

Unnamed: 0,collection_title,photo_id
989066,holiday,lpjb_UMOyx8
333346,wildlife,G0miZ5OYaXI
790590,terapias,betmVWGYcLY
270643,autumn,CqgbxNGsf6Y
594292,rds presentation,Swfc6shtGP4
900144,marine life,hYAkdFZb-Yg
137613,mystery,6LkxufmApSk
462464,beach theme,MhJJgPI7cNk
865368,koi wellbeing,fbAnIjhrOL4
584659,negative emotions,SbcqUQ4iEcI


In [37]:
tag_df = pd.read_csv('../data/keywords.tsv000', sep='\t', header=0)
tag_df.shape


(2666051, 5)

In [38]:
tag_df['keyword'] = tag_df['keyword'].astype(str)
tag_df = tag_df[~tag_df['keyword'].apply(is_too_long, args=(10,))]
tag_df = tag_df[~tag_df['keyword'].apply(is_valid)]
tag_df['keyword'] = tag_df['keyword'].str.lower()
tag_df = tag_df[tag_df['keyword'].apply(is_ascii)]
tag_df = tag_df.groupby(['photo_id', 'keyword']).size().reset_index(name='count')
tag_df.shape

(2375593, 3)

In [39]:
first_most_used_keywords = tag_df.groupby('photo_id').apply(
    lambda x: x.sort_values('count', ascending=False).head(10)).reset_index(drop=True)
tag_df = tag_df.merge(first_most_used_keywords, on='photo_id', suffixes=('', '_first_most_used'))
tag_df = tag_df.drop(columns=['keyword_first_most_used'])
tag_df = tag_df.drop_duplicates(subset=['keyword'])
tag_df = tag_df.groupby("photo_id")['keyword'].apply(','.join).reset_index()
tag_df = tag_df[['keyword', 'photo_id']]

tag_df.shape

(6995, 2)

In [40]:
tag_df.sample(20)

Unnamed: 0,keyword,photo_id
2307,"framework,spacement",CpHNKNRwXps
6855,tovel,y5vLKnZr6Zg
1907,"primordial,unusual,weird",9UKBsBpLwQo
6633,cher,vLCxJg4eWVU
4797,fish food,_U0SEppuF6Q
4328,"invitation,right hand",UvgzVZimyWU
354,gym,02T6r1rCHjM
6839,city night,xr43RescWSA
3315,stream bed,LIlG2UO71TY
5111,slops,cujzXuKmHwI


In [41]:
photo_df = pd.read_csv('../data/photos.tsv000', sep='\t', header=0)
photo_df = photo_df[['photo_id', 'photo_image_url', 'photo_description', 'ai_description']]

In [42]:
new_df = photo_df.merge(tag_df, on="photo_id", how="left")
new_df = new_df.merge(category_df, on="photo_id", how="left")
new_df = new_df[new_df['photo_id'].apply(is_id_valid)]
new_df.shape

(1281078, 6)

In [43]:
new_df.sample(10)

Unnamed: 0,photo_id,photo_image_url,photo_description,ai_description,keyword,collection_title
717480,-6JK87e42iQ,https://images.unsplash.com/photo-1546417492-5...,,aerial view photography of green body of water...,"airfield,atoll,banner,cable car,creek,lagoon,p...",the sea
1270786,o9qVQKUa3w4,https://images.unsplash.com/photo-1552863027-9...,,waterfalls in between of white rock formation,,strange
601180,kujXUuh1X0o,https://images.unsplash.com/photo-147573819823...,,silhouette photo of mountain during nighttime,,nubulea
797850,Tq4YjCa2BSc,https://images.unsplash.com/photo-142224635853...,pottery and craft,a close up of a person making something out of...,"course,mould,shaping",dosug
573230,BALZ8uogVjs,https://images.unsplash.com/photo-143122260626...,The Cabin,house on green grass field,,barn
32365,AQ9-jKmebjM,https://images.unsplash.com/photo-157858931843...,"Meandering wadis combine to form dense, branch...",an aerial view of a large body of water,"stark,wadis",earth ledger
1009420,ZVbv1akA-l4,https://images.unsplash.com/photo-143878665749...,Écosse valley and mountains,mountain covered with green grass,,myfavpix
912932,ap3LXI0fPJY,https://images.unsplash.com/photo-149849629466...,It was a beautiful and sunny day until I saw t...,closeup photo of clouds,"greecesky,lookup,whitecolor",hmmmm
703794,nSokqhD9e04,https://images.unsplash.com/photo-147497457396...,"Blue, red and yellow",three white plant pots,,schafer
874429,okVXy9tG3KY,https://images.unsplash.com/photo-150511838075...,Fuvahmulah maldives 🇲🇻,aerial photography of large body of water and ...,,cms portrait scapes


In [44]:
new_df = new_df.drop_duplicates(subset=['photo_id'])
new_df.reset_index(drop=True, inplace=True)
new_df.shape

(25000, 6)

In [45]:
new_df.sample(20)

Unnamed: 0,photo_id,photo_image_url,photo_description,ai_description,keyword,collection_title
17056,LwKfWWAdgcc,https://images.unsplash.com/photo-1556970334-8...,From sunlight gradient to a springlike pattern!,a man riding a snowboard down a snow covered s...,,abstract
21891,Ho6QQfohuYA,https://images.unsplash.com/photo-1553687616-c...,,white flower decor lot,,visual branding
9053,gQrYB3j9mJ0,https://images.unsplash.com/photo-150472275407...,Flying domestic from Newark to Milwaukee to st...,cloud formation,sas,v
2205,ePelyMHwUok,https://images.unsplash.com/photo-143229477061...,in ruins,landscape photography of forest during cloudy day,,ahista tea
5883,FahgY_bIPXg,https://images.unsplash.com/photo-1547593415-3...,,a close up of water droplets on a surface,,abstract
13376,lpi3K6TjssQ,https://images.unsplash.com/photo-1554189098-4...,,aerial photography of snow capped mountain,,aesthetic
14007,lApiQ3--c0k,https://images.unsplash.com/photo-1545504562-b...,,green leaf trees,,pathways
9181,quQe3CR6z14,https://images.unsplash.com/photo-157765636706...,,trees on hill,,background
14941,K3leeZ38kdk,https://images.unsplash.com/photo-157151182580...,A magical waterfall underpass in the summer,waterfalls,,aa
23041,C87vfR6C_aE,https://images.unsplash.com/photo-1555118370-1...,,white ceramic mug on brown wooden surface,,a
