In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import shutil
import os

In [2]:
metadata_df = pd.read_csv("Image_Data/metadata.csv")

In [3]:
metadata_df.head(5)


Unnamed: 0,Record ID,File Name,Assignment ID,Year,Person,Keywords,Creators,Date
0,207431327,HearstTransportation_CAMP_MontrealTeam.png,CPR6931p,,Unidentified,"People @ Work,Hearst transportation,Transporta...",,
1,207431332,HearstTransportation_MicrosoftTeams_ZoomMeetin...,CPR6931p,,"Unidentified,Don Dickson,Danya Bynoe,Varun Kum...","People @ Work,Hearst transportation,Transporta...",,
2,207431348,HearstTransportation_BlackBook_LauraWehunt_1.png,CPR6931p,,Laura Wehunt,"People @ Work,Hearst transportation,Headshots,...",,
3,207431356,HearstTransportation_BlackBook_LauraWehunt_2.png,CPR6931p,,Laura Wehunt,"People @ Work,Hearst transportation,Headshots,...",,
4,207432753,HearstMagazines_CDS_Employee.png,CPR6931p,,Bryan Phillips,"People @ Work,CDS,Hearst Magazines,Hearst maga...",,


In [4]:
metadata_df.groupby(by = ['Keywords']).agg('count').sort_values(by = ['Record ID'], ascending=False).head(30)

Unnamed: 0_level_0,Record ID,File Name,Assignment ID,Year,Person,Creators,Date
Keywords,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"People @ Work,Homecoming,2013,Events,44th Floor,The Long Gray Line",443,443,443,443,416,0,0
"People @ Work,2018,Events,Friday,Texas,Texas 2018,Retreat",258,258,258,258,186,162,0
"Archdiocese of New York,Corporate,Al Smith Dinner 2021,2021,Al Smith Dinner,Alfred E. Smith Memorial Foundation,Alfred E. Smith Memorial Foundation Dinner,Charities,Executives,Events,Dinners,Catholic charities",230,230,0,230,230,230,0
"People @ Work,Spotlight 2016,Awards,Hearst Magazines,2016,Events,Spotlight,Magazines,44th Floor",140,140,0,140,129,1,0
"People @ Work,Frank A. Bennack,Frank Bennack,Events,FAB,2013,Homecoming,44th Floor,The Long Gray Line",127,127,127,127,127,68,0
"People @ Work,2018,Thursday,Events,Texas,Texas 2018,Retreat",114,114,114,114,109,80,0
"People @ Work,Tower Gala,Events,Hearst Tower",111,111,111,111,105,0,0
"People @ Work,2019,Events,Friday,Texas,Texas 2019",98,98,98,98,91,98,0
"People @ Work,Texas 2016,2016,Friday,Events,Texas",97,97,97,97,42,97,0
"People @ Work,Texas 2017,2017,Events,Friday,Texas",96,96,96,96,69,96,0


## Feature Engineering

#### Split on comma and find individual keywords

In [5]:
clean_df = metadata_df.copy()

In [6]:
clean_df = clean_df.join(clean_df['Keywords'].str.get_dummies(','))

In [7]:
tag_df = pd.DataFrame(clean_df.iloc[:,7:].sum(axis = 0), columns = ['Count']).sort_values(by = ['Count'], ascending = False)

In [8]:
tag_df['Tag_Name'] = tag_df.index
tag_df.reset_index(drop = True, inplace = True)

In [9]:
### Further steps
tag_df['Count_tag_appears'] = tag_df['Count']
tag_df = tag_df.iloc[:,1:]

In [10]:
# Creating five broad categories
time_list = ['Friday','Thursday','2019','2018','2016','2021','2017','2013']
tag_df.head()

Unnamed: 0,Tag_Name,Count_tag_appears
0,People @ Work,7058
1,Events,6483
2,44th Floor,2993
3,Texas,2090
4,Hearst Magazines,1611


In [11]:
tag_df[tag_df['Count_tag_appears']>100].count()

Tag_Name             116
Count_tag_appears    116
dtype: int64

#### Finding A Subset of Tags
* There are 116 tags that have a count of atleast 100 in the dataset
*Find all tags that have a count of atleast 100. Set this as an assorted list of tags
*Check in clean_df the count of images where any one tag appears which is in the assorted list of tags
*Remove year of picture From this tag list.
*Ultimately reduce tags created as dummy features in original dataset to reduce dimensionality

In [12]:
tag_list = list(tag_df[tag_df['Count_tag_appears']>140]['Tag_Name'])

In [13]:
#Count of images that can be excluded cannot be more than 0.5% of the dataset size. 0.5% of 9300 images roughly is 46.5. On Trial and Error a cutoff of 140 tags is found.
clean_df[(clean_df[tag_list].any(axis = 1))==False].head()

Unnamed: 0,Record ID,File Name,Assignment ID,Year,Person,Keywords,Creators,Date,Washington Week,#HearstElevatorSelfie,...,volunteering,weather,websites,woman,women's history month,workout,Gil Maurer,Headshots,People @ Work,Portraits
54,207993765,CPR7566_HTV_2020_YearEnder1.png,CPR7566,,,"Hearst Gives Back,Hearst Television,2020,Year ...",Hearst Television,,0,0,...,0,0,0,0,0,0,0,0,0,0
55,207993767,CPR7566_HTV_2020_YearEnder6.png,CPR7566,,Michael Armstrong,"Hearst Gives Back,Hearst Television,Year Ender...",Hearst Television,,0,0,...,0,0,0,0,0,0,0,0,0,0
56,207993773,CPR7566_HTV_2020_YearEnder5.png,CPR7566,,Unidentified,"Year Ender Video,Hearst Television,Wildfires,N...",Hearst Television,,0,0,...,0,0,0,0,0,0,0,0,0,0
57,207993776,CPR7566_HoustonChron_TogetherWeAreStrong.png,CPR7566,,,"Hearst Gives Back,Hearst Newspapers,Natural Di...",,,0,0,...,0,0,0,0,0,0,0,0,0,0
58,207993778,CPR7566_HTV_2020_YearEnder7.png,CPR7566,,,"Hearst Gives Back,Hearst Television,2020,Year ...",Hearst Television,,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
#Number of features to be used in dataset for training before feature importance is taken
len(tag_list)+ metadata_df.shape[1] - 1

103

In [15]:
image_df_original = clean_df.iloc[:,[0,1,2,3,4,6,7]]

In [16]:
image_df_original = image_df_original.join(clean_df[tag_list])

In [17]:
#Removing Duplicate Tags/Features(found due to case sensitivity) After Setting Original Tag to 1.
print('Before Duplication Hearst Magazines:', image_df_original['Hearst Magazines'].sum())
print('Before Duplication Magazines' ,image_df_original['Magazines'].sum())
print('Hearst magazines count' , image_df_original['Hearst magazines'].sum())
print('magazines count' , image_df_original['magazines'].sum())

image_df_original.loc[image_df_original['Hearst magazines']==1,'Hearst Magazines'] = 1
image_df_original.loc[image_df_original['magazines']==1,'Magazines'] = 1

print('After Duplication handling Hearst Magazines:' ,image_df_original['Hearst Magazines'].sum())
print('After Duplication handling Magazines' ,image_df_original['Magazines'].sum())

Before Duplication Hearst Magazines: 1611
Before Duplication Magazines 1494
Hearst magazines count 532
magazines count 456
After Duplication handling Hearst Magazines: 1611
After Duplication handling Magazines 1612


In [18]:
image_df_original.drop(labels=['Hearst magazines','magazines'], axis=1, inplace = True)
tag_list = [x for x in tag_list if x != 'Hearst magazines']
tag_list = [x for x in tag_list if x != 'magazines']


In [19]:

#image_df_original['target'] = image_df_original.iloc[:,7:].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)


In [20]:
df =pd.DataFrame(image_df_original[image_df_original['People @ Work']==1].iloc[:,8:].sum(axis = 0))
df.columns = ['counts']
df = df.reset_index()
df.head()

Unnamed: 0,index,counts
0,Events,5896
1,44th Floor,2989
2,Texas,2075
3,Hearst Magazines,1531
4,Magazines,1531


In [21]:
type(df['counts'].iloc[1])

numpy.int64

In [22]:
df.sort_values(by=['counts'],ascending =False).head()

Unnamed: 0,index,counts
0,Events,5896
1,44th Floor,2989
2,Texas,2075
3,Hearst Magazines,1531
4,Magazines,1531


In [23]:
#len(image_df_original['target'].unique())

## ETL For Final Dataset for Folder Creation

#### Write separate function or a combination of functions which follows the following steps:
1. Create folder for a tag in tag_list
2. filter images in clean_df for which contain that tag in the keyword
3. iterate through each file name in the clean_df. Concatenate with source parth and set it as src_dir
4. set destination dir as the folder created in step 1.

In [24]:
# #Create Directory using OS

# main_dir = "C:/Examples/Python_files/OS_module"
 
# os.makedirs(main_dir,mode = 0o666) 
# print("Directory '% s' is built!" % main_dir) 

# #copy image from one folder to another in python
# src_dir = "your/source/dir"
# dst_dir = "your/destination/dir"
# for jpgfile in glob.iglob(os.path.join(src_dir, "*.jpg")):
#     shutil.copy(jpgfile, dst_dir)

##### Step 1 : Defining a function to Create a Folder for a tag list.

In [25]:
image_df_original.head(2)

Unnamed: 0,Record ID,File Name,Assignment ID,Year,Person,Creators,Date,People @ Work,Events,44th Floor,...,HearstLab,Kate Lewis,David Carey,Colin Powell,Norman Foster,Culture Shift Labs,Hearst Health,Gathering,Horseback Riding,Ellen Levine
0,207431327,HearstTransportation_CAMP_MontrealTeam.png,CPR6931p,,Unidentified,,,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,207431332,HearstTransportation_MicrosoftTeams_ZoomMeetin...,CPR6931p,,"Unidentified,Don Dickson,Danya Bynoe,Varun Kum...",,,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
df = image_df_original.iloc[:,:8].rename(columns={'People @ Work':'isPAW'}).copy()
df.head(2)

Unnamed: 0,Record ID,File Name,Assignment ID,Year,Person,Creators,Date,isPAW
0,207431327,HearstTransportation_CAMP_MontrealTeam.png,CPR6931p,,Unidentified,,,1
1,207431332,HearstTransportation_MicrosoftTeams_ZoomMeetin...,CPR6931p,,"Unidentified,Don Dickson,Danya Bynoe,Varun Kum...",,,1


In [30]:
tags = ['Others','PAW']

In [31]:
def create_dir(image_df,tag_list):

## create folders
    for c,tag in enumerate(tag_list):
        main_dir = "Image_Data/"+str(tag)
        
        if not os.path.exists(main_dir):
            os.makedirs(main_dir,mode = 0o666) 
            print("Directory '% s' is built!" % main_dir) 

## filter dataframe on current tag to obtain image file list
        file_name_list = list(image_df[image_df['isPAW']==c]['Record ID'])
        #print(image_df[image_df['isPAW']==c])
    
## iterate through the file name list to set source and destination path based on current tag and copy from source to destination
        for i in file_name_list:
            #print(i)
            src_dir = "Image_Data/images/"+str(i)+".jpg"
            if os.path.exists(src_dir):
                shutil.copy(src_dir, main_dir)

In [32]:
create_dir(df,tags)

Directory 'Image_Data/Others' is built!


PermissionError: [Errno 13] Permission denied: 'Image_Data/Others/207993765.jpg'

##### Step 2 : filter images in clean_df for which contain that tag in the keyword

In [None]:
file_name_list = list(image_df_original[image_df_original[tag_list[0]]==1]['Record ID'].str.)

In [None]:
file_name_list

In [None]:
# src_dir = "C:/Users/abhij/images/"+str(file_name_list[0])+".jpg"
# dest_dir = "C:/Users/abhij"
# #shutil.copy(src_dir,dest_dir)

In [None]:
image_df_original