In [2]:
# Load deps

# general-purpose imports
import feather # may need to put this in your .bashrc: export MACOSX_DEPLOYMENT_TARGET=10.10
import numpy as np
import os
import pandas as pd
import requests
import matplotlib.pyplot as plt

# ML deps
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import Imputer

# Image processing deps
from PIL import Image
from StringIO import StringIO

%matplotlib inline

# Control those annoying warnings
import warnings
warnings.filterwarnings('ignore')

# 1. Creating features from the listing images

In [3]:
# Read the raw training listings from the JSON file in the working directory.
train_json_path = "train.json"
df = pd.read_json(train_json_path)

In [18]:
# References: http://www.racketracer.com/2016/07/06/pandas-in-parallel/
import pandas as pd
import numpy as np
import seaborn as sns
import uuid
from multiprocessing import Pool

# Parallelisation settings read by parallelize_dataframe:
#   num_partitions -- how many pieces the dataframe is split into
#   num_cores      -- how many worker processes the pool starts
num_partitions, num_cores = 250, 7

def parallelize_dataframe(df, func):
    """
    Apply ``func`` to a dataframe in parallel and stitch the results together.

    The frame is split into ``num_partitions`` pieces with ``np.array_split``;
    each piece is handed to ``func`` in a pool of ``num_cores`` worker
    processes, and the returned pieces are concatenated in partition order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Called once per partition; must be picklable and return an object
        that ``pd.concat`` accepts (e.g. a DataFrame).

    Returns
    -------
    The ``pd.concat`` of every value returned by ``func``. Whatever index
    ``func`` puts on its output is kept as-is (no re-indexing happens here).

    Raises
    ------
    Any exception raised by ``func`` in a worker is re-raised here, after the
    pool has been shut down.
    """
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    try:
        pieces = pool.map(func, df_split)
    except BaseException:
        # Do not leave worker processes behind when a partition fails or the
        # run is interrupted.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()
    return pd.concat(pieces)

def append_image_features(data):
    """
    Compute the image-feature columns for one partition of listings.

    ``get_image_features`` is run on every entry of ``data['photos']`` and
    the resulting dictionaries are unpacked into one column per feature.
    The returned frame has one row per row of ``data`` (same order) but is
    built without an explicit index, so it does not carry ``data``'s index.
    """
    column_names = (
        'mean_red', 'mean_green', 'mean_blue',
        'std_red', 'std_green', 'std_blue',
        'img_resolution',
    )
    per_listing = data['photos'].map(get_image_features)

    columns = {}
    for column in column_names:
        columns[column] = np.array([features[column] for features in per_listing])
    return pd.DataFrame(columns)

def get_image_features(photo_url_list,
                       progress_dir='/Users/jlamb/repos/sandbox/tmp/',
                       timeout=30):
    """
    Create one row of features for a collection of images.

    Parameters
    ----------
    photo_url_list : list of str
        URLs of the photos belonging to one listing. May be empty.
    progress_dir : str or None
        Directory in which a uniquely named one-byte marker file is written
        on every call, so progress can be tracked by counting files. Pass
        ``None`` to skip writing the marker.
    timeout : float
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    dict
        Keys ``mean_red``, ``mean_green``, ``mean_blue``, ``std_red``,
        ``std_green``, ``std_blue`` and ``img_resolution``, each averaged
        over the images that were processed. Every value is NaN when the
        list is empty or when downloading/decoding fails.

    Raises
    ------
    IOError / OSError
        If ``progress_dir`` is given but the marker file cannot be written.
    """
    # Write a temp file to disk to track progress
    if progress_dir is not None:
        fname = os.path.join(progress_dir, str(uuid.uuid1()))
        with open(fname, 'w') as f:
            f.write('x')

    if len(photo_url_list) == 0:
        return _nan_image_features()

    try:
        # TESTING: Just use first image for now
        per_image = [
            _image_channel_stats(_fetch_rgb_image(url, timeout))
            for url in photo_url_list[:1]
        ]
    except Exception:
        # Best effort: a listing whose photo cannot be fetched or decoded
        # gets NaN features. KeyboardInterrupt/SystemExit are deliberately
        # not caught so a long run can still be stopped.
        return _nan_image_features()

    # Summarize across images
    return {
        name: np.mean(np.array([stats[name] for stats in per_image]))
        for name in _IMAGE_FEATURE_NAMES
    }


# Names of the features returned by get_image_features.
_IMAGE_FEATURE_NAMES = (
    'mean_red', 'mean_green', 'mean_blue',
    'std_red', 'std_green', 'std_blue',
    'img_resolution',
)


def _nan_image_features():
    """Return the feature dictionary with every value set to NaN."""
    return {name: float('nan') for name in _IMAGE_FEATURE_NAMES}


def _fetch_rgb_image(url, timeout):
    """Download ``url`` and return it as a (height, width, 3) RGB array."""
    # Get photo (http://stackoverflow.com/questions/7391945/how-do-i-read-image-data-from-a-url-in-python)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Force RGB so grayscale, palette and RGBA images all expose exactly
    # three colour channels on the last axis.
    return np.array(Image.open(StringIO(response.content)).convert('RGB'))


def _image_channel_stats(img):
    """Compute per-channel mean/std and element count of an RGB array."""
    # The colour channel is the LAST axis; img[:, c] would pick pixel
    # column c instead of channel c.
    red, green, blue = img[:, :, 0], img[:, :, 1], img[:, :, 2]
    return {
        'mean_red': red.mean(),
        'mean_green': green.mean(),
        'mean_blue': blue.mean(),
        'std_red': red.std(),
        'std_green': green.std(),
        'std_blue': blue.std(),
        # ndarray.size counts array elements (height * width * channels)
        'img_resolution': img.size,
    }

# Run the image-feature extraction over every listing, in parallel.
# X_full is just another name for df (no copy is made).
X_full = df
x_full_with_images = parallelize_dataframe(df, append_image_features)

In [64]:
# The per-partition feature frames each carry their own 0..k index, so the
# concatenated result has repeated index labels. Renumber the rows 0..n-1
# (row order is unchanged) so the frame can be written out and lined up
# positionally with the listings.
img_features_df = x_full_with_images.reset_index(drop=True)
assert len(img_features_df) == len(X_full), "expected one feature row per listing"

# Write to disk
feather.write_dataframe(img_features_df, 'img_df.feather')

# Concatenate into a single DF (X_full's original index is kept as a column)
train_df = pd.concat([X_full.reset_index(), img_features_df], axis=1)