In [1]:
import numpy as np
import pandas as pd

from PIL import Image
from io import BytesIO
import base64

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.dummy import DummyRegressor

import pickle

In [2]:
# Load the rows to score from the CSV file in the working directory.
test = pd.read_csv(filepath_or_buffer="test.csv")

In [3]:
# Replace missing texts with the empty string so every row holds a value
# the text vectorizer can consume.
test["text"] = test["text"].fillna(value="")

In [4]:
# Restore the pickled text vectorizer from disk.
# NOTE: unpickling executes arbitrary code from the file -- only load
# pickles that come from a trusted source.
with open('text_vectorizer.pickle', 'rb') as vectorizer_file:
    text_vectorizer = pickle.loads(vectorizer_file.read())

In [5]:
def img_vectorizer(photo_base64):
    """Turn a base64-encoded image into a fixed-length vector of colour statistics.

    The image is decoded, normalised to three RGB channels and summarised as:
    height and width (2 values), per-channel min, max, mean, std and median
    (5 x 3 values) and the three pairwise correlations between the channels
    (3 values) -- 20 values in total, in that order.

    Parameters
    ----------
    photo_base64 : str or bytes
        Base64-encoded contents of an image file that PIL can open.

    Returns
    -------
    numpy.ndarray
        1-D array of length 20. The correlation entries are NaN when a
        channel has zero variance (for example a single-colour image),
        because ``np.corrcoef`` divides by the channel's standard deviation.
    """
    with Image.open(BytesIO(base64.b64decode(photo_base64))) as picture:
        # Force exactly three channels. Grayscale images get the same value in
        # every channel and RGB images pass through unchanged; RGBA / LA /
        # palette images, which the previous in-place ``ndarray.resize`` call
        # silently truncated or zero-padded (scrambling the channel columns),
        # are now converted to real RGB values.
        # NOTE(review): if the training notebook has its own copy of this
        # function, apply the same change there so features stay consistent.
        img = np.array(picture.convert("RGB"))

    h, w = img.shape[:2]
    # reshape (unlike ndarray.resize) fails loudly if the element count is off.
    pixels = img.reshape(h * w, 3)

    # Channel-by-channel correlation matrix; only the strict upper triangle
    # (R-G, R-B, G-B) carries information.
    cm = np.corrcoef(pixels.T)

    stats = [
        np.array([h, w]),
        pixels.min(axis=0),
        pixels.max(axis=0),
        pixels.mean(axis=0),
        pixels.std(axis=0),
        np.median(pixels, axis=0),
        cm[np.triu_indices(len(cm), k=1)],
    ]
    return np.concatenate(stats)

In [6]:
# Featurise every photo and stack the per-image vectors row-wise into a
# single 2-D array (one row per entry of test['photo']).
X_img = np.vstack([img_vectorizer(encoded) for encoded in test['photo']])

In [7]:
# Encode the test texts with the vectorizer loaded from
# 'text_vectorizer.pickle'. Use transform(), not fit_transform(): re-fitting
# on the test texts would rebuild the vocabulary, so the resulting columns
# (their number and meaning) would no longer match the feature layout the
# vectorizer had when it was saved. If the pickled object was never fitted,
# transform() raises instead of silently producing mismatched features.
X_text = text_vectorizer.transform(test['text']).toarray()

In [8]:
# Final feature matrix: text features first, image statistics appended as
# extra columns on the right.
X = np.concatenate((X_text, X_img), axis=1)

In [9]:
# Names of the quantities to predict; each one is used as a key into the
# loaded model mapping and as a column of the output frame.
target = [
    "like",
    "comment",
    "hide",
    "expand",
    "open_photo",
    "open",
    "share_to_message",
]

In [10]:
# Start from an empty frame; one column per target is added to it later.
prediction = pd.DataFrame(data=None)

In [11]:
# Restore the pickled models; the object is indexed by target name below.
# NOTE: unpickling executes arbitrary code from the file -- only load
# pickles that come from a trusted source.
with open('model.pickle', 'rb') as model_file:
    model = pickle.loads(model_file.read())

In [12]:
# For every target, run its model on the feature matrix and scale the
# output by the row's `view` value before storing it as a column.
for column in target:
    prediction[column] = model[column].predict(X) * test['view']


In [13]:
# Show the assembled predictions (one column per target) as the cell output.
prediction

Unnamed: 0,like,comment,hide,expand,open_photo,open,share_to_message
0,50.553797,1.510173,1.222241,73.408697,84.534582,52.507750,5.198158
1,54.720973,1.634657,1.322991,79.459815,91.502812,56.835991,5.626645
2,353.733706,10.566942,8.552234,513.653414,591.503166,367.405852,36.372418
3,82.176710,2.454831,1.986789,119.328033,137.413492,85.352918,8.449762
4,181.450747,5.420404,4.386942,263.482936,303.416636,188.463992,18.657545
...,...,...,...,...,...,...,...
3522,180.355375,5.387682,4.360459,261.892357,301.584987,187.326282,18.544914
3523,158.328873,4.729693,3.827923,229.907880,264.752915,164.448436,16.280054
3524,36.790210,1.099019,0.889478,53.422721,61.519515,38.212187,3.782927
3525,118.395422,3.536778,2.862451,171.920887,197.977364,122.971518,12.173925


In [14]:
# Persist the predictions without the row index column.
prediction.to_csv(path_or_buf="submission.csv", index=False)