# Feature Analysis of Website Imagery Content

## Imports

In [15]:
import numpy as pd
import pandas as pd
import pprint
import os
import pymongo
import random
import re
import nltk
from nltk.corpus import stopwords
from IPython.core.display import display, HTML
from bs4 import BeautifulSoup
from pymongo import MongoClient
from PIL import Image
from gridfs import GridFS
from sklearn.ensemble import RandomForestClassifier
from numpy.core.umath_tests import inner1d

## Inputs

In [16]:
# Fraction of the labelled rows used for training; the rest is held out for testing.
percent_training = 0.8

# The MongoDB connection string is kept in a file outside the notebook.
# Read it inside a `with` block so the file handle is closed deterministically,
# and strip surrounding whitespace (e.g. a trailing newline left by an editor)
# before handing it to the driver.
with open('../../db.txt', encoding='UTF-8') as connection_file:
    client = MongoClient(connection_file.read().strip())
db = client.knowledge

## Data Collection

In [17]:
labels = db.labels

# Only documents with a non-null colorCount are fetched, restricted to the
# fields the rest of the notebook reads.
label_filter = {'colorCount': {'$ne': None}}
label_fields = {
    '_id': 1,
    'url': 1,
    'label': 1,
    'colorCount': 1,
    'mostPopular': 1,
    'averageRed': 1,
    'averageGreen': 1,
    'averageBlue': 1,
}
data = labels.find(label_filter, label_fields)

# Materialise the cursor into a DataFrame (one row per document).
table = pd.DataFrame(list(data))

# Binary target: 1 when the label is exactly 'y', 0 for every other value
# (this includes any other label string as well as a missing label).
table = table.assign(educational=table['label'].map(lambda label: int(label == 'y')))
print(table.head())

                        _id  averageBlue  averageGreen  averageRed colorCount  \
0  5bd5202204c0591b776aeb2e     1.750000      1.750000    1.750000          1   
1  5bd5203804c0591b776aeb2f     1.326043      1.339848    1.367402        290   
2  5bd5267d04c0591b776aeb30     1.614444      1.593742    1.527081         39   
3  5bd5268104c0591b776aeb31     1.593981      1.563667    1.461685         85   
4  5bddd8d8fcab5fa6b04cb12c     1.666679      1.663924    1.660496         91   

  label mostPopular                                                url  \
0     y         777         https://id.pausd.org/portal/p/applications   
1     m         777                           https://www.youtube.com/   
2     y         777  https://www.kaggle.com/juustiiiin/textscraper/...   
3     y         777                            https://www.kaggle.com/   
4     y         777  https://github.com/boovines/dataLabeler/blob/m...   

   educational  
0            1  
1            0  
2            1  


## Feature Extraction

In [18]:
def vectorizeMe(averageRed, averageGreen, averageBlue, colorCount, mostPopular):
    """Return the five inputs as one flat feature list.

    The three colour averages are converted with ``float`` and the two
    remaining values with ``int``, in the order
    ``[red, green, blue, colorCount, mostPopular]``.
    """
    channel_means = [float(channel) for channel in (averageRed, averageGreen, averageBlue)]
    integer_features = [int(colorCount), int(mostPopular)]
    return channel_means + integer_features

# Columns that feed the feature vector, in the positional order vectorizeMe expects.
feature_columns = ['averageRed', 'averageGreen', 'averageBlue', 'colorCount', 'mostPopular']

# One plain Python list per table row, holding just the feature columns.
interesting_rows = table[feature_columns].values.tolist()

# Convert each row's raw values into a numeric feature vector.
vectorized_rows = [vectorizeMe(*row) for row in interesting_rows]

# Attach the vectors to the table as a new column.
table = table.assign(featureVector=vectorized_rows)

print(table.head())


                        _id  averageBlue  averageGreen  averageRed colorCount  \
0  5bd5202204c0591b776aeb2e     1.750000      1.750000    1.750000          1   
1  5bd5203804c0591b776aeb2f     1.326043      1.339848    1.367402        290   
2  5bd5267d04c0591b776aeb30     1.614444      1.593742    1.527081         39   
3  5bd5268104c0591b776aeb31     1.593981      1.563667    1.461685         85   
4  5bddd8d8fcab5fa6b04cb12c     1.666679      1.663924    1.660496         91   

  label mostPopular                                                url  \
0     y         777         https://id.pausd.org/portal/p/applications   
1     m         777                           https://www.youtube.com/   
2     y         777  https://www.kaggle.com/juustiiiin/textscraper/...   
3     y         777                            https://www.kaggle.com/   
4     y         777  https://github.com/boovines/dataLabeler/blob/m...   

   educational                                      featureVector  


## Training Data

In [21]:
# Number of rows in the training split: percent_training of the table, truncated to an int.
row_count = int(percent_training * float(table.shape[0]))

# Sample the training rows without replacement. A fixed random_state makes the
# split identical on every run, so the train/test partition (and anything
# computed from it) is reproducible after a kernel restart.
training_data = table.sample(n=row_count, random_state=42)
training_data.info()
training_data.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91 entries, 61 to 43
Data columns (total 10 columns):
_id              91 non-null object
averageBlue      91 non-null float64
averageGreen     91 non-null float64
averageRed       91 non-null float64
colorCount       91 non-null object
label            90 non-null object
mostPopular      91 non-null object
url              91 non-null object
educational      91 non-null int64
featureVector    91 non-null object
dtypes: float64(3), int64(1), object(6)
memory usage: 7.8+ KB


Unnamed: 0,_id,averageBlue,averageGreen,averageRed,colorCount,label,mostPopular,url,educational,featureVector
61,5bece857300d3d1e28d76ad8,0.832581,0.817328,0.911253,471,m,0,https://www.gettyimages.com/,0,"[0.9112531523034154, 0.8173275913423352, 0.832..."
24,5bece448300d3d1e28d76aad,1.474992,1.465219,1.455902,292,n,777,https://happybean.naver.com/flower/brand/craft...,0,"[1.4559024391006403, 1.4652191736069429, 1.474..."
84,5bf06763faf3b3be87a3e98f,1.016379,0.545124,0.036764,68,y,24,https://www.duolingo.com/,1,"[0.03676432291666667, 0.5451243681066177, 1.01..."
65,5bece91e300d3d1e28d76add,1.394971,1.394304,1.400877,183,n,777,https://www.nfl.com/,0,"[1.4008769948884032, 1.3943039107616668, 1.394..."
98,5bf1fdf3faf3b3be87a3e9a0,1.374347,1.300397,1.267305,346,y,777,https://www.scientificamerican.com/,1,"[1.2673047893419542, 1.300396515538638, 1.3743..."
103,5c02c8d738fa0f4064808df7,1.079656,0.939087,0.834681,212,y,777,https://artofproblemsolving.com/,1,"[0.8346808678057553, 0.9390872302158273, 1.079..."
45,5bece62c300d3d1e28d76ac6,1.512421,1.515729,1.524185,387,n,777,https://www.alibaba.com/,0,"[1.5241846498552167, 1.5157288687356771, 1.512..."
110,5c0477f89661c440f365c4ba,1.75,1.75,1.75,1,n,777,http://www.ikoreantv.com/board.php?board=ikore...,0,"[1.75, 1.75, 1.75, 1, 777]"
102,5bf5ad79045971ddc40a0723,1.593828,1.563627,1.461738,85,y,777,https://www.kaggle.com/,1,"[1.4617379427279578, 1.563626601356443, 1.5938..."
56,5bece75e300d3d1e28d76ad2,1.401587,1.411346,1.438633,373,n,777,https://www.daum.net/,0,"[1.4386334668001335, 1.4113462629295963, 1.401..."


## Testing Data

In [20]:
# The testing split is every table row whose _id was not drawn into training_data.
training_data_ids = training_data['_id'].tolist()
in_training = table['_id'].isin(training_data_ids)
testing_data = table[~in_training]
testing_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23 entries, 3 to 110
Data columns (total 10 columns):
_id              23 non-null object
averageBlue      23 non-null float64
averageGreen     23 non-null float64
averageRed       23 non-null float64
colorCount       23 non-null object
label            23 non-null object
mostPopular      23 non-null object
url              23 non-null object
educational      23 non-null int64
featureVector    23 non-null object
dtypes: float64(3), int64(1), object(6)
memory usage: 2.0+ KB
