# Feature Analysis of Website Imagery Content

## Imports

In [1]:
import numpy as np
import pandas as pd
import pprint
import os
import pymongo
import random
import re
import nltk
import pprint
from nltk.corpus import stopwords
from IPython.core.display import display, HTML
from bs4 import BeautifulSoup
from pymongo import MongoClient
from PIL import Image
from gridfs import GridFS
from sklearn.ensemble import RandomForestClassifier
from numpy.core.umath_tests import inner1d

  from numpy.core.umath_tests import inner1d


## Inputs

In [2]:
# Fraction of the table's rows drawn into each training sample.
percent_training = 0.85
# Value passed as ``n_estimators`` to the RandomForestClassifier.
n_estimators = 1000

# The argument for MongoClient is kept in a file outside the notebook.
# Read it inside a context manager so the file handle is closed right away,
# and strip surrounding whitespace so a trailing newline in the file does not
# end up inside the connection string.
with open('../../db.txt', encoding='UTF-8') as connection_file:
    connection_string = connection_file.read().strip()
client = MongoClient(connection_string)
db = client.knowledge

## Data Collection

In [3]:
# The `labels` collection of the database.
labels = db.labels

# Select only documents whose colorCount is not None, and project the result
# down to the fields used in the rest of the notebook.
image_filter = {'colorCount': {'$ne': None}}
image_fields = {
    '_id': 1,
    'url': 1,
    'label': 1,
    'colorCount': 1,
    'mostPopular': 1,
    'averageRed': 1,
    'averageGreen': 1,
    'averageBlue': 1,
}
data = labels.find(image_filter, image_fields)

table = pd.DataFrame(list(data))
# Binary target column: 1 when the label is 'y', 0 for every other label.
table = table.assign(
    educational=[1 if label == 'y' else 0 for label in table['label']]
)
print(table.head())

                        _id  averageBlue  averageGreen  averageRed colorCount  \
0  5bd5202204c0591b776aeb2e     1.750000      1.750000    1.750000          1   
1  5bd5203804c0591b776aeb2f     1.326043      1.339848    1.367402        290   
2  5bd5267d04c0591b776aeb30     1.614444      1.593742    1.527081         39   
3  5bd5268104c0591b776aeb31     1.593981      1.563667    1.461685         85   
4  5bddd8d8fcab5fa6b04cb12c     1.666679      1.663924    1.660496         91   

  label mostPopular                                                url  \
0     y         777         https://id.pausd.org/portal/p/applications   
1     m         777                           https://www.youtube.com/   
2     y         777  https://www.kaggle.com/juustiiiin/textscraper/...   
3     y         777                            https://www.kaggle.com/   
4     y         777  https://github.com/boovines/dataLabeler/blob/m...   

   educational  
0            1  
1            0  
2            1  


## Feature Extraction

In [4]:
def vectorizeMe(averageRed, averageGreen, averageBlue, colorCount, mostPopular):
    """Build the numeric feature vector for one row.

    The three channel averages are converted with ``float`` and the two
    remaining values with ``int``; anything those constructors reject raises
    their usual ``ValueError``/``TypeError``.

    Returns:
        list: ``[red, green, blue, colorCount, mostPopular]`` in that order.
    """
    channel_averages = [
        float(channel) for channel in (averageRed, averageGreen, averageBlue)
    ]
    counts = [int(colorCount), int(mostPopular)]
    return channel_averages + counts

# Columns that feed the feature vector, in the order vectorizeMe expects them.
feature_columns = ['averageRed', 'averageGreen', 'averageBlue', 'colorCount', 'mostPopular']

# One plain Python list per table row, holding just the feature columns.
interesting_rows = table[feature_columns].values.tolist()

vectorized_rows = [vectorizeMe(*row) for row in interesting_rows]

# Store each row's vector alongside the original columns.
table = table.assign(featureVector=vectorized_rows)

print(table.head())
print(table.shape)


                        _id  averageBlue  averageGreen  averageRed colorCount  \
0  5bd5202204c0591b776aeb2e     1.750000      1.750000    1.750000          1   
1  5bd5203804c0591b776aeb2f     1.326043      1.339848    1.367402        290   
2  5bd5267d04c0591b776aeb30     1.614444      1.593742    1.527081         39   
3  5bd5268104c0591b776aeb31     1.593981      1.563667    1.461685         85   
4  5bddd8d8fcab5fa6b04cb12c     1.666679      1.663924    1.660496         91   

  label mostPopular                                                url  \
0     y         777         https://id.pausd.org/portal/p/applications   
1     m         777                           https://www.youtube.com/   
2     y         777  https://www.kaggle.com/juustiiiin/textscraper/...   
3     y         777                            https://www.kaggle.com/   
4     y         777  https://github.com/boovines/dataLabeler/blob/m...   

   educational                                      featureVector  


## Training Data

In [5]:
def get_training_data():
    """Draw a random training sample from the module-level ``table``.

    The sample size is ``percent_training`` times the number of rows in
    ``table``, truncated to an integer. No random seed is passed to
    ``sample``, so repeated calls may return different rows.

    Returns:
        The sampled rows as returned by ``table.sample``.
    """
    total_rows = table.shape[0]
    sample_size = int(percent_training * float(total_rows))
    return table.sample(n=sample_size)

## Testing Data

In [6]:
def get_testing_data(training_data):
    """Return the rows of the module-level ``table`` not used for training.

    Rows are matched on their ``_id`` value: every row of ``table`` whose
    ``_id`` does not occur in *training_data* is kept.
    """
    used_ids = training_data['_id'].tolist()
    in_training = table['_id'].isin(used_ids)
    return table[~in_training]

## Model Training

In [7]:
def training_model(training_data):
    """Fit a random forest on the training rows.

    The ``featureVector`` column supplies the inputs and the ``educational``
    column the targets. The forest is built with the module-level
    ``n_estimators`` setting.

    Returns:
        The value returned by ``RandomForestClassifier.fit``.
    """
    # Turn the column of per-row vectors into a single array.
    features = np.array(list(map(np.array, training_data["featureVector"])))
    targets = training_data["educational"]
    classifier = RandomForestClassifier(n_estimators=n_estimators)
    return classifier.fit(features, targets)

## Prediction Analysis

In [8]:
def get_result(testing_data, forest):
    """Predict with *forest* for every row of *testing_data*.

    Args:
        testing_data: object whose ``"featureVector"`` entry iterates over
            one feature vector per row.
        forest: fitted model exposing ``predict``.

    Returns:
        Whatever ``forest.predict`` returns for the stacked feature vectors.
    """
    # Turn the column of per-row vectors into a single array.
    features = np.array(list(map(np.array, testing_data["featureVector"])))
    return forest.predict(features)

### Confusion Matrix

In [9]:
def create_confusion_matrix(testing_data, result):
    """Summarise binary predictions as percentages of all test rows.

    Args:
        testing_data: table-like object whose ``'educational'`` entry holds
            the actual 0/1 labels.
        result: predicted 0/1 labels, one per test row, in the same order.

    Returns:
        tuple: ``(true_positives, true_negatives, false_positives,
        false_negatives, event_count)``. The first four are percentages of
        ``event_count``, i.e. shares of the whole test set rather than
        per-class rates. ``event_count`` is the number of test rows as a
        float. An empty test set yields all zeros.
    """
    output = pd.DataFrame(data={"actual": testing_data['educational'], "predicted": result})
    event_count = float(output.shape[0])
    if event_count == 0:
        # Nothing to score; returning zeros avoids dividing by zero below.
        return 0.0, 0.0, 0.0, 0.0, event_count

    # Compare against 0 and 1 explicitly so that any other value (or NaN)
    # falls into none of the four cells.
    actual_positive = output['actual'] == 1
    actual_negative = output['actual'] == 0
    predicted_positive = output['predicted'] == 1
    predicted_negative = output['predicted'] == 0

    def percentage(mask):
        # Share of all test rows selected by the boolean mask.
        return float(mask.sum()) / event_count * 100

    return (
        percentage(actual_positive & predicted_positive),
        percentage(actual_negative & predicted_negative),
        percentage(actual_negative & predicted_positive),
        percentage(actual_positive & predicted_negative),
        event_count,
    )
            
            
#display(HTML("""
#    <h3>Confusion Matrix</h3>
#    <table>
#        <tr>
#            <th><b>Confusion Matrix Cell</b></th><th>Term (P)</th><th>Value (P)</th>
#        </tr>
#        <tr>
#            <th>True Positive</th><th>Sensitivity</th><td>%1.0f%%</td>
#        </tr>
#        <tr>
#            <th>False Positive</th><th>Fall-Out Rate</th><td>%1.0f%%</td>
#        </tr>
#        <tr>
#            <th>False Negative</th><th>Miss Rate</th><td>%1.0f%%</td>
#        </tr>
#        <tr>
#            <th>True Negative</th><th>Specificity</th><td>%1.0f%%</td>
#        </tr>
#    </table>
#""" % (float(true_positives) / eventCount *100, float(false_positives) /eventCount*100, float(false_negatives)/eventCount*100,float(true_negatives)/eventCount*100)))

## Analyze

In [10]:
# Repeat the split / train / predict / score cycle on ten independent random
# splits and collect one confusion-matrix tuple per split.
results = []

for trial in range(10):
    # Fresh split for this trial.
    training_data = get_training_data()
    testing_data = get_testing_data(training_data)

    # Fit on the training rows, then predict the held-out rows.
    model = training_model(training_data)
    result = get_result(testing_data, model)

    # Score the predictions and keep the summary.
    confusion_matrix = create_confusion_matrix(testing_data, result)
    results.append(confusion_matrix)

pprint.pprint(results)

[(8.75912408759124,
  51.82481751824818,
  14.5985401459854,
  24.817518248175183,
  137.0),
 (10.948905109489052,
  52.55474452554745,
  10.218978102189782,
  26.277372262773724,
  137.0),
 (17.51824817518248,
  42.33576642335766,
  12.408759124087592,
  27.73722627737226,
  137.0),
 (13.138686131386862,
  45.25547445255474,
  10.948905109489052,
  30.656934306569344,
  137.0),
 (13.86861313868613,
  48.9051094890511,
  15.328467153284672,
  21.897810218978105,
  137.0),
 (13.86861313868613,
  41.605839416058394,
  18.97810218978102,
  25.547445255474454,
  137.0),
 (15.328467153284672,
  48.9051094890511,
  6.569343065693431,
  29.1970802919708,
  137.0),
 (12.408759124087592,
  55.47445255474452,
  10.948905109489052,
  21.16788321167883,
  137.0),
 (15.328467153284672,
  45.98540145985402,
  20.437956204379564,
  18.248175182481752,
  137.0),
 (11.678832116788321,
  49.63503649635037,
  18.248175182481752,
  20.437956204379564,
  137.0)]
