In [None]:
import pandas as pd
from sklearn import linear_model
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn import linear_model,ensemble,metrics,preprocessing
import numpy as np
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Load the image table, discard the 'Unnamed: 0' column and every row that
# contains a missing value, then shuffle the remaining rows.
df = (
    pd.read_csv('/Users/richardknoche/Desktop/MainImages.csv')
    .drop(columns='Unnamed: 0')
    .dropna()
    # frac=1 keeps every row; the fixed seed makes the shuffled order repeatable.
    .sample(frac=1, random_state=102)
)

In [None]:
# Columns used as model inputs, in the order they are handed to the
# classifiers. Categorical columns are not included for now.
# NOTE(review): spellings such as 'Lapacian_*' and 'percieved_luminace' are
# presumably the actual CSV headers -- verify before "correcting" them.
features = [
    'price', 'original_price', 'age',
    'width', 'height', 'depth',
    'Num_Images', 'Duration', 'default_angle',
    'Thirds_Horizontal_Value_Sym', 'Salient_Hue', 'Complimentary_Color_Level',
    'Thirds_To_Focal_Distance', 'Vertical_Hue_Sym', 'Laplacian_Sharpness',
    'Salient_Value', 'Thirds_Horizontal_Saliency_Sym', 'Thirds_Value',
    'Thirds_Sat', 'B_Mean', 'V_var',
    'Thirds_Saliency', 'g_ygrad', 'Busyness',
    'Horizontal_Saturation_Sym', 'Thirds_Vertical_Saliency_Sym', 'r_ygrad',
    'B_Width', 'g_xgrad', 'G_Mean',
    'g_xgrad_std', 'Thirds_Vertical_Hue_Sym', 'Thirds_Horizontal_Hue_Sym',
    'Colorfulness', 'Vertical_Value_Sym', 'R_xgrad',
    'Horizontal_Value_Sym', 'Salient_Saturation', 'standard_luminance',
    'H_var', 'Thirds_Vertical_Value_Sym', 'Number_of_Contours',
    'Thirds_Vertical_Saturation_Sym', 'R_Width', 'FFT_Sharpness',
    'b_xgrad_std', 'V_mean', 'b_ygrad',
    'R_Mean', 'Thirds_Horizontal_Saturation_Sym', 'b_xgrad',
    'H_mean', 'r_ygrad_std', 'Histogram_Darkness',
    'g_ygrad_std', 'Thirds_Hue', 'S_mean',
    'S_var', 'G_Width', 'Lapacian_Saturation',
    'percieved_luminace', 'b_ygrad_std', 'r_xgrad_std',
    'Horizontal_Hue_Sym', 'Vertical_Saturation_Sym', 'Lapacian_Value',
    'Lapacian_Hue',
]

In [None]:
#top x%
percentiles = [0.2,0.4,0.6,0.8,1.0]

# Build the empirical CDF of view_rate once. It does not depend on the
# percentile, so computing it inside the loop only repeated the same work and
# re-drew the same histogram on every iteration.
# NOTE(review): view_rate values outside the bin range [0, 0.001] are not
# counted by the histogram and therefore not by the CDF -- confirm that this
# range covers the data.
count, edge, img = plt.hist(df['view_rate'], bins=np.linspace(0, 0.001, 500))
cdf = count.cumsum() / count.sum()
center = (edge[:-1] + edge[1:]) / 2

for percent in percentiles:
    # cdf is non-decreasing, so side='right' returns the index of the first bin
    # whose cumulative share exceeds (1 - percent): where the "top x" begins.
    thresh_idx = int(np.searchsorted(cdf, 1 - percent, side='right'))
    # Boolean label column: True when the row's view_rate is above the centre
    # of that threshold bin.
    df['Top %0.1f Percentile' % (percent)] = df['view_rate'] > center[thresh_idx]

In [None]:
# Scratch check: show the classifier variable name that is derived from
# whatever value `percent` currently holds (last character of its string form).
'classifier_%s0' % str(percent)[-1]

In [None]:
# Train one random-forest classifier per "Top x Percentile" label on a
# class-balanced sample, and overlay the ROC and precision-recall curves of all
# classifiers on two shared figures.
fig_roc, ax_roc = plt.subplots()
ax_roc.plot([0, 1], [0, 1], 'k--', label='Coin Flip')
ax_roc.set_xlabel('False positive rate')
ax_roc.set_ylabel('True positive rate')
ax_roc.set_title('ROC curve')

fig_pr, ax_pr = plt.subplots()
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('PR curve')

for percent in percentiles:

    col_to_predict = 'Top %0.1f Percentile' % (percent)
    curve_label = 'Top %0.1f Classifier' % (percent)

    true_df = df[df[col_to_predict]==True]
    false_df = df[df[col_to_predict]==False]
    num_minority = min(len(true_df), len(false_df))

    # Balanced training set: the same number of rows from each class, sized at
    # 60% of the smaller class.
    n_per_class = int(np.floor(num_minority*0.6))
    if n_per_class == 0:
        # Nothing to train on when one class is (almost) empty.
        print('Skipping %s: not enough rows in the smaller class' % col_to_predict)
        continue

    train_true = true_df.sample(n=n_per_class, random_state=102)
    train_false = false_df.sample(n=n_per_class, random_state=102)

    train = pd.concat([train_true, train_false])
    # Every row that was not sampled for training is held out for testing.
    test = df.drop(train.index)

    train_x = train[features]
    train_y = train[col_to_predict]

    test_x = test[features]
    test_y = test[col_to_predict]

    #Normalize features
    # The scaler is fitted on the training rows only and then applied to both
    # sets. Fitting a second scaler on the test rows (as was done before) puts
    # the test data on a different scale than the one the model was trained on.
    std_scale = preprocessing.StandardScaler().fit(train_x)
    train_x_features = std_scale.transform(train_x)
    test_x_features = std_scale.transform(test_x)

    #Train Model
    # Fixed seed so the forest, and therefore the curves, are reproducible.
    clf = ensemble.RandomForestClassifier(random_state=102)
    clf.fit(train_x_features, train_y)

    # Publish the fitted model as a notebook-level variable (classifier_20,
    # classifier_40, ...) so later cells can refer to it by name.
    classifier_name = 'classifier_%s0' % (str(percent)[-1])
    globals()[classifier_name] = clf

    y_probs = clf.predict_proba(test_x_features)[:, 1]

    #ROC
    fpr, tpr, thresholds = metrics.roc_curve(test_y, y_probs)
    ax_roc.plot(fpr, tpr, label=curve_label)

    #Accuracy
    print('Accuracy: ',clf.score(test_x_features,test_y))

    #Prec-Recall Curve
    precision, recall, thresholds = precision_recall_curve(test_y, y_probs)
    ax_pr.plot(recall, precision, label=curve_label)

# Legends come from the per-curve labels, so they stay correct even when a
# percentile was skipped above.
ax_pr.legend(loc=3)
ax_roc.legend(loc=4)



In [None]:
def CalcPrediction(row):
    """Return the smallest "top x%" bucket that the trained classifiers assign to a row.

    The row is passed to the notebook-level models ``classifier_20``,
    ``classifier_40``, ``classifier_60`` and ``classifier_80``; each one
    answers whether the row belongs to its bucket.

    Parameters
    ----------
    row : pandas.Series or array-like
        Feature values of a single sample, in the column order the
        classifiers were fitted with.

    Returns
    -------
    The smallest of 20/40/60/80 whose classifier predicted True, or NaN when
    none of them did. The value is returned (rather than written onto ``row``)
    so that ``DataFrame.apply(CalcPrediction, axis=1)`` yields the predictions.

    NOTE(review): the classifiers in this notebook are fitted on
    StandardScaler-transformed features, so ``row`` should be scaled the same
    way before calling this -- confirm at the call site.
    """
    # A pandas Series has no reshape(); go through a NumPy array to get the
    # (1, n_features) shape that predict() expects.
    sample = np.asarray(row).reshape(1, -1)

    models = (classifier_20, classifier_40, classifier_60, classifier_80)
    in_bucket = np.array([bool(model.predict(sample)[0]) for model in models], dtype=bool)

    buckets = pd.Series([20, 40, 60, 80])
    # Boolean mask keeps the buckets that fired; min() of an empty selection is NaN.
    return buckets[in_bucket].min()

    
# CAVEAT: `test_x` is reassigned for each classifier, so the rows scored here
# are only the hold-out rows of whichever classifier was trained last.
# Scoring every entry instead would include entries that a classifier was
# trained on.
test_x.apply(CalcPrediction, axis='columns')

In [None]:
#Normalize features
# The normalised arrays get their own names so that `train_x` / `test_x` stay
# DataFrames and earlier cells that use them can still be re-run.
# NOTE(review): preprocessing.normalize with its defaults rescales each row
# (sample) to unit norm, not each feature column -- confirm that is intended.
train_x_norm = preprocessing.normalize(train_x)
test_x_norm = preprocessing.normalize(test_x)

#Train Model
# Fixed seed so the results below are reproducible between runs.
clf = ensemble.RandomForestClassifier(random_state=102)
clf.fit(train_x_norm, train_y)
predicted_y = clf.predict(test_x_norm)
y_probs = clf.predict_proba(test_x_norm)[:, 1]

#ROC
fpr, tpr, thresholds = metrics.roc_curve(test_y, y_probs)
fig_single_roc, ax_single_roc = plt.subplots()
ax_single_roc.plot([0, 1], [0, 1], 'k--')
ax_single_roc.plot(fpr, tpr, label='RF')
ax_single_roc.set_xlabel('False positive rate')
ax_single_roc.set_ylabel('True positive rate')
ax_single_roc.set_title('ROC curve')
ax_single_roc.legend(loc='best')
plt.show()


#Accuracy and average precision
print('Accuracy: ',clf.score(test_x_norm,test_y))

precision, recall, thresholds = precision_recall_curve(test_y, y_probs)
average_precision = average_precision_score(test_y, y_probs)
print(average_precision)


#Prec-Recall Curve
fig_single_pr, ax_single_pr = plt.subplots()
ax_single_pr.plot([0, 1], [1, 1], 'k--')
ax_single_pr.plot(recall, precision, label='RF')
ax_single_pr.set_xlabel('Recall')
ax_single_pr.set_ylabel('Precision')
ax_single_pr.set_title('PR curve')
ax_single_pr.legend(loc='best')
plt.show()


# Per-class precision / recall / F-score / support at the default decision
# threshold, shown as the cell's output table.
precision, recall, fscore, support = metrics.precision_recall_fscore_support(test_y, predicted_y)
classifier_metrics = pd.DataFrame( {'Precision' : precision, 'Recall' : recall, 'F-Score' : fscore, 'Support' : support})
classifier_metrics

In [None]:
# Rank the features of the fitted forest `clf` by importance, highest first.
importances = clf.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in clf.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

rankings = []
for rank, feature_idx in enumerate(indices, start=1):
    rankings.append(importances[feature_idx])
    # The name is looked up with the same index as the importance; using the
    # loop position for the name paired values with the wrong features.
    print("%d. %s %f" % (rank, features[feature_idx], importances[feature_idx]))

In [None]:
# Bar chart of the feature importances, most important first.
positions = range(len(features))
# Take the tick labels through `indices`, the same ordering used for the bar
# heights, so each bar carries the name of the feature it represents.
sorted_names = [features[i] for i in indices]

plt.figure(figsize=(14,7))
plt.title("Feature importances")
plt.bar(positions, importances[indices],
       color="r", align="center")
plt.xticks(positions, sorted_names, rotation=90)
plt.xlim([-1, len(features)])
plt.show()