Specify code parameters.

In [11]:
user = 'chris'  # 'chris' or 'elias'
embedding_type = 'tf'  # 'tf' or 'tf-df'
is_debug = False
K = 1000  # number of principal components

In [2]:
if user == 'elias':
    project_dir = 'C:/Users/Elias/Documents/DME/'
    data_dir = 'C:/Users/Elias/Documents/DME/data/'
elif user == 'chris':
    project_dir = '/Users/sipola/Google Drive/education/coursework/graduate/edinburgh/dme/project'
    data_dir = '/Users/sipola/Desktop/v2-all_in_one'
else:
    raise ValueError('Give valid user')

Import everything else.

In [136]:
os.chdir(project_dir)  # to import scripts

import os
import numpy as np
import re
import time
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import seaborn as sns
from scripts import *

%matplotlib inline

os.chdir(data_dir)  # since we'll be pulling everything from here

In [5]:
# Read labels in first so know which histids to keep when reading tfdf data.
y_full, keys_labels = read_transform_labels('labels/en.useful-labels.csv')
# print y_full.head()
# print keys_labels

In [8]:
# Create dictionary with {hostid: sparse vector}. There are something like 60,000 rows in the full data set (unless subsetting for hostids.).
t0 = time.time()
tfdf_filename = 'features/tfdf/v2-host_tfdf_en.txt' if user == 'elias' else 'features/tfdf/v2-host_tfdf.en.txt'
sparse = create_sparse_embeddings(tfdf_filename,
                                  hostids=y_full['ID'].tolist(),
                                  embedding_type=embedding_type)
print_run_time(t0)

run time: 0.46 min


In [12]:
if is_debug:
    # Look at entry for one hostid.
    print 'num rows in data: {}'.format(len(tf_idf_sparse))
    print 15*'='
    print 'first row:\nhostid: {}\nwordids and tf-idf values: {}'.format(tf_idf_sparse.keys()[0], tf_idf_sparse.values()[0])

In [15]:
# Convert to DataFrame.
X = pd.DataFrame.from_dict(sparse, orient='index')  # convert to DataFrame

if embedding_type == 'tf-idf':
    
    min_val = min(X.min())  # get min of entire matrix
    if is_debug:
        print 'minimum value: {}'.format(min_val)
    X = X - min_val  # make so lowest value is zero

X = X.fillna(0)  # fill NaNs with zeros

if is_debug:
    X  # show X

In [16]:
if is_debug:
    print X.shape

In [17]:
hostids = X.index.values  # save for later

In [45]:
X_scaled = StandardScaler().fit_transform(X)  # is this how you do this??????

In [46]:
# Fit PCA incrementally (otherwise there are memory issues)
# http://stackoverflow.com/a/32191686
t0 = time.time()
ipca = IncrementalPCA(n_components=K)
X_fit = ipca.fit_transform(X_scaled)
print_run_time(t0)

run time: 0.94 min


In [85]:
if is_debug:
    sns.distplot(X_fit.ix[:,1])
    print np.std(X_fit)
    print X_fit.shape

In [48]:
X_fit = pd.DataFrame(X_fit)
X_fit['ID'] = hostids
if is_debug:
    X_fit

In [87]:
X_full = pd.merge(y_full, X_fit, on='ID')

In [118]:
# y_full.ix[y_full.loc[:,'ID'] == 88495]

In [119]:
# X_full.ix[X_full.duplicated(subset='ID', keep=False)]
if is_debug:
    print 'numer of rows: {}'.format(X_full.shape)
X_full = X_full.drop_duplicates(subset='ID')
if is_debug:
    print 'numer of rows: {}'.format(X_full.shape)

In [127]:
# if is_debug:
#     # Why are there now more rows? Duplicates in label data?
#     print 'num IDs in full but not in fit: {}'.format(sum([x not in X_fit['ID'].tolist() for x in X_full['ID'].tolist()]))  # 0
#     print 'num IDs in labels: {}'.format(len(y_full['ID'].tolist()))
#     print 'num *unique* IDs in labels: {}'.format(len(set(y_full['ID'].tolist())))

#     # Answer: yes, duplicates in label data.

In [132]:
X_train_keys = X_fit.keys().drop('ID')

In [342]:
if is_debug:
    print keys_labels

In [225]:
def get_train_valid_data(X_full, label, X_train_keys, is_debug=False):
    
    # Remove rows with bad target data.
    if is_debug:
        print 'dropping {} out of {} rows for bad {} data...'.format(sum(X_full.loc[:,label] == 0), X_full.shape[0], label)
    X_ = X_full.ix[X_full.loc[:,label] != 0]
    
    # Add y column (binary: 0, 1).
    X_.y = [int(x) for x in X_.loc[:,label] == 1]
    
    # Drop unneeded columns.
    keep_col_bool = [c in X_train_keys or c=='y' for c in X_.columns]
    X_ = X_.loc[:, keep_col_bool]
    
    # Split into test and validation.
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
    X_train, X_valid, y_train, y_valid = train_test_split(X_[X_train_keys], X_['y'])
    
    return X_train, X_valid, y_train, y_valid

# get_train_valid_data(X_full, labels[0], X_train_keys)[0]

In [343]:
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score

labels = ['News/Editorial', 'Commercial', 'Educational/Research', 'Discussion', 'Personal/Leisure', 'Media']
models_dict = {'dummy': DummyClassifier(),
               'logistic regression': LogisticRegression(),
               'naive Bayes': GaussianNB(),
               'MLP': None  #MLPClassifier()  # needs K, so defined in K loop
              }
K_to_use = [1, 2, 3, 4, 5, 7, 10, 25, 100, 250, 1000]

stats = {}

for label in labels:
    
    print '\n=====\nlabel: {}\n====='.format(label)
    label_stats = {}

    for model_name, model in models_dict.iteritems():

        print 'model: {}'.format(model_name)
        model_stats = {}

        for kk in K_to_use:

            K_stats = {}
            
            if model_name == 'MLP':
                model = MLPClassifier(hidden_layer_sizes=(kk, np.ceil(np.sqrt(kk)),))

            # print '\trunning k={}...'.format(kk)
            # print model

            X_train, X_valid, y_train, y_valid = get_train_valid_data(X_full, label, X_train_keys)
            X_train = X_train.loc[:, range(kk)]
            X_valid = X_valid.loc[:, range(kk)]

            fit = model.fit(X_train, y_train)

            K_stats['acc(train)'] = fit.score(X_train, y_train)
            K_stats['acc(valid)'] = fit.score(X_valid, y_valid)
            K_stats['f1(train)'] = f1_score(y_train, fit.predict(X_train))
            K_stats['f1(valid)'] = f1_score(y_valid, fit.predict(X_valid))

            model_stats[kk] = K_stats

        label_stats[model_name] = model_stats
        
    stats[label] = label_stats


=====
label: News/Editorial
=====
model: dummy
model: MLP
model: naive Bayes
model: logistic regression

=====
label: Commercial
=====
model: dummy
model: MLP
model: naive Bayes
model: logistic regression

=====
label: Educational/Research
=====
model: dummy
model: MLP
model: naive Bayes
model: logistic regression

=====
label: Discussion
=====
model: dummy
model: MLP
model: naive Bayes
model: logistic regression

=====
label: Personal/Leisure
=====
model: dummy
model: MLP
model: naive Bayes
model: logistic regression

=====
label: Media
=====
model: dummy
model: MLP
model: naive Bayes
model: logistic regression


In [347]:
label = labels[1]
pd.DataFrame(stats[label]['dummy'])

Unnamed: 0,1,2,3,4,5,7,10,25,100,250,1000
acc(train),0.538776,0.518367,0.515306,0.520408,0.508163,0.5,0.504082,0.507143,0.506122,0.494898,0.483673
acc(valid),0.492355,0.538226,0.461774,0.513761,0.495413,0.510703,0.541284,0.474006,0.434251,0.53211,0.48318
f1(train),0.418431,0.443407,0.444982,0.438257,0.417736,0.449883,0.455491,0.446784,0.441463,0.427046,0.458182
f1(valid),0.438849,0.534247,0.490066,0.40404,0.390071,0.372414,0.386861,0.467797,0.429066,0.462069,0.482759


In [348]:
pd.DataFrame(stats[label]['logistic regression'])

Unnamed: 0,1,2,3,4,5,7,10,25,100,250,1000
acc(train),0.572449,0.567347,0.570408,0.572449,0.564286,0.585714,0.587755,0.608163,0.673469,0.739796,0.882653
acc(valid),0.544343,0.559633,0.553517,0.559633,0.590214,0.587156,0.593272,0.648318,0.605505,0.559633,0.614679
f1(train),0.0,0.0,0.004728,0.027842,0.040449,0.157676,0.158333,0.278195,0.457627,0.584013,0.851995
f1(valid),0.0,0.0,0.0,0.013699,0.042857,0.081633,0.203593,0.294479,0.351759,0.414634,0.536765


In [349]:
pd.DataFrame(stats[label]['MLP'])

Unnamed: 0,1,2,3,4,5,7,10,25,100,250,1000
acc(train),0.556122,0.564286,0.539796,0.568367,0.542857,0.60102,0.594898,0.573469,0.680612,0.705102,0.902041
acc(valid),0.593272,0.562691,0.571865,0.559633,0.568807,0.590214,0.559633,0.553517,0.620795,0.626911,0.550459
f1(train),0.0,0.009281,0.548549,0.004706,0.013216,0.290381,0.194726,0.264085,0.470389,0.546311,0.894273
f1(valid),0.0,0.0,0.542484,0.0,0.0,0.238636,0.152941,0.231579,0.38,0.39604,0.592798


In [328]:
# # %load ../display_2d.py
# ###
# # plot some of the data in 2d (with PCA and MDS)
# # mainly copied from lab 3
# ###
# # INPUT:
# #	fulldata from join_data.py
# #	keys_content_features: array(String) of column keys
# # PRINTS:
# #	2d graphs using PCA and MDS
# ###

# from scripts import *

# X = X_full[keys_tfdf]
# y = y_full['Commercial']

# ### From Lab 3, edited
# def scatter_2d_label(X_2d, y, alpha=0.5):
#     """Visualuse a 2D embedding with corresponding labels.

#     X_2d : ndarray, shape (n_samples,2)
#         Low-dimensional feature representation.

#     y : ndarray, shape (n_samples,)
#         Labels corresponding to the entries in X_2d.

#     s : float
#         Marker size for scatter plot.

#     alpha : float
#         Transparency for scatter plot.

#     lw : float
#         Linewidth for scatter plot.
#     """
#     targets = np.unique(y)

#     colors = [plt.cm.RdYlGn( int(i*plt.cm.RdYlGn.N/(len(targets)-1)) ) for i in range(len(targets))]
#     for color, target in zip(colors, targets):
#         plotx = [x for i,x in enumerate(X_2d[:,0]) if (y[i]==target)]
#         ploty = [x for i,x in enumerate(X_2d[:,1]) if (y[i]==target)]
#         plt.scatter(plotx, ploty, color=color, label=target, alpha=alpha)
#     plt.legend()
#     plt.show()

# from sklearn.preprocessing import StandardScaler
# X_sc = StandardScaler().fit_transform(X.astype(np.float))


# ### PCA

# from sklearn.decomposition import KernelPCA

# plt.figure(figsize=(10,6))
# pca = KernelPCA( n_components=2, kernel='rbf' )
# X_2d = pca.fit_transform(X_sc)
# scatter_2d_label(X_2d, y)
# plt.show()

# ### MDS

# from sklearn.manifold import MDS
# sns.set(font_scale=1.)
# mds = MDS(n_components=2, metric=True, n_init=1, max_iter=100, random_state=10)
# X_mds_2d = mds.fit_transform(X_sc)
# plt.title('Metric MDS, stress: {}'.format(mds.stress_))
# plt.xlabel('Component 1')
# plt.ylabel('Component 2')
# plt.legend(loc='center left', bbox_to_anchor=[1.01, 0.5], scatterpoints=3)
# scatter_2d_label(X_mds_2d, y)
# plt.show()