In [4]:
import os

# Locations used throughout the notebook. Each can be overridden with an
# environment variable so the notebook runs on another machine without edits;
# the fallbacks are the original author's local paths.
project_dir = os.environ.get(
    'DME_PROJECT_DIR',
    '/Users/sipola/Google Drive/education/coursework/graduate/edinburgh/dme/project')
data_dir = os.environ.get('DME_DATA_DIR', '/Users/sipola/Desktop/v2-all_in_one')

# Work from the project directory first — presumably so that the
# `from scripts import *` in the import cell can be resolved; confirm.
os.chdir(project_dir)

In [5]:
import numpy as np
import re
import time
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
import seaborn as sns
from scripts import *

In [6]:
# Switch to the data directory; later cells open files through paths relative
# to it (e.g. 'labels/en.useful-labels.csv').
os.chdir(data_dir)

In [23]:
def print_run_time(t0):
    '''
    Print the wall-clock time elapsed since t0, in minutes.

    input:
        t0 (float): start time, as returned by time.time()

    output:
        None; writes a line such as 'run time: 0.49 min' to stdout
    '''
    elapsed_min = (time.time() - t0) / 60
    # A parenthesised single-argument print gives the same output under the
    # Python 2 print statement and the Python 3 print function.
    print('run time: {:0.2f} min'.format(elapsed_min))
    

def tf_idf(tf, df, N):
    '''
    Weight a term by tf-idf (term frequency - inverse document frequency);
    this is the value stored in the word embeddings.
    Source: http://www.inf.ed.ac.uk/teaching/courses/nlu/lectures/nlu_l02-vsm-2x2.pdf

    input:
        tf: term frequency (its log2 is taken, so it must be positive for a finite result)
        df: document frequency of the term
        N: total number of documents

    output:
        (1 + log2(tf)) * log2(N / df); this is negative whenever df > N
    '''
    damped_tf = 1 + np.log2(tf)
    inverse_doc_freq = np.log2(N / float(df))
    return damped_tf * inverse_doc_freq


def create_sparse_embeddings(filename, hostids=None, embedding_type='tf-idf'):
    '''
    Takes v2-host_tfdf.en.txt and outputs one sparse embedding per host.

    input:
        filename (str): file path to v2-host_tfdf.en.txt (originally in v2-all_in_one/features/tfdf/)
        hostids (list of int): only pull these hostids; None or an empty collection pulls every host
        embedding_type (str): either 'tf-idf' or 'tf'

    output:
        dictionary with format {hostid1: {wordid1: embedding_value, wordid2: embedding_value, ...}, hostid2: {wordid7: embedding_value, ...}}

    raises:
        ValueError: if embedding_type is neither 'tf-idf' nor 'tf'

    NOTE(review): N in the tf-idf calculation is the number of non-blank lines
    (one per host) in the file. The notebook output shows negative tf-idf
    values, which can only happen when df > N — presumably the df column is
    counted over a different unit than hosts; confirm against the data set
    description.
    '''

    # Validate up front so a bad value fails immediately, even when the file is
    # empty or no host matches the filter.
    if embedding_type not in ('tf-idf', 'tf'):
        raise ValueError("embedding_type needs to be 'tf-idf' or 'tf'")
    use_tf_idf = embedding_type == 'tf-idf'

    # A set gives constant-time membership tests; the filter is checked once
    # per line of the file. None/empty means "no filter", as before.
    wanted = set(hostids) if hostids is not None and len(hostids) > 0 else None

    # First pass: count documents (N in tf-idf calculation) without keeping the
    # whole file in memory. Blank lines are not documents.
    with open(filename, 'r') as f:
        N = sum(1 for line in f if line.strip())

    sparse = {}  # initialize dictionary

    # Second pass: parse the rows.
    # For structure of file, see: https://dms.sztaki.hu/node/350
    with open(filename, 'r') as f:
        for line in f:

            # split() with no argument drops the trailing newline and copes
            # with repeated or trailing whitespace.
            tokens = line.split()
            if not tokens:
                continue

            # First element is hostid.
            hostid = int(tokens[0])

            if wanted is not None and hostid not in wanted:
                continue

            embedding = {}

            # We then have groups of three: wordid1 tf1 df1, wordid2 tf2 df2, ...
            # An incomplete trailing group is ignored.
            for i in range(1, len(tokens) - 2, 3):
                wordid, tf, df = int(tokens[i]), int(tokens[i + 1]), int(tokens[i + 2])
                embedding[wordid] = tf_idf(tf, df, N) if use_tf_idf else tf

            sparse[hostid] = embedding

    return sparse

In [24]:
# Load the label table. read_transform_labels is not defined in this notebook;
# presumably it comes from the star-imported `scripts` module — confirm.
y_full, keys_labels = read_transform_labels('labels/en.useful-labels.csv')
# Parenthesised single-argument prints behave the same on Python 2 and 3.
print(y_full.head())
print(keys_labels)

      ID  UserID                Date Hosting Type Language  Adult Content  \
0  89329       7  31-MAR-10 15:41:26     HTNormal       en           -1.0   
1  47708       8  14-APR-10 13:46:01     HTNormal       en           -1.0   
2  41567       6  31-MAR-10 14:18:09     HTNormal       en           -1.0   
3  98619       8  15-APR-10 11:59:36     HTNormal       en           -1.0   
4  72806       8  15-APR-10 11:54:25     HTNormal       en           -1.0   

   Other Problem  Web Spam  News/Editorial  Commercial    ...      \
0           -1.0        -1             1.0        -1.0    ...       
1           -1.0        -1            -1.0         1.0    ...       
2           -1.0        -1            -1.0         1.0    ...       
3           -1.0        -1            -1.0        -1.0    ...       
4           -1.0        -1            -1.0        -1.0    ...       

   Personal/Leisure  Media  Database  Readability-Vis  Readability-Lang  \
0               1.0   -1.0      -1.0           

In [27]:
# Create dictionary with {hostid: sparse vector}. There are something like 60,000 rows in the full data set.
# The label table repeats IDs and every line of the file is tested against the
# wanted IDs, so pass a de-duplicated set for constant-time lookups.
t0 = time.time()
labelled_hostids = set(y_full['ID'].tolist())
sparse = create_sparse_embeddings('features/tfdf/v2-host_tfdf.en.txt',
                                  hostids=labelled_hostids,
                                  embedding_type='tf-idf')
print_run_time(t0)

run time: 0.49 min


In [10]:
# Look at entry for one hostid.
print('num rows in data: {}'.format(len(sparse)))
print(15*'=')
# next(iter(...)) yields the first key in dict iteration order and also works
# where dict.keys() is a non-indexable view (Python 3).
first_hostid = next(iter(sparse))
print('first row:\nhostid: {}\nwordids and embedding values: {}'.format(first_hostid, sparse[first_hostid]))

num rows in data: 1351
first row:
hostid: 59392
wordids and tf-idf values: {2624: -3.9123593413580604, 382: -10.493377243442378}


In [11]:
# Convert to DataFrame: one row per hostid, one column per wordid, NaN where a
# host has no entry for the word.
X = pd.DataFrame.from_dict(sparse, orient='index')  # convert to DataFrame
min_val = X.min().min()  # get min of entire matrix (NaNs are skipped)
print('minimum tf-idf value: {}'.format(min_val))
# NOTE(review): after the shift the smallest observed entry is exactly 0, so it
# can no longer be told apart from a word the host does not contain — confirm
# this is intended.
X = X - min_val  # make so lowest value is zero
X = X.fillna(0)  # fill NaNs with zeros
X  # show X

minimum tf-idf value: -101.040003592


Unnamed: 0,2624,382,0,1,2,3,8196,5,6,8,...,13106,29589,15539,31965,31977,37527,48801,49717,40424,30884
61,0.000000,94.044419,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82,97.127644,88.500549,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199,0.000000,94.044419,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
233,0.000000,0.000000,83.005329,90.234054,0.000000,0.000000,0.0,95.521499,78.435327,89.115801,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
245,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,95.521499,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
274,98.431764,88.500549,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
341,0.000000,0.000000,60.632428,76.267566,80.660655,80.757340,0.0,64.375851,71.823883,83.153700,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
352,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,95.077903,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
392,98.431764,91.998341,74.940980,0.000000,0.000000,73.925674,0.0,90.002994,95.388834,67.390763,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
445,0.000000,0.000000,95.611026,0.000000,0.000000,96.616268,0.0,0.000000,0.000000,95.077903,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# (number of hostids, number of distinct wordids) in the dense matrix.
X.shape

(1351, 48713)

In [22]:
hostids = X.index.values  # save for later
# The hostids are re-attached to the PCA output by position later on, so the
# row labels need to be unique.
if X.index.is_unique:
    print('No dupes! Good job!')
else:
    print('Dupes!')

No dupes! Good job!


In [14]:
# Fit PCA incrementally (otherwise there are memory issues)
# http://stackoverflow.com/a/32191686
# NOTE(review): batch_size is left at its default. Per the scikit-learn docs
# that is inferred as 5 * n_features, which presumably exceeds the number of
# rows here, so the whole matrix would be processed as a single batch — confirm
# whether an explicit batch_size is needed to get the memory benefit.

K = 1000  # number of principal components to keep
t0 = time.time()
ipca = IncrementalPCA(n_components=K)
X_fit = ipca.fit_transform(X)  # ndarray, one row per row of X and K columns
print_run_time(t0)

run time: 0.76 min


In [15]:
# (number of hostids, K) after the PCA projection.
X_fit.shape

(1351, 1000)

In [16]:
# Wrap the PCA output in a DataFrame (component columns are labelled 0..K-1)
# and append the hostids as an 'ID' column. Rows of the PCA output are in the
# same order as the rows of X, which is where hostids were taken from.
X_fit = pd.DataFrame(X_fit).assign(ID=hostids)
X_fit

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,ID
0,-905.446638,-374.151162,-56.027159,-140.466825,-70.459015,21.215584,66.130943,36.934564,21.574419,28.759194,...,-4.471246,-16.707247,-4.142018,-7.047953,3.688253,-4.401643,10.144022,-4.486396,6.821083,61
1,-905.818059,-375.527204,-57.410499,-141.530670,-70.764193,21.249667,67.271557,39.099516,21.504454,29.546629,...,7.631939,0.862441,-4.950310,4.272343,0.003950,4.828647,-2.333556,6.428934,0.815300,82
2,-905.446638,-374.151162,-56.027159,-140.466825,-70.459015,21.215584,66.130943,36.934564,21.574419,28.759194,...,-4.471246,-16.707247,-4.142018,-7.047953,3.688253,-4.401643,10.144022,-4.486396,6.821083,199
3,-321.186019,-7.648020,-115.755742,68.246567,-125.079085,-71.482650,-62.244273,-70.556283,123.174950,157.672279,...,-3.164128,0.222905,4.363905,-1.212241,0.574671,1.135598,-0.723583,-0.033384,0.480367,233
4,-836.739376,-301.653918,-14.702181,-83.183335,-53.337711,32.080175,47.531603,24.402056,-5.209350,-11.758446,...,-8.110312,51.997907,-14.600392,20.139529,-6.225973,-38.210726,-17.232673,-6.116981,88.008968,245
5,-905.807457,-375.525117,-57.421558,-141.531756,-70.767403,21.244389,67.273881,39.116656,21.518202,29.545027,...,7.766594,0.951619,-4.996038,4.395353,0.003745,4.926156,-2.379184,6.540447,0.811954,274
6,170.872339,436.536482,-176.589521,257.403472,22.401011,-363.494160,-217.214322,-118.747677,-177.494304,-177.527327,...,-1.926992,1.699964,-1.128056,1.234531,-0.800203,2.122791,2.142105,-1.161618,-2.006201,341
7,-805.424603,-291.736144,-54.190935,-81.210639,-47.989714,4.580130,25.315897,15.606717,30.050023,33.846715,...,-5.597157,-21.636662,-2.679111,0.952231,16.647550,28.313524,3.439671,-3.242405,4.200601,352
8,-743.952025,-228.695291,-86.304957,-33.994562,-17.155953,-40.067039,-15.256480,40.033828,36.869219,-46.372179,...,-16.766539,-13.015768,-2.806436,-1.580675,4.057215,-14.904815,-9.021366,-17.817737,-1.986256,392
9,-887.560511,-354.378543,-55.970688,-123.358443,-60.838923,19.338341,55.340323,31.597587,17.015602,21.826521,...,11.321752,-1.290182,9.239284,1.220833,-14.647522,-11.337292,-12.086144,-27.125088,-4.863262,445


In [17]:
# Inner join (the pandas default) of the labels and the PCA features on hostid.
# y_full can hold several rows for one ID; each of those rows receives a copy
# of that host's feature vector, so the result has one row per label row.
X_full = pd.merge(y_full, X_fit, on='ID')

In [82]:
# (number of label rows matched to features, label columns + K components).
X_full.shape

(1522, 1022)

In [107]:
# Why are there now more rows? Duplicates in label data?
fit_ids = set(X_fit['ID'].tolist())  # built once; set membership is O(1)
label_ids = y_full['ID'].tolist()
num_missing = sum(x not in fit_ids for x in X_full['ID'].tolist())
print('num IDs in full but not in fit: {}'.format(num_missing))  # 0
print('num IDs in labels: {}'.format(len(label_ids)))
print('num *unique* IDs in labels: {}'.format(len(set(label_ids))))

# Answer: yes, duplicates in label data.

num IDs in full but not in fit: 0
num IDs in labels: 1522
num *unique* IDs in labels: 1351


# Classify

In [56]:
# Features: the K principal-component columns, which are labelled 0..K-1.
# list(...) keeps the selector a plain list of labels on Python 3 as well.
X_train = X_full[list(range(K))]
# Targets: the six content-category label columns.
Y_train = X_full[['News/Editorial', 'Commercial', 'Educational/Research', 'Discussion', 'Personal/Leisure', 'Media']]
# First 23 column names: the label-table columns followed by the first
# principal-component column.
print(X_full.keys()[:23])

Index([                  u'ID',               u'UserID',
                       u'Date',         u'Hosting Type',
                   u'Language',        u'Adult Content',
              u'Other Problem',             u'Web Spam',
             u'News/Editorial',           u'Commercial',
       u'Educational/Research',           u'Discussion',
           u'Personal/Leisure',                u'Media',
                   u'Database',      u'Readability-Vis',
           u'Readability-Lang',           u'Neutrality',
                       u'Bias',           u'Trustiness',
                 u'Confidence',            u'auto_lang',
                             0],
      dtype='object')


In [53]:
# Preview the first rows of the feature matrix (principal-component columns only).
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,-587.278884,-114.68952,-24.72962,-21.32446,-22.131661,26.308887,-49.197127,-2.112385,-43.129204,2.82748,...,-17.73853,-42.109599,38.147013,-21.206679,-1.248409,35.184957,30.664051,3.497696,25.925842,-17.719158
1,-496.168224,-74.629381,-86.708783,-2.977229,8.040063,6.769854,-79.551349,1.784859,-34.461468,14.166739,...,16.790465,37.977137,-36.135798,17.898968,-0.364631,-28.545799,-21.471501,-0.262702,-22.140483,14.238383
2,-901.30928,-369.025454,-55.995384,-129.561031,-68.763523,17.088251,64.008363,34.572434,21.725804,24.039757,...,0.199576,4.62609,-0.716321,-10.702722,-4.795528,9.178513,-2.793316,-5.561962,-0.390044,-5.292809
3,871.597275,501.556149,1133.549437,286.14925,847.750933,-251.27134,643.64319,-63.731164,-242.46182,624.654987,...,0.245317,-0.197429,-0.316277,-0.408833,-1.000161,-0.964203,-0.197106,1.219583,0.145496,0.215532
4,-838.477902,-306.564622,-15.550692,-102.178985,-53.955364,28.75276,53.938557,15.545452,-15.223534,39.178811,...,-30.761335,9.611292,25.248794,8.083091,-36.021866,2.595565,-8.8194,-12.60526,28.746259,-9.313063


In [54]:
# Preview the first rows of the target frame.
Y_train.head()

Unnamed: 0,ID,UserID,Date,Hosting Type,Language,Adult Content,Other Problem,Web Spam,News/Editorial,Commercial,...,Personal/Leisure,Media,Database,Readability-Vis,Readability-Lang,Neutrality,Bias,Trustiness,Confidence,auto_lang
0,89329,7,31-MAR-10 15:41:26,HTNormal,en,-1.0,-1.0,-1,1.0,-1.0,...,1.0,-1.0,-1.0,2.0,2.0,2.0,2.0,3.0,1.0,en
1,47708,8,14-APR-10 13:46:01,HTNormal,en,-1.0,-1.0,-1,-1.0,1.0,...,1.0,-1.0,-1.0,2.0,2.0,3.0,2.0,3.0,1.0,en
2,41567,6,31-MAR-10 14:18:09,HTNormal,en,-1.0,-1.0,-1,-1.0,1.0,...,-1.0,-1.0,-1.0,2.0,2.0,3.0,2.0,3.0,1.0,en
3,98619,8,15-APR-10 11:59:36,HTNormal,en,-1.0,-1.0,-1,-1.0,-1.0,...,-1.0,-1.0,-1.0,2.0,2.0,3.0,2.0,3.0,1.0,en
4,72806,8,15-APR-10 11:54:25,HTNormal,en,-1.0,-1.0,-1,-1.0,-1.0,...,-1.0,-1.0,-1.0,2.0,2.0,3.0,2.0,3.0,1.0,en


In [None]:
# NOTE(review): this cell has no execution count and looks unfinished:
#   - `y` is not defined in any visible cell;
#   - `X` is the full hostid-by-wordid matrix, not the PCA features in X_train;
#   - LogisticRegression is not imported in the visible import cell (it may
#     come from `from scripts import *` — confirm).
# The accuracy is also scored on the same data the model is fit on, so it is a
# training accuracy rather than a held-out estimate.
fit_logreg = LogisticRegression().fit(X,y)
print('LogisticRegression accuracy: {}'.format(fit_logreg.score(X,y)))