# Importing data in 2 formats (from file-format.ipynb)

* real-sim.bz2 -- row format, contains target
* X.csv -- column format
* Y.csv -- target as 1 column

# Implementation of coordinate descent optimization algorithm

In [1]:
import pandas as pd
import numpy as np
import bz2

In [143]:
# Load the raw row-format file: with no commas in the data every line ends up
# as a single string cell ("target idx:value idx:value ...") in column 0.
df = pd.read_csv(
    'data/real-sim.bz2',
    header=None,
    sep=',',
    quotechar='"',
    compression='bz2',
)

In [144]:
df # first value of each row = 1 or -1 (the target)

Unnamed: 0,0
0,1 957:0.5162621139261989 3713:0.50478811279483...
1,1 60:0.1268484049473893 76:0.1312390760167834 ...
2,1 60:0.3046276827544872 148:0.1870145280449209...
3,1 130:0.03722415715765207 370:0.45297469922461...
4,1 130:0.08474200311834026 148:0.13987013821176...
...,...
72304,-1 2:0.04830334445297421 16:0.0659941311415835...
72305,-1 52:0.2115341677526436 238:0.301596914695018...
72306,-1 12:0.07936380088938398 34:0.084371169437365...
72307,-1 423:0.3066420253659734 648:0.19858377662854...


## Do not use - creates the whole dense dataframe (72309 x 20958) using float16 dtype (less precise than usual)

In [6]:
# Small preview slice of the raw rows.
sample = df.head(30)

# Dense all-zero frame: one row per raw record, one column per feature slot.
# float16 keeps the allocation smaller at the cost of precision.
X = pd.DataFrame(
    0.0,
    index=np.arange(len(df)),
    columns=np.arange(20958),
    dtype="float16",
)

In [22]:
# slow
# Parses every raw record of `df` ("target idx:value idx:value ...") and
# writes the values into the pre-allocated dense frame `X`, collecting the
# targets in `target_list`.

target_list = list()

for i in range(len(df)):
    # split() without an argument ignores leading/trailing/repeated blanks,
    # so every token after the first one is a genuine "idx:value" pair and
    # none of them has to be skipped.
    splittedData = df.iat[i, 0].split()

    aux = splittedData[1:]
    if( i % 4000 == 0):
        # i / 800 equals 100 * i / 80000, i.e. only an approximate percentage
        # for a frame that does not have exactly 80000 rows.
        print("progress = ", str(i / 800.0), " %")

    for pair in aux:
        splittedRow = pair.split(":")
        col = int(splittedRow[0])
        val = float(splittedRow[1])
        # Write through a single scalar accessor: X.iloc[i][col] = val assigns
        # into a temporary row object and may never reach X itself.
        # Feature indices in the file are 1-based (index2Row applies the same
        # "- 1" shift), while the columns of X are 0-based.
        X.iat[i, col - 1] = val

    target_list.append(int(splittedData[0]))

target_df = pd.DataFrame(target_list)

progress =  0.0  %
progress =  5.0  %
progress =  10.0  %
progress =  15.0  %
progress =  20.0  %
progress =  25.0  %
progress =  30.0  %
progress =  35.0  %
progress =  40.0  %
progress =  45.0  %
progress =  50.0  %
progress =  55.0  %
progress =  60.0  %
progress =  65.0  %
progress =  70.0  %
progress =  75.0  %
progress =  80.0  %
progress =  85.0  %
progress =  90.0  %


In [24]:
# Persist the parsed targets so later cells can reload them from data/Y.csv.
target_df.to_csv("data/Y.csv")

In [18]:
# Quick look at the first raw records (target followed by idx:value pairs).
df.head()

Unnamed: 0,0
0,1 957:0.5162621139261989 3713:0.50478811279483...
1,1 60:0.1268484049473893 76:0.1312390760167834 ...
2,1 60:0.3046276827544872 148:0.1870145280449209...
3,1 130:0.03722415715765207 370:0.45297469922461...
4,1 130:0.08474200311834026 148:0.13987013821176...


# Loading data in column format (X.csv and Y.csv)

In [64]:
# Column-format features: one row per feature, holding a string of
# "observation:value" pairs.
X = pd.read_csv("data/X.csv", index_col=0)
# Discard the first record (presumably the unused feature index 0 - TODO confirm).
X = X.drop(index=X.index[0])

# Targets, one per observation.
Y = pd.read_csv("data/Y.csv", index_col=0)

In [65]:
# Display the column-format frame (pandas abbreviates the middle rows).
X

Unnamed: 0,0
1,6188:0.04628995985226547 7165:0.0353995993010...
2,549:0.05240809607488428 606:0.049347750441681...
3,377:0.05347891084164088 1681:0.04386643306872...
4,2673:0.04206336669554234 2675:0.0720097559738...
5,22:0.0107468031372892 25:0.01321220008362658 ...
...,...
20954,12375:0.5457710942161524 13429:0.300113707924...
20955,13707:0.1425559557500663 13877:0.089547451858...
20956,14554:0.2984538217929392 20386:0.067982958940...
20957,14807:0.2231273036113095 14823:0.192955498034...


In [96]:
# given a dataframe in column format (X) - get the column of given index
def index2Col(X, index, n_observations=72309):
    """Expand one sparse record of the column-format frame into a dense column.

    Parameters
    ----------
    X : pandas.DataFrame
        Column-format data. The first column of every row holds a
        whitespace-separated string of ``position:value`` pairs.
    index : int
        Positional row of ``X`` to expand.
    n_observations : int, default 72309
        Length of the returned column.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(n_observations, 1)``. Each ``position`` is
        used directly as the row of the array; unlisted rows stay 0.
    """
    # Purely positional lookup: works whether the column label is the
    # integer 0 or the string "0" (as produced by reading a CSV header).
    record = X.iloc[index, 0]
    col = np.zeros(shape=(n_observations, 1), dtype="float64")

    # A missing cell (NaN) means the record lists no values at all.
    if not isinstance(record, str):
        return col

    # split() without an argument drops leading/trailing blanks, so the first
    # pair is kept intact whether or not the string starts with a separator.
    for token in record.split():
        pair = token.split(":")
        if len(pair) == 2:
            col[int(pair[0]), 0] = float(pair[1])
    return col

# given a dataframe in row format (df from real-sim.bz2 file) - get the row of given index, omitting the target
def index2Row(X, index, n_features=20958):
    """Expand one raw record into a dense feature column vector.

    Parameters
    ----------
    X : pandas.DataFrame
        Row-format data. The first column of every row holds
        ``"target idx:value idx:value ..."`` with 1-based feature indices.
    index : int
        Positional row of ``X`` to expand.
    n_features : int, default 20958
        Number of feature slots in the returned vector.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(n_features, 1)``; feature ``idx`` is stored
        at position ``idx - 1``. The target is not included.
    """
    record = X.iloc[index, 0]
    full_row = np.zeros(shape=(1, n_features), dtype="float64")

    # Tokenise on whitespace and skip the first token (the target). Unlike a
    # fixed character slice this works for "1" and "-1" alike and never cuts
    # the last digit off the final value when there is no trailing blank.
    for token in record.split()[1:]:
        pair = token.split(":")
        if len(pair) == 2:
            full_row[0, int(pair[0]) - 1] = float(pair[1])
    return full_row.T

In [40]:
# Problem dimensions of the real-sim data set.
n_of_features = 20958
n_of_observations = 72309
j = 200

# initial weights: a (1, n_of_features) row vector with every entry 0.01
weights = 0.01 * np.ones((1, n_of_features))



1

In [94]:
# Spot-check: dense feature column vector of observation 266.
index2Row(df,266)

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

# Solving Linear SVM via Coordinate Descent  (real-sim 72309 x 20958)

In [145]:
# algorithm described in https://jmlr.org/papers/volume9/chang08a/chang08a.pdf
#
# Minimised objective (L2-loss linear SVM):
#     f(w) = 0.5 * w.w + C * SUM_j max(0, 1 - y_j * w.x_j)^2
# One coordinate w_i is updated at a time with a damped Newton step.


n_of_features = 20958
n_of_observations = 72309


# initial weights

weights = np.ones(shape = (1,n_of_features) ) * 0.01 # initial weights of predictors

# hyperparameters
beta = 0.5    # shrink factor of the step size
sigma = 0.01  # constant of the sufficient-decrease condition
C = 0.01      # penalty on the squared hinge loss

# outer iterations:
for k in range(1):
    # inner iterations should be as many as the number of features
    for i in range(1000):
        print("feature " , i)
        # Only the first 1% of this permutation is visited below, so each
        # feature gets a fresh random subsample of observations.
        shuffle = np.random.choice(n_of_observations, n_of_observations, replace = False) # shuffling recommended by authors
        col = index2Col(X, i)

        # reorder randomly column of X and target (Y), as flat 1-D arrays so
        # that every quantity below is a plain scalar:
        col_ = col[shuffle, 0]
        Y_ = np.array(Y.iloc[shuffle]).ravel()


        # solving the sub-problem and update weight_i

        ### calculating Newton direction d

        D_prim_0 = 0.0

        sum_of_squares_of_column = 0.0

        for j in range(n_of_observations// 100):
            x_ji = float(col_[j])
            # An observation whose i-th feature is 0 adds exactly 0 to both
            # sums, so its row does not have to be parsed at all.
            if x_ji == 0.0:
                continue

            y_j = float(Y_[j])
            # Position j of the shuffled arrays is observation shuffle[j]; the
            # row must be fetched with that same index to match y_j and x_ji.
            x_j = index2Row(df, shuffle[j])
            # b_j = 1 - y_j * w.x_j is the hinge margin violation.
            b_j = 1.0 - y_j * float(np.dot(weights[0], x_j[:, 0]))
            if b_j > 0:
                D_prim_0 += y_j*x_ji*b_j
                sum_of_squares_of_column += x_ji**2


        # D'_i(0) = w_i - 2C * SUM_{b_j > 0} y_j * x_ji * b_j
        D_prim_0 *= (-2*C)
        D_prim_0 += weights[0,i]

        # D''_i(0) = 1 + 2C * SUM_{b_j > 0} x_ji^2
        D_bis_0 = 1 + 2*C*sum_of_squares_of_column
        d = -D_prim_0/D_bis_0

        ### compute lambda = max{1, beta, beta^2, beta^3, ...}

        # we do this to ensure sufficient convergence condition & to make calculations quicker

        # NOTE(review): H_i uses the whole column while D_bis_0 only sees the
        # 1% subsample above - confirm this mix is intended.
        H_i = 1 + 2*C*float(np.dot(col_,col_))
        lambda_ = D_bis_0 /((H_i/2)+sigma)

        lambd = 1.0
        while lambd > lambda_:
            lambd *= beta

        # update weights
        weights[0,i] += lambd*d
    print("outer iteration - ", k)




feature  0
feature  1
feature  2
feature  3
feature  4
feature  5
feature  6
feature  7
feature  8
feature  9
feature  10
feature  11
feature  12
feature  13
feature  14
feature  15
feature  16
feature  17
feature  18
feature  19
feature  20
feature  21
feature  22
feature  23
feature  24
feature  25
feature  26
feature  27
feature  28
feature  29
feature  30
feature  31
feature  32
feature  33
feature  34
feature  35
feature  36
feature  37
feature  38
feature  39
feature  40
feature  41
feature  42
feature  43
feature  44
feature  45
feature  46
feature  47
feature  48
feature  49
feature  50
feature  51
feature  52
feature  53
feature  54
feature  55
feature  56
feature  57
feature  58
feature  59
feature  60
feature  61
feature  62
feature  63
feature  64
feature  65
feature  66
feature  67
feature  68
feature  69
feature  70
feature  71
feature  72
feature  73
feature  74
feature  75
feature  76
feature  77
feature  78
feature  79
feature  80
feature  81
feature  82
feature  83
fe

KeyboardInterrupt: 

In [142]:
ind = 1

# Raw decision value w.x for the chosen observation.
decision_value = np.dot(weights, index2Row(df, ind))
print(decision_value)

# predictions should converge to  1 for 1 class, and -1 for -1 class
# that is, according to loss function  SUM:    (max(0, 1-Y*weight*X))^2

# Actual target of the same observation, for comparison.
Y.iloc[ind]

[[0.0562166]]


0    1
Name: 1, dtype: int64