In [45]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from numpy.linalg import norm
from collections import Counter, defaultdict

In [46]:
def readDocument(fileName):
    """Read a tab-separated data file into feature strings and labels.

    For a file whose base name is 'train.dat', every row is expected to look
    like ``<label>\\t<features>``: the text before the first tab is appended
    to ``y`` and the second tab-separated field (trailing newline included)
    is appended to ``X``.

    For any other file ``X`` stays empty and ``y`` receives the text before
    the first tab of each row -- i.e. the whole line, newline included, when
    the row contains no tab.

    Args:
        fileName: path of the file to read.

    Returns:
        Tuple ``(X, y)`` of two lists of strings as described above.

    Raises:
        IndexError: if a row of the training file contains no tab.
        OSError: if the file cannot be opened.
    """
    import os  # local import keeps this cell self-contained

    # Compare the base name rather than the raw argument so that paths such
    # as 'data/train.dat' or './train.dat' are still treated as the labelled
    # training file instead of silently yielding an empty X.
    has_features_column = os.path.basename(fileName) == 'train.dat'

    X = []
    y = []
    with open(fileName, 'r') as file:
        for row in file:
            dataset = row.split('\t')
            y.append(dataset[0])
            if has_features_column:
                X.append(dataset[1])
    return X, y
# Load both splits. The training file yields (features, labels); for the test
# file the rows are taken from the second slot of the returned pair.
trainX, trainY = readDocument('train.dat')
testX = readDocument('test.dat')[1]
for loaded in (trainX, trainY, testX):
    print(len(loaded))

800
800
350


In [47]:
# Data Pre-Processing Coo matrix --> CSC Matrix --> CSR Matrix
from scipy.sparse import coo_matrix
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix
import itertools
# Largest feature (column) index the sparse matrices must be able to hold;
# create_coo sizes every matrix with NUMBER_OF_PARAMETERS + 1 columns.
NUMBER_OF_PARAMETERS = 1000000 # max column index in training data

def build_csc(lists):
    """Convert strings of feature indices into a boolean CSC matrix.

    Each element of ``lists`` describes one row: a whitespace-separated
    string of integer column indices. The indices are used directly as
    column positions, and every listed position is given the value True.

    Args:
        lists: iterable of strings, one per output row.

    Returns:
        scipy.sparse.csc_matrix converted from the COO matrix that
        create_coo builds, with one row per element of ``lists``.

    Raises:
        ValueError: if a token cannot be parsed as an integer.
    """
    param_lists = []
    row_lists = []
    value_lists = []
    num_rows = 0
    for row_index, line in enumerate(lists):
        # split() without an argument tolerates repeated spaces, tabs and the
        # trailing newline, and turns a blank line into an empty row instead
        # of failing on int('').
        params = [int(token) for token in line.split()]
        param_lists.append(params)
        row_lists.append([row_index] * len(params))
        value_lists.append([True] * len(params))
        num_rows = row_index + 1
    coo = create_coo(param_lists, row_lists, value_lists, num_rows)
    return csc_matrix(coo)

def create_coo(param_lists, row_lists, value_lists, num_rows):
    """Assemble per-row index/value lists into a single boolean COO matrix.

    Args:
        param_lists: list of lists of column indices, one inner list per row.
        row_lists: list of lists of row indices, parallel to ``param_lists``.
        value_lists: list of lists of values, parallel to ``param_lists``.
        num_rows: number of rows of the resulting matrix.

    Returns:
        scipy.sparse.coo_matrix of dtype bool with shape
        ``(num_rows, NUMBER_OF_PARAMETERS + 1)``.
    """
    # Flatten the nested lists into the three parallel 1-D arrays that the
    # COO constructor takes: values, row indices and column indices.
    flattened_params = np.array(list(itertools.chain.from_iterable(param_lists)))
    flattened_rows = np.array(list(itertools.chain.from_iterable(row_lists)))
    flattened_values = np.array(list(itertools.chain.from_iterable(value_lists)))
    sparse_coo = coo_matrix(
        (flattened_values, (flattened_rows, flattened_params)),
        # +1 so that column index NUMBER_OF_PARAMETERS itself is in range.
        shape=(num_rows, NUMBER_OF_PARAMETERS + 1),
        # Builtin bool: the np.bool alias was deprecated in NumPy 1.20 and
        # removed in 1.24, where it raises AttributeError.
        dtype=bool,
    )
    return sparse_coo

In [48]:
# Build the training matrix in CSC form, then convert it to CSR
train_csc = build_csc(trainX)
train_csr = train_csc.tocsr()

In [49]:
# Scale every training row (sample) to unit L2 norm
from sklearn.preprocessing import normalize
normTrain = normalize(train_csr, norm='l2', axis=1)

In [50]:
# Applying dimensionality reduction on train.
# TruncatedSVD accepts sparse input, so the normalized CSR matrix is passed
# as-is. Calling .toarray() first would materialize a dense array with
# NUMBER_OF_PARAMETERS + 1 columns per row (several GB) for no benefit.
# The variable keeps its name `pca` although the estimator is a TruncatedSVD.
from sklearn.decomposition import TruncatedSVD
pca = TruncatedSVD(n_components=8, random_state=42)
XTrain = pca.fit_transform(normTrain)
print(len(XTrain))

800


In [51]:
# Build the test matrix in CSC form, then convert it to CSR
test_csc = build_csc(testX)
test_csr = test_csc.tocsr()

In [52]:
# Scale every test row (sample) to unit L2 norm
from sklearn.preprocessing import normalize
normTest = normalize(test_csr, norm='l2', axis=1)

In [53]:
# Applying dimensionality reduction on test with the SVD fitted on train.
# The sparse matrix is passed directly: transform() accepts sparse input, and
# densifying with .toarray() would allocate NUMBER_OF_PARAMETERS + 1 columns
# per row for no benefit.
XTest = pca.transform(normTest)

In [54]:
# Classify the test data
from sklearn.ensemble import ExtraTreesClassifier

# Fixed seed (same value as the SVD step) so that re-running the notebook
# reproduces the same forest and therefore the same predictions.
clf = ExtraTreesClassifier(random_state=42)
clf = clf.fit(XTrain, trainY)

prediction = clf.predict(XTest)
print(len(prediction))
print(prediction)

350
['0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0

In [55]:
# Write the predictions to file, one label per line, while counting the
# total number of rows and the number of rows predicted as '1'.
row_count = 0
one_count = 0
# The context manager closes the file even if the loop raises part-way.
with open('data/out_trucatedSVD_Extra-Trees_8_42.dat', 'w') as output_file:
    for n in prediction:
        if n == '1':
            one_count += 1
            output_file.write('1\n')
        else:
            # Anything that is not the string '1' is written as '0'.
            output_file.write('0\n')
        row_count += 1
print(row_count)
print(one_count)

350
8


In [56]:
# Verify the file was written: read it back and count its lines
with open("data/out_trucatedSVD_Extra-Trees_8_42.dat", "r") as output:
    output_data_lines = list(output)
print(len(output_data_lines))

350
