In [None]:
import os
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.linear_model import Lasso
from gensim.models.doc2vec import Doc2Vec

In [None]:
# --- training set: author ids together with their h-index ---
df_train = pd.read_csv(
    'data/train.csv',
    dtype={'author': np.int64, 'hindex': np.float32},
)
n_train = len(df_train.index)
print("n_train = ", n_train)

# --- test set: author ids only ---
df_test = pd.read_csv('data/test.csv', dtype={'author': np.int64})
n_test = len(df_test.index)
print("n_test = ", n_test)

# --- co-authorship graph: space-separated edge list, integer node labels ---
G = nx.read_edgelist('data/coauthorship.edgelist', delimiter=' ', nodetype=int)
n_nodes, n_edges = G.number_of_nodes(), G.number_of_edges()
print('Number of nodes:', n_nodes)
print('Number of edges:', n_edges)

In [None]:
def buildMatrices():
    """Build the structural feature matrices for the train and test authors.

    Each author is represented by a vector of 2 features taken from the
    module-level co-authorship graph ``G``:
    (1) the node's degree and (2) the node's core number.

    Reads the module-level ``G``, ``df_train`` and ``df_test``. Every author
    id is expected to be a node of ``G``; a missing node makes the graph
    lookup fail.

    Returns:
        X_train: array of shape (len(df_train), 2) with the train features.
        y_train: float64 array holding the ``hindex`` column of ``df_train``.
        X_test: array of shape (len(df_test), 2) with the test features.
    """
    # computes structural features for each node
    core_number = nx.core_number(G)

    def _structural_features(authors):
        # One row per author: [degree, core number].
        features = np.zeros((len(authors), 2))
        for pos, node in enumerate(authors):
            features[pos, 0] = G.degree(node)
            features[pos, 1] = core_number[node]
        return features

    # Read the ids straight from the column instead of DataFrame.iterrows():
    # iterrows() upcasts a mixed int/float row to float, which turns the ids
    # into floats (lossy for very large values), and it yields index labels
    # rather than row positions.
    X_train = _structural_features(df_train['author'].tolist())
    y_train = df_train['hindex'].to_numpy(dtype=np.float64)
    X_test = _structural_features(df_test['author'].tolist())

    return X_train, y_train, X_test


In [None]:
# Build the feature matrices once; LassoReg() reads X_train, y_train and
# X_test as module-level globals, so this cell must run before it is called.
X_train,y_train,X_test = buildMatrices()

In [1]:
def invertedIndex_to_txt(line):
    """Extract the abstract id and its words from one line of the abstracts file.

    The line is expected to look like ``<id>----{...{<inverted index>}}``:
    an id, the ``----`` separator, then text in which the second ``{`` opens
    the inverted index.

    Args:
        line: one raw line of the abstracts file.

    Returns:
        A tuple ``(abstract_id, words)`` where ``words`` is the list of
        double-quoted strings found in the inverted index, in file order.

    Raises:
        IndexError: if the line has no ``----`` separator or fewer than two
            ``{`` characters after it.
    """
    # Split on the first separator only, so a '----' occurring inside the
    # abstract itself does not truncate the payload.
    parts = line.split('----', 1)
    abstract_id = parts[0]
    payload = parts[1]

    # Everything after the second '{' is the body of the inverted index.
    index_body = payload.split('{', 2)[2]

    # Splitting on '"' leaves the quoted strings at the odd positions.
    words = index_body.split('"')[1::2]
    return abstract_id, words

def clean(list_of_words):
    """Return a normalized copy of ``list_of_words``.

    Each word has every ``.``, ``\\r`` and ``\\n`` removed and is lower-cased.
    The input list is left untouched.

    Args:
        list_of_words: iterable of strings.

    Returns:
        A new list with the cleaned words, in the same order.
    """
    cleaned = []
    for word in list_of_words:
        # Rebinding ``word`` alone never changes the list, so the results
        # have to be collected explicitly.
        for unwanted in ('.', '\r', '\n'):
            word = word.replace(unwanted, '')
        cleaned.append(word.lower())
    return cleaned

def build_abstracts_words(input_path='data/abstracts.txt',
                          output_path='data/abstracts_words.txt'):
    """Convert the inverted-index abstracts file into one line of words per abstract.

    Every non-blank line of ``input_path`` is parsed with
    ``invertedIndex_to_txt``, its words are passed through ``clean`` and the
    result is written to ``output_path`` as ``<id> : <space-joined words>``.

    Args:
        input_path: path of the raw abstracts file.
        output_path: path of the file to write; it is overwritten.
    """
    # ``with`` closes both files even if a line fails to parse; iterating the
    # file object streams it instead of loading every line into memory.
    with open(input_path, 'r') as abstracts, open(output_path, 'w') as abstracts_words:
        for line in abstracts:
            if not line.strip():
                # A blank line (e.g. at the end of the file) has no id or index.
                continue
            abstract_id, words = invertedIndex_to_txt(line)
            txt = ' '.join(clean(words))
            abstracts_words.write(abstract_id + " : " + txt + "\n")

# Regenerate data/abstracts_words.txt from data/abstracts.txt; the output
# file is rewritten every time this cell runs.
build_abstracts_words()

In [None]:
def LassoReg(alpha=0.1):
    """Fit a Lasso regression on the training matrices and write a submission file.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``df_test``. As a side effect, a ``hindex`` column holding the rounded
    predictions is added to (or overwritten in) ``df_test``, and the
    ``author``/``hindex`` columns are written to ``submission.csv``.

    Args:
        alpha: regularization strength passed to ``Lasso``.
    """
    # train a regression model and make predictions
    reg = Lasso(alpha=alpha)  # use the argument instead of a hard-coded 0.1
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    # write the predictions to file
    # np.round replaces np.round_, which was removed in NumPy 2.0; assigning
    # the array directly keeps the predictions in row order.
    df_test['hindex'] = np.round(y_pred, decimals=3)
    df_test.loc[:, ["author", "hindex"]].to_csv('submission.csv', index=False)