# COMP90051 Project 1

##### Author: Tingsheng Lai 781319 (Tinson)

##### Date Created: 19/08/2018

## Import

In [7]:
import math
import numpy as np
import pandas as pd
import random
from gensim.models import Word2Vec
from networkx import DiGraph
from networkx.algorithms.link_analysis.pagerank_alg import pagerank
from time import time

In [2]:
start = time()
G = DiGraph()
with open("train.txt", 'r') as f:
    for line in f.read().split('\n'):
        if line:
            line = line.split()
            src = line[0]
            G.add_edges_from((src, dest) for dest in line[1:])
print(f"---- Read Finish in {time() - start:.2f}s ----")

start = time()
prs = pagerank(G)
print(f"---- PageRank Finish in {time() - start:.2f}s ----")

start = time()
model = Word2Vec.load("node2vec.model")
model1 = Word2Vec.load("node2vec-1.model")
model2 = Word2Vec.load("node2vec-2.model")
print(f"---- Word2Vec Model Loaded in {time() - start:.2f}s ----")

---- Read Finish in 95.65s ----
---- PageRank Finish in 243.10s ----
---- Word2Vec Model Loaded in 248.90s ----


In [14]:
K = 20000


def generate_features(src, dest):
    nsrc = set(G.successors(src))
    ndest = set(G.predecessors(dest))
    cn = nsrc.intersection(ndest)
    lcn = len(cn)
    return np.hstack((
        model.wv.get_vector(src) + model.wv.get_vector(dest),
        model1.wv.get_vector(src) + model1.wv.get_vector(dest),
        model2.wv.get_vector(src) + model2.wv.get_vector(dest)
    ))


start = time()
data = [[*generate_features(*edge), 1] for edge in random.sample(G.edges, K)]
nodes = list(G.nodes)


def sample_discon_edge():
    src, dest = random.choice(nodes), random.choice(nodes)
    while G.has_edge(src, dest) or src == dest:
        dest = random.choice(nodes)
    # may generate duplicates
    return src, dest


data.extend([
    [*generate_features(*sample_discon_edge()), 0]
    for _ in range(K)
])
data = np.array(data)
np.random.shuffle(data)
print(f"---- Sampling Finish in {time() - start:.2f}s ----")

---- Sampling Finish in 235.13s ----


In [15]:
from sklearn.linear_model import SGDRegressor

start = time()
regressor = SGDRegressor()
regressor.fit(data[:, :-1], data[:, -1])
print(f"---- Train Finish in {time() - start:.2f}s ----")

start = time()
ids, scores = [], []
with open("test-public.txt") as f:
    lines = f.read().split('\n')
    for line in lines[1:]:
        if line:
            pid, src, dest = list(line.split())
            ids.append(pid)
            print(f"{pid}: {src} -> {dest}")
            s = regressor.predict([generate_features(src, dest)])[0]
            if s < 0:
                s = 0
            elif s > 1:
                s = 1
            print(f"Logistic Regressor: {s}")
            scores.append(s)
print(f"---- Process Finish in {time() - start:.2f}s ----")

s = pd.Series(scores, ids)
s.name = "Prediction"
s.index.name = "Id"
s.to_csv("output-smaller.csv", header=True)



---- Train Finish in 0.34s ----
1: 2184483 -> 1300190
Logistic Regressor: 0.6487328855549095
2: 3151356 -> 1452193
Logistic Regressor: 0.9656618828455459
3: 1579396 -> 193159
Logistic Regressor: 0.5723697719846922
4: 1406432 -> 2481036
Logistic Regressor: 0.7156179088501029
5: 2389638 -> 593017
Logistic Regressor: 0.9642566364624918
6: 228206 -> 212805
Logistic Regressor: 1
7: 1237964 -> 879115
Logistic Regressor: 0.7111702044043237
8: 3318124 -> 1840575
Logistic Regressor: 0.6271984871111861
9: 4522929 -> 1552625
Logistic Regressor: 0.5417744016791789
10: 3406737 -> 3781412
Logistic Regressor: 0.6328263788467672
11: 1567983 -> 3008412
Logistic Regressor: 0.567448117459311
12: 4370932 -> 2906415
Logistic Regressor: 0.5834911847090332
13: 286240 -> 474307
Logistic Regressor: 0.8437411391504017
14: 4796773 -> 3115614
Logistic Regressor: 0.7612443448226909
15: 2207765 -> 4354226
Logistic Regressor: 1
16: 1283384 -> 279636
Logistic Regressor: 0.9495401943558655
17: 368544 -> 2431466
Logist