In [7]:
# https://pypi.org/project/torch-geometric/
# pip install torch
# pip install torch-scatter -f https://data.pyg.org/whl/torch-1.13.0+cpu.html
# pip install torch-sparse -f https://data.pyg.org/whl/torch-1.13.0+cpu.html
# pip install torch-geometric


# https://towardsdatascience.com/louvains-algorithm-for-community-detection-in-python-95ff7f675306
import torch
from torch import Tensor
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import Planetoid
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from collections import Counter 

# 一、Data Inclusion
<br>
先將 training data include 進來，可以使用 print 看一下 train_data 長甚麼樣子，基本上就是以下這個 format <p>circleID: friend1 friend2 friend3 ... <p/>

總共有57種 features<br>

In [8]:
import os

# Include Training dataset
target_dir = os.getcwd()+"\Training"

data = []
for filename in os.listdir(target_dir):
    with open(os.path.join(target_dir, filename), 'r') as f:
        data += f.readlines()

train_data = []
for i in range(len(data)):
    train_data.append(data[i].split()[1:]) # circle* is seems not nesseary

print(f"number of trainin data: {len(train_data)}")
    
    
    
# Total is 57 kinds of features

with open("featureList.txt", 'r') as f:
    feature_list = f.readlines()

for i in range(len(feature_list)):
    feature_list[i] = feature_list[i].replace('\n', '')

print(f"number of features : {len(feature_list)}")


number of trainin data: 592
number of features : 57


# 二、Data Preprocessing

之後可以在這裡做一些 feature selection 之類的、或者視覺化<br>
有一些 features 是 NaN，可以做一些處理

舉例:   
26321.circles 和 25773.circles 都有circle 10分別是  
26321.circles -> circle10: 26379 26455 26482 26340 26409 26366 26390 26477 26415 26462 26402  
25773.circles -> circle10: 25787 25924 25947 25836 25817 25857 25902 25815 25821 25774 25961 ...
  
應該要將同樣 circle 的 user 加在一起，因為他們都是同一個 circle  
  
在沒這樣做之前，circle會重複，所以會誤以為是 592個，但實際上應該是455個才對

In [9]:
# 先找出重複的circle有幾個，再針對他們做合併，應該比較快，就不用一個一個還要重複掃
tes = []
for i in range(len(data)):
    tes.append(data[i].split(':')[0])

a = Counter(tes).most_common()[:46] # 有重複的總共46個
b = []
for i in range(len(a)):
    b.append(a[i][0])
print(f"Show the fist 5 elements: {b[:5]}")
print(f"Actually circle numbers: {len(Counter(tes))} not {len(tes)}")

Show the fist 5 elements: ['circle13', 'circle18', 'circle62', 'circle10', 'circle74']
Actually circle numbers: 455 not 592


In [10]:
temp_a = []

for target in b:
    temp = []
    for i in range(len(data)):
        circle_name = data[i].split(':')[:1][0]
        if circle_name == target:
            temp += data[i].split()[1:]

    temp_a.append(temp)

for i in range(len(data)):
    circle_name = data[i].split(':')[:1][0]
    
    if circle_name not in b: #如果 circle name 沒有重複的名單中，加入到list中
        temp_a.append(data[i].split()[1:])
        
print(f"Examine if the number is correct : {len(temp_a)}. The answer is yes, we eliminate repeated circles.")

Examine if the number is correct : 455. The answer is yes, we eliminate repeated circles.


#### label每個node

In [11]:
# initialization every user's label

def label_data(x, circle): # x: number of dataset, circle: temp_a
    y_label = [0] * x #27520

    for i in range(len(circle)):
        for item in circle[i]:
            y_label[int(item)] = i+1
    return y_label

# 預想是說一個 user 應該會對應到一個 circle
# 但 0 特別多，代表有很多 user 沒有標到
# 下面 aaaa 是在算 train_data 中所有的 user，train_data代表將所有.circles的檔案合成一個list
# print(len(y_label)-Counter(y_label).most_common()[0][1])

aaaa = []
for i in train_data:
    aaaa+=i
#print(len(Counter(aaaa)))

#### Include 每個 Node 對應的 features，並且把它包成 DataFrame，但後續還需要做處理...<br>

In [15]:
# Include each user's features 
    
fea_dir = "features\\features.txt"
features = []

with open(fea_dir, 'r') as f:
    features += f.readlines()

a = [] 

for i in range(len(features)):
    # split each user features by space
    a.append(features[i].split()) 

final = []

# make a dict that key is feature and value is corresponeding value
for j in range(len(a)):
    b_col = []
    b_row = []
    for i in range(len(a[j][1:])):
        b_col.append(a[j][1:][i][::-1].split(';', 1)[1][::-1])
        b_row.append(a[j][1:][i].split(';')[-1])

    final.append(dict(zip(b_col, b_row)))
    
df = pd.DataFrame(final)
print(f"number of user:  {len(df)}")
#df # there are many NaN
df_new = df.fillna(-1)

df_new

number of user:  27520


Unnamed: 0,last_name,first_name,birthday,name,gender,locale,hometown;name,hometown;id,education;school;name,education;school;id,...,work;projects;from;name,work;projects;from;id,education;classes;description,work;from;name,work;from;id,political,religion,work;projects;end_date,work;projects;description,location
0,0,0,0,0,0,0,0,0,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,1,1,1,1,1,0,1,1,3,3,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,2,2,2,2,1,0,2,2,4,4,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,3,3,3,3,1,0,3,3,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,4,4,4,4,0,0,4,4,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27515,4416,3040,4794,27280,1,1,-1,-1,4174,22356,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
27516,17149,2944,-1,27281,0,0,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
27517,17150,470,-1,27282,1,1,133,133,395,405,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
27518,17151,3002,3531,27283,0,8,4299,4313,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [16]:
y_label = label_data(len(df), temp_a)

In [17]:
# 刪除資料集沒有circle的user
abc = df_new.copy()

# assign "label" as a feature in dataframe
abc = abc.assign(label = y_label)

# drop the rows if value in label is zero, and direct delete data
abc.drop(abc[abc.label==0].index, inplace=True)

In [18]:
def select_data(n): # select the dataframe based on its label frequency
    top_n = Counter(abc['label']).most_common()[:n]
    top_n_label = [top_n[i][0] for i in range(len(top_n))]
    
    options = top_n_label
    rslt_df = abc[abc['label'].isin(options)] # select features which is in top_n_label
    return rslt_df

In [19]:
rslt_df50 = select_data(50) # choose user whose belonging circle is top 50 in all circle (total 455)
rslt_df100 = select_data(100)
rslt_df150 = select_data(150)

# 三、Model

### For scoring

In [23]:
#https://ithelp.ithome.com.tw/m/articles/10265920

In [30]:
# scoring = [''...]
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score

models = [DecisionTreeClassifier(),GaussianNB(),  SVC()]
names = ["Decision Tree"] # SVM 太慢又不準，先刪除，Naive Bayes 一點都不準

def model(df_new, y_label_new, n_splits = 10, shuffle=True):

    X, y = np.array(df_new), np.array(y_label_new) 
    k_folds = KFold(n_splits = n_splits, shuffle=True, random_state=True)
    
    for model, name in zip(models, names):
        print(name)
        for score in ["accuracy", "recall_micro", "precision_micro"]:
            print(score + ": " , end ="")
            
            # 下面不太準，因為每個 score 又重新訓練一次，可能要優化一下，可是我懶得優化了
            print("{:.2%}".format((cross_val_score(model, X, y,scoring=score, cv=k_folds).mean())))
        print()


In [31]:
model(rslt_df50[rslt_df50.columns[:-1]], rslt_df50['label'])

Decision Tree
accuracy: 66.38%
recall_micro: 66.14%
precision_micro: 66.34%

