<center> <font color="blue"><h1> French Web Domain Classification </h1></font> </center>


<center> <h3> Done by: Saif Eddine GHRIBI, Mohamed Skander HELLAL & Ramzi Charradi </h3> </center>

### Imports

In [5]:
import random
import time
from pathlib import Path
from warnings import simplefilter

# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

from src.preprocess import read_data, pre_process_text, filter_text
from src.GCN import GCN,GAT,train
from src.appnptrain import APPNPTrainer

plt.style.use('ggplot')

### Settings

In [4]:
# --- Input files -----------------------------------------------------------
# Passed to read_data() in the "Preprocess Data" section.
train_data = "data/train_noduplicates.csv"
test_data = "data/test.csv"
texts_path = "data/text/text"

# --- Output files ----------------------------------------------------------
# NOTE(review): neither path is referenced by any other cell visible in this
# notebook — confirm whether a save step is missing or these can go.
train_preprocessed = "saved_files/train_graph.csv"
test_preprocessed = "saved_files/test_graph.csv"

# --- Text preprocessing ----------------------------------------------------
# All three values are forwarded unchanged to pre_process_text().
num_words = 2000
do_stem = False
do_tokenize = False

# --- Text filters ----------------------------------------------------------
# NOTE(review): the filtering cell calls filter_text() with literal thresholds
# (4 / 15 for train, 4 / 0 for test) rather than reading these two names.
min_word_length = 4
min_text_length = 20

# --- k-core decomposition --------------------------------------------------
# NOTE(review): not read by any other cell visible in this notebook.
do_kcore = False

### Preprocess Data

In [None]:
# Read the training and test tables via read_data(), timing the call so the
# cost of this (potentially slow) step is visible in the cell output.
# `time` is imported in the notebook's import cell.
start = time.time()
df, test_df = read_data(train_data, test_data, texts_path)
end = time.time()
print("Data loaded in {:.1f} s".format(end - start))

# Drop training rows that contain any missing value. The test frame is left
# untouched here, so its row count is unchanged by this cell.
df = df.dropna()

In [None]:
# Run pre_process_text() over the "text" column of both frames, using the
# preprocessing options from the settings cell. tqdm.pandas() registers
# `progress_apply`, which behaves like `apply` but draws a progress bar.
tqdm.pandas()
for message, frame in (
    ("Preprocessing training data...", df),
    ("Preprocessing test data...", test_df),
):
    print(message)
    frame["text"] = frame["text"].progress_apply(
        lambda raw_text: pre_process_text(raw_text, num_words, do_stem, do_tokenize)
    )

In [None]:
# Apply filter_text() to every document of both frames.
# NOTE(review): the thresholds below are literals; the settings cell defines
# min_word_length = 4 and min_text_length = 20, which this cell does not read
# (training uses 15, test uses 0) — confirm which values are intended.
tqdm.pandas()

print("Filtering training data...")
df["text"] = df["text"].progress_apply(
    lambda text: filter_text(text, min_word_length=4, min_text_length=15, dataset="train")
)
# Discard training rows whose text is exactly "No text" and renumber the index
# from 0 so that positional and label-based access agree afterwards.
# (Presumably "No text" is the marker filter_text() emits for rejected
# documents — verify in src.preprocess.)
df = df[df["text"] != "No text"].reset_index(drop=True)

print("Filtering test data...")
test_df["text"] = test_df["text"].progress_apply(
    lambda text: filter_text(text, min_word_length=4, min_text_length=0, dataset="test")
)

In [None]:
# Load the edge list as a directed, weighted graph.
graph = nx.read_weighted_edgelist("data/edgelist.txt", create_using=nx.DiGraph())

# Strip self-loops; the edges are materialised first so the removal does not
# run while the graph is still being iterated.
self_loops = list(nx.selfloop_edges(graph))
graph.remove_edges_from(self_loops)

# Report the size of the graph (nodes, then edges).
print(graph.number_of_nodes())
print(graph.number_of_edges())

In [None]:
# Node identifiers of the training rows, as strings (the graph loaded from the
# edge list is indexed by string node names).
train_nodes = [str(node) for node in df.node.values.tolist()]
nodelist = df.node.map(str).tolist()

# Restrict the graph to the training nodes, then build a dense adjacency
# matrix whose row/column order follows `nodelist`, i.e. the row order of df.
graph = graph.subgraph(train_nodes)
sparse_adj = nx.adjacency_matrix(graph, nodelist=nodelist)
adj = torch.FloatTensor(sparse_adj.toarray())

In [None]:
# TF-IDF bag-of-words features for the training texts. Terms appearing in
# fewer than 10 or more than 1000 documents are ignored, and at most 3000
# terms are kept.
vec = TfidfVectorizer(
    decode_error='ignore',
    strip_accents='unicode',
    encoding='latin-1',
    min_df=10,
    max_df=1000,
    max_features=3000,
)
tfidf_matrix = vec.fit_transform(df.text)

# Densify and convert to a float tensor: one row per training document.
features = torch.FloatTensor(tfidf_matrix.toarray())

In [None]:
# Class labels of the training rows as an integer (long) tensor.
labels = torch.LongTensor(df.label)

### Graph Convolutional Networks

In [None]:
# Seed every random number generator in play so runs are repeatable.
# `random` and `optim` are imported in the notebook's import cell.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.
use_gpu = torch.cuda.is_available()
if use_gpu:
    torch.cuda.manual_seed(42)
device = torch.device("cuda" if use_gpu else "cpu")

epochs = 20

# Drop references to any previously built model/optimizer before creating new
# ones (relevant when this cell is re-run).
model, optimizer = None, None

model = GCN(
    nfeat=features.shape[1],  # one input unit per TF-IDF column
    nhid=64,
    nclass=8,  # assumes 8 label classes — TODO confirm against df.label
    dropout=0.5,
    init='xavier',
)

optimizer = optim.SGD(
    model.parameters(),
    lr=5e-3,
    weight_decay=5e-4,
    momentum=0.9,
)

# Move the model and all tensors it consumes onto the selected device.
model = model.to(device)
features = features.to(device)
adj = adj.to(device)
labels = labels.to(device)

In [None]:
# Train the model built in the previous cell: one train() call per epoch,
# with the epoch index passed through as the last argument.
for epoch in range(epochs):
    train(model, optimizer, features, adj, labels, epoch)

### Graph Attention Networks

In [None]:
# Drop references to the previous model/optimizer before building the GAT.
model, optimizer = None, None

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GAT(
    nfeat=features.shape[1],  # one input unit per TF-IDF column
    nhid=64,
    nclass=8,  # assumes 8 label classes — TODO confirm against df.label
    dropout=0.5,
    alpha=0.2,
    nheads=8,
)

# Same optimiser settings as the GCN section. `optim` is imported in the
# notebook's import cell.
optimizer = optim.SGD(
    model.parameters(),
    lr=5e-3,
    weight_decay=5e-4,
    momentum=0.9,
)

# Move the model and all tensors it consumes onto the selected device.
model = model.to(device)
features = features.to(device)
adj = adj.to(device)
labels = labels.to(device)

In [None]:
# Train whichever model the preceding cell built, for `epochs` epochs; the
# epoch index is passed through to train() as the last argument.
for epoch in range(epochs):
    train(model, optimizer, features, adj, labels, epoch)

### APPNP

In [None]:
# Hyper-parameters handed to APPNPTrainer in the cell that builds the trainer.
# NOTE(review): `model` and `epochs` rebind names used by the GCN/GAT cells
# (there `model` is a network object and `epochs` is 20); re-running those
# training cells after this one will pick up the values below.
model = "exact"
layers = [64, 64]
dropout = 0.5

# Propagation settings.
iterations = 10
alpha = 0.1

# Optimisation settings.
epochs = 2000
early_stopping_rounds = 1000
learning_rate = 0.01
lambd = 0.005

# Data split.
train_size = 1500

# NOTE(review): `seed` is not passed to APPNPTrainer in the visible code.
seed = 42

In [None]:
# Feature dictionary: for every training row, map its node id to the list of
# TF-IDF column indices whose value is non-zero.
feature_dict = {}
for row in range(adj.shape[0]):
    row_values = features[row].tolist()
    feature_dict[df.node[row]] = [
        column for column, value in enumerate(row_values) if value != 0
    ]

# Target labels, kept as the pandas column.
target = df.label

In [None]:
# Build the APPNP trainer from the training subgraph, the per-node feature
# dictionary and the targets, then fit it. The second argument is
# `feature_dict` (built in the previous cell); no variable named `feature`
# exists in this notebook.
appnp = APPNPTrainer(
    graph,
    feature_dict,
    target,
    model,
    layers,
    dropout,
    iterations,
    alpha,
    train_size,
    lambd,
    learning_rate,
    epochs,
    early_stopping_rounds,
)
appnp.fit()