In [63]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [64]:
%%capture
!pip install optuna

In [1]:
import csv
import networkx as nx
import numpy as np
from sklearn.linear_model import LogisticRegression
import optuna
from sklearn.model_selection import cross_val_score, KFold


In [2]:
# file_path = os.path.join(os.path.expanduser("~"), "Desktop", "data_challenge", "train.txt")

# read training data
train_domains = list()
y_train = list()
# with open("/content/drive/Othercomputers/My Computer/Masters_Staff/trimester_3/Data_challenge/project/data/train.txt", 'r') as f:
with open("../../../../../data_challenge/train.txt", 'r') as f:    
    for line in f:
        l = line.split(',') # domain names
        train_domains.append(l[0])
        y_train.append(l[1][:-1]) # topics of domain names

# read test data
test_domains = list()
# with open("/content/drive/Othercomputers/My Computer/Masters_Staff/trimester_3/Data_challenge/project/data/test.txt", 'r') as f:
with open("../../../../../data_challenge/test.txt", 'r') as f:    
    for line in f:
        l = line.split(',')
        test_domains.append(l[0])

# create a directed graph
# G = nx.read_edgelist('/content/drive/Othercomputers/My Computer/Masters_Staff/trimester_3/Data_challenge/project/data/edgelist.txt', delimiter=' ',
#                       create_using=nx.DiGraph())

G = nx.read_edgelist('../../../../../data_challenge/edgelist.txt', delimiter=' ',
                      create_using=nx.DiGraph())

print('Number of nodes:', G.number_of_nodes())
print('Number of edges:', G.number_of_edges())

Number of nodes: 65208
Number of edges: 1642073


## Read textual data from domains

In [3]:
import csv
import re
import zipfile
from io import BytesIO
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# read textual content of webpages of domain names
text = dict()
# with zipfile.ZipFile('/content/drive/Othercomputers/My Computer/Masters_Staff/trimester_3/Data_challenge/project/data/domains.zip', "r") as zfile:
    
with zipfile.ZipFile('../../../../../data_challenge/domains.zip', "r") as zfile:
    for filename in zfile.namelist():
        if re.search(r'\.zip$', filename) is not None:
            zfiledata = BytesIO(zfile.read(filename))
            with zipfile.ZipFile(zfiledata) as zfile2:
                text[filename[:-4]] = ''
                for name2 in zfile2.namelist():
                    file = zfile2.read(name2)
                    text[filename[:-4]] += file.decode('utf16') + ' '

# retrieve textual content of domain names of the training set
train_data = list()
for domain in train_domains:
    if domain in text:
        train_data.append(text[domain])
    else:
        train_data.append('')

# retrieve textual content of domain names of the test set
test_data = list()
for domain in test_domains:
    if domain in text:
        test_data.append(text[domain])
    else:
        test_data.append('')

# to reduce memory
text = None

### Degree Centrality
Degree centrality is a measure of the importance of a node within a network. It is simply the number of edges connected to a node, normalized by the maximum possible degree of the node

In [4]:
deg_central = nx.degree_centrality(G)
for key, value in list(deg_central.items())[:5]:
    print(key, ":", value)

blog.com.gr : 0.0007821246185225512
fmvoice.gr : 0.0038952873157789805
papakishop.gr : 0.0012421979235358166
rizospastis.gr : 0.032496511110770315
taxheaven.gr : 0.022436241507813575


In [5]:
sorted(deg_central.items(), key= lambda kv: -kv[1]) # sort backwards

[('pblogs.gr', 0.18373794224546444),
 ('sch.gr', 0.1619611391415032),
 ('google.gr', 0.1267041882006533),
 ('in.gr', 0.12066189212814574),
 ('ingreece24.gr', 0.12032450503780268),
 ('snn.gr', 0.10701305074608554),
 ('tovima.gr', 0.10293373410830124),
 ('kathimerini.gr', 0.10028064471605809),
 ('ethnos.gr', 0.09597129142576716),
 ('tanea.gr', 0.08124894566534267),
 ('protothema.gr', 0.08052816415415522),
 ('bizdirectory.gr', 0.07723097213489348),
 ('uoa.gr', 0.07476191206465563),
 ('enet.gr', 0.07460855429631788),
 ('in2life.gr', 0.07437851764381125),
 ('madata.gr', 0.07333568481911451),
 ('urlj.gr', 0.07223150888708267),
 ('lifo.gr', 0.0702071863450243),
 ('news247.gr', 0.07006916435352033),
 ('iefimerida.gr', 0.06970110570950971),
 ('skai.gr', 0.06899565997515604),
 ('naftemporiki.gr', 0.06853558667014277),
 ('zougla.gr', 0.06738540340760961),
 ('ert.gr', 0.06364347386016839),
 ('otenet.gr', 0.06315272900148756),
 ('auth.gr', 0.06237060438296502),
 ('newsbeast.gr', 0.06151180088027359

In [6]:
print(y_train[:5]) # topics of each of the labeled domain namess
print(test_domains[:1])
print(train_domains[:1])

['0', '5', '5', '0', '3']
['startupper.gr']
['autocarnet.gr']


In [7]:
from sklearn.model_selection import train_test_split

# create the training matrix. Each row corresponds to a web host.
# use the following 3 features for each web host:
# (1) int-degree of node
# (2) out-degree of node
# (3) average degree of neighborhood of node
X_train = np.zeros((len(train_domains), 3))
avg_neig_deg = nx.average_neighbor_degree(G, nodes=train_domains)
for i in range(len(train_domains)):
    X_train[i,0] = G.in_degree(train_domains[i])
    X_train[i,1] = G.out_degree(train_domains[i])
    X_train[i,2] = avg_neig_deg[train_domains[i]]
    # X_train[i,3] = round(deg_central_train[train_domains[i]], 5)

# create the test matrix. Use the same 3 features as above
Xtest = np.zeros((len(test_domains), 3))
avg_neig_deg = nx.average_neighbor_degree(G, nodes=test_domains)
for i in range(len(test_domains)):
    Xtest[i,0] = G.in_degree(test_domains[i])
    Xtest[i,1] = G.out_degree(test_domains[i])
    Xtest[i,2] = avg_neig_deg[test_domains[i]]

#### Create dataframes contain graph and text info

In [8]:
import pandas as pd
trains = pd.DataFrame({'domain_name':train_domains, 'in_deg':X_train[:,0], 'out_deg':X_train[:,1], 'avg_neig_deg':X_train[:,2], 'text':train_data, 'target':y_train})
tests = pd.DataFrame({'domain_name':test_domains, 'in_deg':Xtest[:, 0], 'out_deg':Xtest[:,1], 'avg_neig_deg':Xtest[:, 2], 'text':test_data})

#### Preprocessing


In [9]:
import pandas as pd
import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# initialize objects
stemmer = WordNetLemmatizer()
stop_words = set(stopwords.words('greek'))

def preprocess(contents):
  docs = []

  for doc in contents:

      document = re.sub(r'\W', ' ', str(doc))# remove non-word (special) characters such as punctuation, numbers etc
      document = re.sub(r'\s+br\s+',' ', str(document)) # remove HTML <BR>
      document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document) # remove all single characters
      document = re.sub(r'\b\d+\b', ' ', document) # remove numbers
      document = re.sub(r'\s+', ' ', document, flags=re.I) #re.I -> ignore case  and substitute multiple spaces with single space
      document = document.lower() # convert to Lowercase
      word_list = word_tokenize(document)# split the document based on whitespaces (--> List of words)
      word_list = [word for word in word_list if word not in (stop_words)]
      # Lemmatization
      # word_list = [stemmer.lemmatize(word) for word in word_list]
      document = ' '.join(word_list) # reconstruct the document by joining the words on each whitespace
      docs.append(document) # append all documents into a list 'docs'
      
  return pd.Series(docs)




[nltk_data] Downloading package punkt to /home/meizeus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/meizeus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/meizeus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# preprocess texts
trains['text'] = preprocess(trains['text'])
tests['text'] = preprocess(tests['text'])

In [58]:
tests.shape

(605, 5)

#### Split data

### TOKENIZATION + NaN-removal

In [11]:
from nltk.tokenize import word_tokenize
trains['tokens']  = trains['text'].apply(lambda x: word_tokenize(x, language='greek'))
trains['tokens'][:2]

# filter out rows with empty lists in the 'tokens' column
trains = trains[trains['tokens'].apply(lambda x: len(x) > 0)]
trains = trains.reset_index(drop=True) # reset index of the filtered DataFrame
display(trains.info())
trains.sample()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1506 entries, 0 to 1505
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   domain_name   1506 non-null   object 
 1   in_deg        1506 non-null   float64
 2   out_deg       1506 non-null   float64
 3   avg_neig_deg  1506 non-null   float64
 4   text          1506 non-null   object 
 5   target        1506 non-null   object 
 6   tokens        1506 non-null   object 
dtypes: float64(3), object(4)
memory usage: 82.5+ KB


None

Unnamed: 0,domain_name,in_deg,out_deg,avg_neig_deg,text,target,tokens
1038,egnomi.gr,240.0,4.0,200.75,http www egnomi gr article kypello_eyboias_sfy...,3,"[http, www, egnomi, gr, article, kypello_eyboi..."


#### Train - Dev split

In [12]:
from sklearn.model_selection import train_test_split

Xsplit = trains.drop(['target'], axis=1)

# split data to train and dev
Xtrain, Xdev, ytrain, ydev = train_test_split(
    Xsplit,
    trains['target'],
    test_size=0.10, random_state=42)

print("Train matrix dimensionality: ", Xtrain.shape)
print("Dev matrix dimensionality: ", Xdev.shape)
print("Test matrix dimensionality: ", Xtest.shape)  # This line has an issue


Train matrix dimensionality:  (1355, 6)
Dev matrix dimensionality:  (151, 6)
Test matrix dimensionality:  (605, 3)


### Imbalanced Dataset


**classes {0,7,6,4} are significantly smaller that majority class (class-3) and class-1 is rougly 25% of majority class**

In [None]:
from collections import Counter
import seaborn as sns

c_counts = Counter(y_train)
display(c_counts)

Counter({'3': 548,
         '5': 280,
         '2': 263,
         '8': 189,
         '1': 140,
         '4': 113,
         '7': 99,
         '6': 98,
         '0': 82})

#### Encode targets (for NNs etc.)

In [21]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False)

ytrain_en = encoder.fit_transform(np.array(ytrain).reshape(-1, 1)) # reshape y_train
ydev_en = encoder.fit_transform(np.array(ydev).reshape(-1, 1)) # reshape as y_dev

print("\tEncoded y_dev:")
print(ydev_en[:2])
print("\n")
print("\tEncoded y_dev:")
print(ytrain_en[:2])

	Encoded y_dev:
[[0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]]


	Encoded y_dev:
[[0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]]


### Fasttext

In [13]:
import fasttext
import gzip

# m_path = '../../../../../big_files_DS/cc.el.300.bin.gz'
# with gzip.open(m_path, 'rb') as f:
#     model_data = f.read()
# ft_model = fasttext.load_model(model_data)

ft_model = fasttext.load_model('../../../../../big_files_DS/cc.el.300.bin');



In [15]:
# func to use fasttext to use vectorize text
def ft_vectorization(X_text):
    X_vec = []

    for index, row in X_text.iterrows():
        text = row['text']
        vector = ft_model.get_sentence_vector(text) # row is sentence
        X_vec.append(vector)

    return np.array(X_vec)

# vectorize each text set
Xtrain_vec = ft_vectorization(Xtrain)
Xdev_vec = ft_vectorization(Xdev)
Xtest_vec = ft_vectorization(tests)

np.array(Xtrain_vec).shape

(1355, 300)

### SMOTE

In [None]:
# %%capture
# %pip install imblearn

In [None]:
# example implementation of SOTE technique
from imblearn.over_sampling import SMOTE

method = SMOTE(k_neighbors = 3)
Xsmote, ysmote = method.fit_resample(Xtrain_vec, ytrain['target'])

print(Xsmote.shape,ysmote.shape) # new shape

(3591, 300) (3591,)


### TF-IDF vectorization

In [16]:
# create the training matrix. Each row corresponds to a domain name and each column to a word present in at least 10 webpages
# and at most 50 webpages of domain names. The value of each entry in a row is equal to the tf-idf weight of that word in the
# corresponding domain
vec = TfidfVectorizer(ngram_range=(1, 3),
                      decode_error='ignore',
                      max_features=5000,
                      strip_accents='unicode',
                      min_df=10, max_df=50)

# vectorize the text sets
Xtrain_tf = vec.fit_transform(Xtrain['text'])
Xdev_tf = vec.transform(Xdev['text'])
Xtest_tf = vec.transform(tests['text'])

print("Train matrix dimensionality: ", Xtrain_tf.shape)
print("Dev matrix dimensionality: ", Xdev_tf.shape)
print("Test matrix dimensionality: ", Xtest_tf.shape)

Train matrix dimensionality:  (1355, 5000)
Dev matrix dimensionality:  (151, 5000)
Test matrix dimensionality:  (605, 5000)


### SVD for dimensionality reduction

In [None]:
# reduce dimensionality using SVD
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=400, random_state=4321)
Xtrain_svd = svd.fit_transform(Xtrain_tf)
Xdev_svd = svd.transform(Xdev_tf)
Xtest_svd = svd.transform(Xtest_tf)

print(Xtrain_svd.shape, type(Xtrain_svd))
print(Xdev_svd.shape, type(Xdev_svd))
print(Xtest_svd.shape, type(Xtest_svd))



<h2 align='center'> Text - Logistic Regression (LR)</h2>


In [18]:
import optuna
def objective(trial, X, y):
    # define hyperparameters to optimize
    C = trial.suggest_float('C', 0.01, 100, log=True)
    penalty = trial.suggest_categorical('penalty', ['l2'])
    max_iter = trial.suggest_int('max_iter', 1000, 5000)

    model = LogisticRegression(C=C, penalty=penalty, max_iter=max_iter, solver='liblinear')

    # perform cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

    # return average accuracy as the objective value
    return scores.mean()

# begin studies
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, Xtrain_tf, ytrain), n_trials=30)

best_params = study.best_params
best_score = study.best_value

print("Best parameters:", best_params)

print("Best score:", best_score)


[I 2024-04-23 22:09:37,059] A new study created in memory with name: no-name-a5c331c9-3fea-4b07-86ec-6e242e9cac52
[I 2024-04-23 22:09:37,277] Trial 0 finished with value: 0.30405904059040595 and parameters: {'C': 0.02485919763102707, 'penalty': 'l2', 'max_iter': 1483}. Best is trial 0 with value: 0.30405904059040595.
[I 2024-04-23 22:09:37,872] Trial 1 finished with value: 0.5232472324723247 and parameters: {'C': 4.264802537740301, 'penalty': 'l2', 'max_iter': 2363}. Best is trial 1 with value: 0.5232472324723247.
[I 2024-04-23 22:09:38,119] Trial 2 finished with value: 0.30405904059040595 and parameters: {'C': 0.019443031152750385, 'penalty': 'l2', 'max_iter': 3672}. Best is trial 1 with value: 0.5232472324723247.
[I 2024-04-23 22:09:38,392] Trial 3 finished with value: 0.30774907749077485 and parameters: {'C': 0.153749841417026, 'penalty': 'l2', 'max_iter': 1396}. Best is trial 1 with value: 0.5232472324723247.
[I 2024-04-23 22:09:39,085] Trial 4 finished with value: 0.52619926199261

Best parameters: {'C': 38.01489671195558, 'penalty': 'l2', 'max_iter': 4741}
Best score: 0.5394833948339484


In [22]:
from sklearn.metrics import classification_report

# simple LR training && prediction(dev & test)
lr = LogisticRegression(max_iter=1500)
lr.fit(Xtrain_tf, ytrain)
ydev_lr = lr.predict_proba(Xdev_tf)
ytest_lr = lr.predict_proba(Xtest_tf)

# cross-val LR training && prediction (dev & test)
lr_cv = LogisticRegression(**best_params)
lr_cv.fit(Xtrain_tf, ytrain)

ydev_lrcv = lr_cv.predict_proba(Xdev_tf) # predict on dev
ytest_lrcv = lr_cv.predict_proba(Xtest_tf) # predict on test

In [23]:
# the dev set is -0.15 lower than test set on kaggle

# cross-entropy loss for multi-class classification
def multiclass_cross_entropy(y_true, y_pred_prob):
    epsilon = 1e-15  # small value to prevent log(0)
    # clip predicted probabilities to avoid log(0)
    y_pred_prob = np.clip(y_pred_prob, epsilon, 1 - epsilon)
    # compute cross-entropy loss
    loss = -np.mean(np.sum(y_true * np.log(y_pred_prob), axis=1))
    return loss

cross_entropy_loss1 = multiclass_cross_entropy(ydev_en, ydev_lr)
cross_entropy_loss2 = multiclass_cross_entropy(ydev_en, ydev_lrcv)

print(f"Cross Entropy loss for Simple LR: {cross_entropy_loss1}")
print(f"Cross Entropy loss for Cross-Val LR: {cross_entropy_loss2}")


Cross Entropy loss for Simple LR: 1.563823633828987
Cross Entropy loss for Cross-Val LR: 1.384521027088947


<h2 align='center'> Text - Support Vector Machines (SVM)</h2>


In [None]:
from sklearn.svm import SVC

def objective(trial, X, y):
    # Define hyperparameters to optimize
    C = trial.suggest_float('C', 0.01, 10, log=True)
    kernel = trial.suggest_categorical('kernel', ['rbf', 'poly', 'sigmoid'])
    gamma = trial.suggest_float('gamma', 0.01, 100, log=True)

    # Create SVM model with hyperparameters
    model = SVC(C=C, kernel=kernel, gamma=gamma)

    # Perform cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

    # Return average accuracy as the objective value
    return scores.mean()

# Assuming X_train and y_train are defined elsewhere
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, Xtrain_tf, ytrain), n_trials=40)

# Get the best parameters found during optimization
best_params = study.best_params

In [47]:

# simple SVC training && prediction(dev & test)
svc = SVC(probability=True)
svc.fit(Xtrain_tf, ytrain)
ydev_svc= svc.predict_proba(Xdev_tf)
ytest_svc = svc.predict_proba(Xtest_tf)

# cross-val SVC training && prediction (dev & test)
best_svc= SVC(**best_params, probability=True)
best_svc.fit(Xtrain_tf, ytrain)

ydev_b_svc = best_svc.predict_proba(Xdev_tf) # predict on dev
ytest_b_svc = best_svc.predict_proba(Xtest_tf) # predict on test

# calculate cross entropy
cross_entropy_loss1 = multiclass_cross_entropy(ydev_en, ydev_svc)
cross_entropy_loss2 = multiclass_cross_entropy(ydev_en, ydev_b_svc)


print(f"Cross Entropy loss for Simple SVC: {cross_entropy_loss1}")
print(f"Cross Entropy loss for Cross-Val SVC: {cross_entropy_loss2}")

<h2 align='center'> Text - Random Forests Classifier (RFC) </h2>

In [None]:
from sklearn.ensemble import RandomForestClassifier

def objective(trial, X, y):
    # define hyperparameters to optimize
    n_estimators = trial.suggest_int('n_estimators', 50, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 100)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 50)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)

    # create Random Forest model with hyperparameters
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                   min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)

    # perform cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')


    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, Xsmote, ysmote),
                n_trials=20)

best_params = study.best_params # best parameters found

In [None]:
# simple RFC training && prediction(dev)
rfc = RandomForestClassifier()
rfc.fit(Xsmote, ysmote)
ydev_rfc = rfc.predict_proba(Xdev_vec)
ytest_rfc = rfc.predict_proba(Xtest_vec)

# cross-val RFC training && prediction (dev & test)
best_rfc = RandomForestClassifier(**best_params)
best_rfc.fit(Xsmote, ysmote)

ydev_b_rfc = best_rfc.predict_proba(Xdev_vec) # predict on dev
ytest_b_rfc = best_rfc.predict_proba(Xtest_vec) # predict on test



cross_entropy_loss1 = multiclass_cross_entropy(ydev_en, ydev_rfc)
cross_entropy_loss2 = multiclass_cross_entropy(ydev_en, ydev_b_rfc)

print(f"Cross Entropy loss for Simple RFC: {cross_entropy_loss1}")
print(f"Cross Entropy loss for Cross-Val RFC: {cross_entropy_loss2}")

2024-04-22 13:25:26.162778: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-22 13:25:26.605326: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-22 13:25:27.578717: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/meizeus/miniconda3/envs/tf_env/lib/:/home/meizeus/miniconda3/envs/tf_env/lib/
2024-04-22 13:25:27.579651: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'lib

Cross Entropy loss for Simple RFC: 1.3186117134381206
Cross Entropy loss for Cross-Val RFC: 1.2913366562040212


<h2 align='center'> Text - Extreme Gradient Boosting (XGB) </h2>

In [None]:
%pip install xgboost

In [79]:
import xgboost as xgb
import optuna
from sklearn.model_selection import KFold, cross_val_score

def objective(trial, X, y):
    # hyperparameters to optimize
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    # XGBoost model with hyperparameters
    model = xgb.XGBClassifier(**params)

    # cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

    return scores.mean()


study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, Xtrain_tf, ytrain.astype(int)),
               n_trials=10)

best_params = study.best_params  # best params


[I 2024-04-23 23:35:54,840] A new study created in memory with name: no-name-92fd79a6-8305-4c2a-8c4c-71db1c6a3aca
[I 2024-04-23 23:39:00,254] Trial 0 finished with value: 0.5793357933579336 and parameters: {'n_estimators': 312, 'max_depth': 10, 'learning_rate': 0.03488698940134258, 'subsample': 0.8844072438281029, 'colsample_bytree': 0.852226019848388, 'gamma': 0.010822832144741248, 'reg_alpha': 0.004625961829841675, 'reg_lambda': 0.0012732722637607934, 'min_child_weight': 2}. Best is trial 0 with value: 0.5793357933579336.
[I 2024-04-23 23:40:24,131] Trial 1 finished with value: 0.44723247232472324 and parameters: {'n_estimators': 927, 'max_depth': 4, 'learning_rate': 0.04310189519532882, 'subsample': 0.877897825777705, 'colsample_bytree': 0.7531391500284812, 'gamma': 1.334757665373186e-08, 'reg_alpha': 0.07508281970337145, 'reg_lambda': 7.787272278916647e-07, 'min_child_weight': 8}. Best is trial 0 with value: 0.5793357933579336.
[I 2024-04-23 23:42:34,466] Trial 2 finished with valu

In [80]:
# simple XGB training && prediction(dev)
Xbg = xgb.XGBClassifier()
Xbg.fit(Xtrain_tf, ytrain.astype(int))
ydev_xgb= Xbg.predict_proba(Xdev_tf)
ytest_xgb = Xbg.predict_proba(Xtest_tf)

# cross-val XGB training && prediction (dev & test)
best_xgb = xgb.XGBClassifier(**best_params)
best_xgb.fit(Xtrain_tf, ytrain.astype(int))

ydev_b_xgb = best_xgb.predict_proba(Xdev_tf) # predict on dev
ytest_b_xgb = best_xgb.predict_proba(Xtest_tf) # predict on test


cross_entropy_loss1 = multiclass_cross_entropy(ydev_en, ydev_xgb)
cross_entropy_loss2 = multiclass_cross_entropy(ydev_en, ydev_b_xgb)

print(f"Cross Entropy loss for Simple XGB: {cross_entropy_loss1}")
print(f"Cross Entropy loss for Cross-Val XGB: {cross_entropy_loss2}")

Cross Entropy loss for Simple XGB: 1.37209509138445
Cross Entropy loss for Cross-Val XGB: 1.1555710756852113


<h2 align='center'> Text - Ada Boosting Classifier (Ada) </h2>

In [107]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import KFold, cross_val_score
import optuna

def objective(trial, X, y):
    # Hyperparameters to optimize
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'algorithm': 'SAMME'
    }

    # AdaBoost model with hyperparameters
    model = AdaBoostClassifier(**params, random_state=42)

    # Cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, Xtrain_tf, ytrain.astype(int)),
               n_trials=15)

best_params = study.best_params  # Best params

[I 2024-04-24 00:56:19,257] A new study created in memory with name: no-name-3c3e8f72-76cd-4131-95b1-8ab06cc4c01a
[I 2024-04-24 00:56:58,309] Trial 0 finished with value: 0.30848708487084875 and parameters: {'n_estimators': 549, 'learning_rate': 0.045348463074254}. Best is trial 0 with value: 0.30848708487084875.
[I 2024-04-24 00:57:03,368] Trial 1 finished with value: 0.30922509225092254 and parameters: {'n_estimators': 69, 'learning_rate': 0.012948976434218398}. Best is trial 1 with value: 0.30922509225092254.
[I 2024-04-24 00:57:48,131] Trial 2 finished with value: 0.3077490774907749 and parameters: {'n_estimators': 624, 'learning_rate': 0.035161746255972084}. Best is trial 1 with value: 0.30922509225092254.
[I 2024-04-24 00:58:26,242] Trial 3 finished with value: 0.3077490774907749 and parameters: {'n_estimators': 517, 'learning_rate': 0.061506012853640095}. Best is trial 1 with value: 0.30922509225092254.
[I 2024-04-24 00:58:33,613] Trial 4 finished with value: 0.3084870848708487 

In [108]:
import xgboost as xgb
# simple ADA training && prediction(dev)
ada = AdaBoostClassifier(algorithm='SAMME')
ada.fit(Xtrain_tf, ytrain.astype(int))
ydev_ada= ada.predict_proba(Xdev_tf)
ytest_ada = ada.predict_proba(Xtest_tf)

# cross-val ADA training && prediction (dev & test)
best_ada = AdaBoostClassifier(**best_params, algorithm='SAMME')
best_ada.fit(Xtrain_tf, ytrain.astype(int))

ydev_b_ada = best_ada.predict_proba(Xdev_tf) # predict on dev
ytest_b_ada = best_ada.predict_proba(Xtest_tf) # predict on test



cross_entropy_loss1 = multiclass_cross_entropy(ydev_en, ydev_ada)
cross_entropy_loss2 = multiclass_cross_entropy(ydev_en, ydev_b_ada)

print(f"Cross Entropy loss for Simple ADA: {cross_entropy_loss1}")
print(f"Cross Entropy loss for Cross-Val ADA: {cross_entropy_loss2}")

Cross Entropy loss for Simple ADA: 2.191959877410625
Cross Entropy loss for Cross-Val ADA: 2.1753044606631042


<h2 align='center'> Text - Gaussian Naive Bayes (NB) </h2>

In [135]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, cross_val_score
import optuna

def objective(trial, X, y):
    # Hyperparameters to optimize
    params = {
        'var_smoothing': trial.suggest_float('var_smoothing', 1e-10, 1e-5, log=True)
    }

    # Naive Bayes model with hyperparameters
    model = GaussianNB(**params)

    # Cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, Xtrain_tf.toarray(), ytrain),
               n_trials=1000)

best_params = study.best_params  # Best params


[I 2024-04-24 01:15:49,468] A new study created in memory with name: no-name-a3827f7a-1cf8-4b80-8ede-8436cd7ca150
[I 2024-04-24 01:15:49,917] Trial 0 finished with value: 0.36974169741697416 and parameters: {'var_smoothing': 5.095382719328233e-07}. Best is trial 0 with value: 0.36974169741697416.
[I 2024-04-24 01:15:50,328] Trial 1 finished with value: 0.3660516605166052 and parameters: {'var_smoothing': 1.408467950027675e-10}. Best is trial 0 with value: 0.36974169741697416.
[I 2024-04-24 01:15:50,778] Trial 2 finished with value: 0.37564575645756454 and parameters: {'var_smoothing': 2.007933629715741e-06}. Best is trial 2 with value: 0.37564575645756454.
[I 2024-04-24 01:15:51,210] Trial 3 finished with value: 0.3675276752767528 and parameters: {'var_smoothing': 1.2675361456706667e-07}. Best is trial 2 with value: 0.37564575645756454.
[I 2024-04-24 01:15:51,659] Trial 4 finished with value: 0.36974169741697416 and parameters: {'var_smoothing': 2.559907414839985e-07}. Best is trial 2 

In [137]:
# simple ADA training && prediction(dev)
nb = GaussianNB()
nb.fit(Xtrain_tf.toarray(), ytrain.astype(int))
ydev_nb= nb.predict_proba(Xdev_tf.toarray())
ytest_nb = nb.predict_proba(Xtest_tf.toarray())

# cross-val ADA training && prediction (dev & test)
best_nb = GaussianNB(**best_params)
best_nb.fit(Xtrain_tf.toarray(), ytrain.astype(int))

ydev_b_nb = best_nb.predict_proba(Xdev_tf.toarray()) # predict on dev
ytest_b_nb = best_nb.predict_proba(Xtest_tf.toarray()) # predict on test



cross_entropy_loss1 = multiclass_cross_entropy(ydev_en, ydev_nb)
cross_entropy_loss2 = multiclass_cross_entropy(ydev_en, ydev_b_nb)

print(f"Cross Entropy loss for Simple ADA: {cross_entropy_loss1}")
print(f"Cross Entropy loss for Cross-Val ADA: {cross_entropy_loss2}")

Cross Entropy loss for Simple ADA: 23.33082908795291
Cross Entropy loss for Cross-Val ADA: 21.04349290285949


#### ==================================== NEURAL NETS =====================================================

In [28]:
import tensorflow as tf
import keras
from keras.layers import InputLayer
from keras.layers import Dropout, Dense, Input,BatchNormalization
from keras.models import Sequential
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from keras.metrics import CategoricalCrossentropy
import keras_tuner as kt
from tensorflow.keras.callbacks import EarlyStopping

2024-04-23 22:31:29.770907: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-23 22:31:29.890992: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-23 22:31:30.256960: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/meizeus/miniconda3/envs/tf_env/lib/
2024-04-23 22:31:30.258048: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_

#### Mini dataset for node graph

In [61]:
Xtrain.info()
Xtrain_g = Xtrain.drop(['domain_name','text','tokens'], axis=1)
Xdev_g = Xdev.drop(['domain_name','text','tokens'], axis=1)
Xtest_g = tests.drop(['domain_name','text'], axis=1)

<class 'pandas.core.frame.DataFrame'>
Index: 1355 entries, 203 to 1126
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   domain_name   1355 non-null   object 
 1   in_deg        1355 non-null   float64
 2   out_deg       1355 non-null   float64
 3   avg_neig_deg  1355 non-null   float64
 4   text          1355 non-null   object 
 5   tokens        1355 non-null   object 
dtypes: float64(3), object(3)
memory usage: 106.4+ KB


#### Hyperparameter tuning using Keras Tuner library

In [36]:
from sklearn.metrics import f1_score, recall_score, precision_score

class Metrics(Callback):
    def __init__(self, valid_data):
        super(Metrics, self).__init__()
        self.validation_data = valid_data

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        val_predict_proba = self.model.predict(self.validation_data[0])
        val_predict = (val_predict_proba > 0.5).astype(int)  # convert probabilities to classes

        val_targ = self.validation_data[1]

        _val_f1 = f1_score(val_targ, val_predict, average="binary")
        _val_recall = recall_score(val_targ, val_predict, average="binary")
        _val_precision = precision_score(val_targ, val_predict, average="binary")

        logs['val_f1'] = _val_f1
        logs['val_recall'] = _val_recall
        logs['val_precision'] = _val_precision
        print(" — val_f1: %.3f — val_precision: %.3f — val_recall: %.3f" % (_val_f1, _val_precision, _val_recall))


#### KERAS TUNER

In [44]:
def build_model(hp):
    model = Sequential()

    layer_index = 0
    for i in range(hp.Int(name='num_layers',min_value=1,max_value=5)):
        if layer_index == 0:
            model.add(Dense(hp.Int(name='hidden_units_'+str(i),min_value=128,max_value=1000,step=64),
                            activation=hp.Choice(name='activation_layer'+str(i),values=['relu','tanh','sigmoid']),
                            input_dim=Xtrain_g.shape[1]
                           ))
            model.add(Dropout(hp.Choice(name='dropout_layer_'+str(i),values=[0.1,0.2,0.3,0.4,0.5])))
        else:
            model.add(Dense(hp.Int(name='hidden_units_'+str(i),min_value=128,max_value=512,step=64),
                            activation=hp.Choice(name='activation_layer'+str(i),values=['relu','tanh'])))
            model.add(Dropout(hp.Choice(name='dropout_layer_'+str(i),values=[0.1,0.2,0.3,0.4,0.5])))

        layer_index += 1

    # add last layer that produces the logits
    model.add(Dense(9,  activation='softmax'))

    # tune the learning rate for the optimizer
    # choose an optimal value from 0.01, 0.001, or 0.0001
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4])
    model.compile(loss='categorical_crossentropy',optimizer=Adam(learning_rate=hp_learning_rate),
                  metrics=[CategoricalCrossentropy()])

    return model

# begin the tuning
tuner = kt.RandomSearch(build_model,
                        objective=kt.Objective('val_categorical_crossentropy',
                                               direction='min'),
                        max_trials=50,
                        directory='../../../../../data_challenge/new_KT_dir',
                        project_name='KT_tuning')
early_stopping = EarlyStopping(
    monitor='val_loss', patience=7)
tuner.search_space_summary()

tuner.search(Xtrain_g, ytrain_en,
             validation_data=(Xdev_g, ydev_en), epochs=1000, batch_size = 64,
             callbacks=[early_stopping])

Trial 50 Complete [00h 00m 14s]
val_categorical_crossentropy: 1.992177963256836

Best val_categorical_crossentropy So Far: 1.8619556427001953
Total elapsed time: 00h 17m 40s


#### Get the five best MLPs (trained on graph-data)

In [62]:
# after the end of the tuning get the best models parameters
tuner.results_summary()
# tuner.get_best_hyperparameters()[0].values

# get the 4 best models evaluated by the tuner
mlp1 = tuner.get_best_models(num_models=5)[0]
mlp2 = tuner.get_best_models(num_models=5)[1]
mlp3 = tuner.get_best_models(num_models=5)[2]
mlp4 = tuner.get_best_models(num_models=5)[3]
mlp5 = tuner.get_best_models(num_models=5)[4]


# predict test graph
mlptest1 = mlp1.predict(Xtest_g) ;
mlptest2 = mlp2.predict(Xtest_g) ;
mlptest3 = mlp3.predict(Xtest_g) ;
mlptest4 = mlp4.predict(Xtest_g) ;
mlptest5 = mlp5.predict(Xtest_g) ;


# predict dev graph
mlpdev1 = mlp1.predict(Xdev_g) ;
mlpdev2 = mlp2.predict(Xdev_g) ;
mlpdev3 = mlp3.predict(Xdev_g) ;
mlpdev4 = mlp4.predict(Xdev_g) ;
mlpdev5 = mlp5.predict(Xdev_g) ;


Results summary
Results in ../../../../../data_challenge/new_KT_dir/KT_tuning
Showing 10 best trials
Objective(name="val_categorical_crossentropy", direction="min")

Trial 27 summary
Hyperparameters:
num_layers: 4
hidden_units_0: 768
activation_layer0: sigmoid
dropout_layer_0: 0.4
learning_rate: 0.0001
hidden_units_1: 448
activation_layer1: tanh
dropout_layer_1: 0.4
hidden_units_2: 512
activation_layer2: tanh
dropout_layer_2: 0.5
hidden_units_3: 192
activation_layer3: relu
dropout_layer_3: 0.3
hidden_units_4: 128
activation_layer4: relu
dropout_layer_4: 0.2
Score: 1.8619556427001953

Trial 10 summary
Hyperparameters:
num_layers: 4
hidden_units_0: 384
activation_layer0: sigmoid
dropout_layer_0: 0.4
learning_rate: 0.001
hidden_units_1: 384
activation_layer1: relu
dropout_layer_1: 0.2
hidden_units_2: 256
activation_layer2: relu
dropout_layer_2: 0.1
hidden_units_3: 448
activation_layer3: tanh
dropout_layer_3: 0.3
hidden_units_4: 384
activation_layer4: tanh
dropout_layer_4: 0.4
Score: 1.864

### ENSEMBLES - (soft voting) graph+text preds

In [55]:
ytest_lrcv.shape,ytest_b_svc.shape,ytest_svc.shape,mlptest1.shape,mlptest2.shape,mlptest3.shape, mlptest4.shape,mlptest5.shape

((605, 9),
 (605, 9),
 (605, 9),
 (151, 9),
 (151, 9),
 (151, 9),
 (151, 9),
 (151, 9))

In [138]:
# -- combine probabilities of all the models we want
probs_devg = np.stack([ydev_lrcv,
                       ydev_svc,ydev_b_svc,
                       ydev_b_xgb,ydev_xgb,
                       ydev_b_nb,mlpdev1,mlpdev2
                       ], axis=2)

avg_probs_devg = np.mean(probs_devg, axis=2) # average probability for each class all clfs
avg_probs_devg.shape

(151, 9)

In [139]:
ensemble_cross_en = multiclass_cross_entropy(ydev_en, avg_probs_devg)
ensemble_cross_en

1.1992686349915866

In [129]:
### ----- Ensebles prediction on test ----------

all_probs_stacked = np.stack([ytest_lrcv,
                       ytest_svc,ytest_b_svc,
                       ytest_b_xgb,ytest_xgb,ytest_b_nb], axis=2)

# Calculate the average probability for each class across all classifiers
avg_probs_test = np.mean(all_probs_stacked, axis=2)

avg_probs_test.shape

(605, 9)

In [None]:

all_probs_stacked = np.stack([ytest_lrcv,ytest_rfc,ytest_b_svc, ytest_b_xgb], axis=2)

# Calculate the average probability for each class across all classifiers
average_probs = np.mean(all_probs_stacked, axis=2)

average_probs.shape

(605, 9)

In [None]:
# write predictions to a file
with open('../../../../../text_sample_new_submission.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = list()
    for i in range(9):
        lst.append('class_'+str(i))
    lst.insert(0, "domain_name")
    writer.writerow(lst)
    for i,test_domain in enumerate(test_domains):
        lst = average_probs[i,:].tolist()
        # print(lst)
        lst.insert(0, test_domain)
        writer.writerow(lst)