I have arranged the notebook in the following way:
1. Install necessary libraries and import them.
2. Define some functions for EDA
3. Preprocess text and do the EDA
4. Load Embeddings and check coverage
5. Build graphs for each sample
6. Define Dataset and Models
7. Define functions for training and validating
8. Make stratified splits and train models on different splits. Log results in real time using wandb
9. Plot the evaluation curves
10. Make predictions on test set by blending the predictions of saved models


I tried improving the method at 3 levels. 
1. **Input level**: Trained a model on unweighted and weighted graphs. Weighted graphs gave better results. So, continued with them.
2. **Model level**: Implemented Graph Convolutional NN and Graph Attention NN with different readout functions like maxpooling, avgpooling and combination of them. Graph Attention NN with dropout and avgpooling as the readout function gave best metric scores across all folds
3. **Output level**: Used weighted and unweighted Cross Entropy Loss. Weights were assigned to each class as the inverse of their frequency. Unweighted gave better results


**Link for comparison between different methods**: https://wandb.ai/vbagal/gnn?workspace=user-vbagal


**1. Average of max validation f1 over all folds:  0.609**
**2. Average of max validation auc over all folds:  0.962**
**3. Average of max validation mrr over all folds:  0.734**

# Install Libraries

In [None]:
# openpyxl: presumably installed as the engine behind the pd.read_excel calls on the .xlsx files below
!pip install openpyxl
# Pinned pre-release build of DGL, which the graph dataset and model imports rely on
!pip install dgl==0.6a210210

# Import Libraries

In [None]:
import dgl
from dgl.dataloading.pytorch import GraphDataLoader
from dgl.data import DGLDataset
from dgl.nn.pytorch import GraphConv, GATConv, GatedGraphConv, DotGatConv
from dgl.nn import AvgPooling, MaxPooling

import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

import operator 

import pandas as pd
import pickle

import string
import sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
import scipy.sparse as sp

from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn import CrossEntropyLoss
import torch.nn as nn


from wordcloud import WordCloud
import wandb
import warnings


# Silence library warnings to keep the notebook output readable
warnings.filterwarnings('ignore')

# Global seaborn styling for every plot in this notebook
sns.set_theme()
sns.set_context("talk")

# Import the corpus reader under an alias: the name `stopwords` is rebound to a set
# below, and without the alias this cell would fail when it is executed a second time
# (a set has no `.words` method).
from nltk.corpus import stopwords as nltk_stopwords
stopwords=set(nltk_stopwords.words('english'))
lemmatizer=WordNetLemmatizer()

# Seed passed to WordCloud, TruncatedSVD and TSNE in the EDA helpers
seed = 42

# EDA Functions

In [None]:
def get_label_dist(df, top_k = 10, reverse = False):
    """
    Prints the id -> label mapping of the selected labels and draws a barplot of their normalized counts

        Parameters:
            df (pandas DataFrame): pandas DataFrame with 'target' as the label column
            top_k (int): number of labels to show, ranked according to their count. Default 10
            reverse (bool): show the top_k least prevalent labels instead of the most prevalent ones. Default False
            
        Returns:
            Nothing
    """

    freq = df['target'].value_counts(normalize = True)

    # value_counts orders by descending frequency: the head holds the most
    # common labels, the tail the rarest ones
    if reverse:
        chosen = slice(-top_k, None)
        title = f'Distribution of normalized count of bottom {top_k} labels'
    else:
        chosen = slice(None, top_k)
        title = f'Distribution of normalized count of top {top_k} labels'

    shown_ids = freq.index[chosen]

    # idx2label is the notebook-level id -> tag dictionary
    for label_id in shown_ids:
        print(f'{label_id} : {idx2label[label_id]}')

    plt.figure(figsize = (20,6))
    sns.barplot(x = [str(label_id) for label_id in shown_ids], y = freq.values[chosen], palette = 'Blues_r')
    plt.title(title)
    plt.xticks()
    
    
    
def filter_text(text):

    """
    Lowercases the input text and drops punctuation tokens and English stopwords

        Parameters:
            text (str): input string
            
        Returns:
            string made of the kept tokens joined by spaces
    """

    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # skip tokens found in the punctuation string or in the stopword set
        if lowered in string.punctuation or lowered in stopwords:
            continue
        # surrounding whitespace and periods are removed from the kept token
        kept.append(token.strip().strip('.').lower())

    # Lemmatizing the tokens was tried too, but the overall 5-fold average of every
    # metric was lower than on non-lemmatized text, so tokens are returned as they are.
    return ' '.join(kept)


def remove_words_nums(text):

    """
    Drops month names, the word 'company' and purely numeric tokens from the input text

        Parameters:
            text (str): input string, split on whitespace
            
        Returns:
            string with the remaining tokens joined by single spaces
    """

    blocked = ('january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
               'september', 'october', 'november', 'december', 'company')

    return ' '.join(tok for tok in text.split() if not (tok.isnumeric() or tok in blocked))


def plot_textlen_dist(df, label_list, unique = True):
    """
    Overlays, for every requested label, a normalized histogram of the number of words per sample

        Parameters:
            df (pandas DataFrame): pandas DataFrame with 'target' as the label column and the
                                   'text_len' / 'unique_text_len' columns
            label_list (list): list of labels to plot
            unique (bool): whether to count only unique words. Default True
            
        Returns:
            Nothing
    """

    assert isinstance(label_list, list), 'label_list must be a list'
    assert all(x in labels for x in label_list), f"The industry must belong to one of {', '.join(labels)}"

    # pick the column to plot together with its matching title
    col, title = (
        ('unique_text_len', 'Distribution of number of unique words in description')
        if unique else
        ('text_len', 'Distribution of number of words in description')
    )

    plt.figure(figsize = (10,6))

    for industry in label_list:
        lengths = df.loc[df['target'] == label2idx[industry], col]
        sns.distplot(lengths, label = industry, kde = False, norm_hist = True)
        plt.legend()

    plt.title(title)
    plt.xlabel('Text Length')
    

def plot_wordcloud(text, label, max_font_size=40, max_words=50):

    """
    Draws a wordcloud of the input text

        Parameters:
            text (str): input string
            label (str): industry tag. Used in title
            max_font_size (int): maximum font size of the most frequent word. Default 40
            max_words (int): maximum words to show in the wordcloud. Default 50
            
        Returns:
            Nothing
    """

    plt.figure(figsize=(10,5))

    # random_state is fixed to the notebook seed so the layout is repeatable
    cloud = WordCloud(
        max_font_size=max_font_size,
        max_words=max_words,
        random_state=seed,
    ).generate(text)

    plt.title('Industry Tag: ' + label, fontsize = 25)
    plt.imshow(cloud)
    plt.axis('off')
    plt.show()


def get_text(traindf, industry):

    """
    Returns string containing all text from input industry tag

        Parameters:
            traindf (pandas DataFrame): pandas DataFrame with 'target' as the label column and 'text' as the description column
            industry (str): industry tag for using text
            
        Returns:
            string: a leading space followed by the descriptions of that tag, separated by single spaces
    """

    assert industry in labels, f"The industry must belong to one of {', '.join(labels)}"

    # Filter on the `industry` argument. Reading a notebook-level variable here would
    # silently return the text of whatever tag that variable happens to hold.
    selected = traindf.loc[traindf['target'] == label2idx[industry], 'text']

    # Join with a separator so the last word of one description is not fused with the
    # first word of the next one. The leading space is kept for backward compatibility.
    return ' ' + ' '.join(selected)


def plot_top_ngrams(traindf, industry, vectorizer = 'count', ngrams=(1,1),top=10,max_features=10000):

    """
    Draws a barplot of the highest scoring ngrams in the descriptions of the given industry.
    The score of an ngram is its column sum in the vectorized text (counts for 'count', weights for 'tfidf').

        Parameters:
            traindf (pandas DataFrame): pandas DataFrame with 'target' as the label column and 'text' as the description column
            industry (str): industry tag to plot
            vectorizer (str): which vectorizer to use. Can be one of 'count' and 'tfidf'. Default 'count'
            ngrams (tuple): ngram range to consider. Default (1,1)
            top (int): how many top ngrams to show in the plot. Default 10
            max_features (int): maximum ngrams to form. Default 10000
            
        Returns:
            Nothing
    """

    assert vectorizer in ['count', 'tfidf'], "vectorizer must be one of 'count' and 'tfidf'"
    assert industry in labels, f"The industry must belong to one of {', '.join(labels)}"

    docs = traindf.loc[traindf['target'] == label2idx[industry], 'text']

    vectorizer_cls = {'count': CountVectorizer, 'tfidf': TfidfVectorizer}[vectorizer]
    model = vectorizer_cls(ngram_range=ngrams, max_features=max_features)

    # one total per vocabulary column
    totals = model.fit_transform(docs).sum(axis=0)

    scored = [(term, totals[0, column]) for term, column in model.vocabulary_.items()]
    best = sorted(scored, key=lambda pair: pair[1], reverse=True)[:top]

    top_words = [term for term, _ in best]
    counts = [score for _, score in best]

    plt.figure(figsize=(20,6))
    sns.barplot(x = top_words, y = counts, palette = 'Blues_r')
    plt.title('Industry Tag: ' + industry)
    plt.xticks(rotation = 45)


def get_tsne_array(traindf, vectorizer = 'count', ngrams = (1,3)):

    """
    Projects every description onto two t-SNE dimensions.
    The text is vectorized with a 'count' or 'tfidf' vectorizer, reduced to 50 columns with
    TruncatedSVD and finally reduced to 2 columns with TSNE.

        Parameters:
            traindf (pandas DataFrame): pandas DataFrame with 'text' as the description column
            vectorizer (str): which vectorizer to use. Can be one of 'count' and 'tfidf'. Default 'count'
            ngrams (tuple): ngram range to consider. Default (1,3)

        Returns:
            Array of shape (len(traindf), 2)
    """

    assert vectorizer in ['count', 'tfidf'], "vectorizer must be one of 'count' and 'tfidf'"

    vectorizer_cls = {'count': CountVectorizer, 'tfidf': TfidfVectorizer}[vectorizer]
    sparse_features = vectorizer_cls(ngram_range=ngrams, analyzer='word').fit_transform(traindf['text'].values)

    # both reducers use the notebook seed so the projection is repeatable
    reduced = TruncatedSVD(n_components=50, random_state=seed).fit_transform(sparse_features)
    return TSNE(n_components=2, random_state=seed).fit_transform(reduced)


def plot_scatter(train_X_tsne, traindf, label_list):
    
    """
    Plots scatterplot of the first two features of the train_X_tsne array for the given labels. 

        Parameters:
            train_X_tsne (array): Array of shape (len(traindf), n) where n >= 2; row i is assumed
                                  to belong to the i-th row of traindf
            traindf (pandas DataFrame): pandas DataFrame with 'target' as the label column
            label_list (list): list of labels to plot
            
        Returns:
            Nothing
    """
    
    assert isinstance(label_list, list), 'label_list must be a list'
    assert all(x in labels for x in label_list), f"The industry must belong to one of {', '.join(labels)}"

    target_ids = [label2idx[x] for x in label_list]

    # Positional boolean mask: selecting array rows by position keeps working even when
    # the DataFrame index is not 0..n-1 (index labels are not array positions).
    mask = traindf['target'].isin(target_ids).values

    df = pd.DataFrame({
        'tsne1': train_X_tsne[mask, 0],
        'tsne2': train_X_tsne[mask, 1],
        'target': traindf['target'].values[mask],
    })

    fig, ax = plt.subplots(figsize=(15,6))
    # x and y are passed by keyword: newer seaborn releases no longer accept them positionally
    sns.scatterplot(x=df['tsne1'], y=df['tsne2'], hue=df['target'], ax = ax, palette = 'tab10')
    # swap the numeric ids shown in the legend for the industry names
    move_legend(ax, idx2label)
    plt.xlabel('TSNE Axis 1')
    plt.ylabel('TSNE Axis 2')
    plt.show()
    
    
def move_legend(ax, idx2label):

    """
    Rebuilds the legend of a plot, replacing the numeric ids in its entries by label names

        Parameters:
            ax (Axes): Axes object that already has a legend whose entries are integer ids
            idx2label (dict): dictionary with ids as keys and labels as values
            
        Returns:
            Nothing
    """

    old_legend = ax.legend_

    # Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and later removed the
    # old attribute, so prefer the new name and fall back for older releases.
    if hasattr(old_legend, 'legend_handles'):
        handles = old_legend.legend_handles
    else:
        handles = old_legend.legendHandles

    # legend texts hold the ids as strings
    names = [idx2label[int(t.get_text())] for t in old_legend.get_texts()]

    title = old_legend.get_title().get_text()
    ax.legend(handles, names, title=title)

# Exploratory Data Analysis

Here I load the train dataset and remove rows with na values

In [None]:
# Read the training sheet and give its three columns short names
traindf = pd.read_excel('../input/hr-string/train.xlsx')
traindf.columns = ['company', 'text', 'target']


# Report the per-column NA counts, drop every row containing an NA, then report again.
# The index is reset so that it runs 0..n-1 after the rows are removed.
print('Checking and removing rows with na values in the dataset \n')
print(traindf.isna().sum())
traindf = traindf.dropna(axis = 0).reset_index(drop = True)
print('*'*50)
print('After processing \n')
print(traindf.isna().sum())

In [None]:
# Display the first rows of the cleaned training frame
traindf.head()

Now, I create mapping from labels to ids and ids to labels. This will facilitate easy conversion from one space to other. Then, I convert all labels to ids. We see that there are 62 unique industry tags.

In [None]:
labels = traindf['target'].unique()
print('Number of unique industry tags: ', len(labels))

# ids are assigned in the sorted order of the tag names
idx2label = dict(enumerate(sorted(labels)))
label2idx = {tag: tag_id for tag_id, tag in idx2label.items()}

# replace every tag in the frame by its id
traindf['target'] = traindf['target'].map(label2idx)

Now let's check the distribution of the label counts. For that, I plot the normalized count of the top 10 and bottom 10 labels. The `top_k` argument sets how many industry tags are plotted, and the `reverse` argument switches between the `k` most prevalent tags (`reverse = False`) and the `k` least prevalent ones (`reverse = True`). The printed values give the id-to-label mapping for the 10 plotted labels. The x-axis ticks are ids because the label names are long and would clutter the plot.

From this plot, we see that 'Regional Banks' and 'Real Estate Operating Companies' are the most and least common tags constituting around 9% and 0.8% of total labels respectively. So, the data is highly imbalanced. While training our model, using weighted loss function with inverse of class frequency as weights might help. 

In [None]:
# Ten most frequent labels
get_label_dist(traindf, top_k = 10, reverse = False)

In [None]:
# Ten least frequent labels
get_label_dist(traindf, top_k = 10, reverse = True)

Now, I filter the text by lowercasing it and removing stopwords and punctuation. Lemmatization was also tried, but it is disabled because it lowered the 5-fold average scores. Then I calculate the number of words and the number of unique words in each text. 

In [None]:
# clean every description, then derive the two length columns from its tokens
traindf['text'] = traindf['text'].apply(filter_text)
traindf['text_len'] = traindf['text'].apply(lambda cleaned: len(cleaned.split()))
traindf['unique_text_len'] = traindf['text'].apply(lambda cleaned: len(set(cleaned.split())))

I check the maximum and minimum of the labelwise mean text length and their industry tags. I use these tags to plot their distributions just to check if there is large difference in them. 

In [None]:
# Mean text length per label id, computed once and reused by the four prints
mean_len_by_target = traindf.groupby(by='target')['text_len'].mean()

# idxmax/idxmin return the label id itself. argmax/argmin return a position, which
# only equals the label id while the group index happens to be exactly 0..n-1.
print('Max of mean text length: ', mean_len_by_target.max())
print('Tag of max of mean text length: ', idx2label[mean_len_by_target.idxmax()])

print('Min of mean text length: ', mean_len_by_target.min())
print('Tag of min of mean text length: ', idx2label[mean_len_by_target.idxmin()])

Here, we see that 135 is the max text length. I think we can use the whole text for building graphs as 135 is not that big. But, if it becomes computationally heavy to train with max text length of 135, we can truncate down to around 95 which will include around 75% of full length samples. 

In [None]:
# Summary statistics of the number of words per description
print(traindf['text_len'].describe())

I plot labelwise normalized distribution of number of words and number of unique words in the next two plots. Even though the 'Oil & Gas Storage & Transportation' has longer texts on average than 'Asset Management & Custody Banks', the average of number of unique words is more or less same. 

In [None]:
# Word-count histograms (all words) for the two compared tags
plot_textlen_dist(traindf, ['Oil & Gas Storage & Transportation', 'Asset Management & Custody Banks'], unique = False)

In [None]:
# Same two tags, counting only unique words
plot_textlen_dist(traindf, ['Oil & Gas Storage & Transportation', 'Asset Management & Custody Banks'], unique = True)

Now, I plot the distribution of labelwise mean of text length and unique text length. We see that most of the labels have mean text length around 82.

In [None]:
plt.figure(figsize = (20,6))

# one panel per length column: (column, panel title)
panels = [
    ('text_len', 'Distribution of labelwise mean of text length'),
    ('unique_text_len', 'Distribution of labelwise mean of unique text length'),
]

for position, (column, panel_title) in enumerate(panels, start = 1):
    plt.subplot(1,2,position)
    sns.kdeplot(traindf.groupby(by='target')[column].mean())
    plt.xlabel('Text Length')
    plt.title(panel_title)

plt.show()

Here, I try to see which words are labelwise most common. In case of biotechnology, words like product, candidate, company, therapeutic, cancer, clinical trial, etc. are most common. While choosing pre-trained embeddings, we should try to maximize the overlap between the words in text and those in the embedding matrix. Jargon words like receptor, inhibitor, therapeutic might not be present in the embedding matrix.

In [None]:
# One wordcloud per tag. `label` remains a notebook-level name, as in the unrolled version.
for label in ['Biotechnology', 'Advertising', 'Casinos & Gaming',
              'Health Care Facilities', 'Movies & Entertainment', 'Restaurants']:
    target_text = get_text(traindf, label)
    plot_wordcloud(target_text, label)

Now I check the count of the top_k ngrams for some industry tags. This allows us to spot co-occurrences. While building the graph, an edge will be created between co-occurring words. You can use either the Tfidf or the Count vectorizer and also visualize different ngrams. 

We see that the names of months and dates occur many times. Also, the word 'company' occurs many times. I don't think these tokens provide much info. So, we will remove it with further processing.

In [None]:
# Top 20 tf-idf bigrams for each of the inspected tags
for label in ['Biotechnology', 'Advertising', 'Casinos & Gaming',
              'Health Care Facilities', 'Movies & Entertainment', 'Restaurants']:
    plot_top_ngrams(traindf, label, vectorizer = 'tfidf', ngrams = (2,2), top = 20)

In [None]:
# Drop month names, the word 'company' and numeric tokens from every description
traindf['text'] = traindf['text'].apply(remove_words_nums)

Now we definitely see more informative co-occurrences than before, thanks to our preprocessing. 

In [None]:
# Same bigram plots as before, now on the text without months, 'company' and numbers
for label in ['Biotechnology', 'Advertising', 'Casinos & Gaming',
              'Health Care Facilities', 'Movies & Entertainment', 'Restaurants']:
    plot_top_ngrams(traindf, label, vectorizer = 'tfidf', ngrams = (2,2), top = 20)

I plot the wordcloud again

In [None]:
# Wordclouds again, on the further cleaned text. `label` remains a notebook-level name.
for label in ['Biotechnology', 'Advertising', 'Casinos & Gaming',
              'Health Care Facilities', 'Movies & Entertainment', 'Restaurants']:
    target_text = get_text(traindf, label)
    plot_wordcloud(target_text, label)

Now I apply tfidf or count vectorizer over the text and then apply TruncatedSVD first and then TSNE to get two feature columns per sample. Then, I plot the data points labelwise. TruncatedSVD is faster than TSNE, that is why, I first applied TruncatedSVD to reduce the number of columns obtained from either tfidf or count vectorizer. Intuitively, points belonging to similar tags should lie close to each other and those having very different tags should lie far. We see the same behaviour in the below plots. 

In [None]:
# 2-column t-SNE projection of the tf-idf features (default ngram range of get_tsne_array)
train_tsne = get_tsne_array(traindf, vectorizer = 'tfidf')

We can clearly see 3 different clusters because the 3 industries are very different. We find some intermingling between the 'Advertising' points and 'Consumer Finance' points. Such samples might be difficult to classify.

In [None]:
# each entry groups 3 very different industry tags
distinct_tag_groups = [
    ['Advertising', 'Consumer Finance', 'Pharmaceuticals'],
    ['Casinos & Gaming', 'Life Sciences Tools & Services', 'Regional Banks'],
    ['Construction & Engineering', 'Hotels, Resorts & Cruise Lines', 'Semiconductors'],
]

for label_list in distinct_tag_groups:
    plot_scatter(train_tsne, traindf, label_list)

We see that the clusters overlap because the 3 industries are very similar. We also spot that some points lie very far away from clusters. Such points might be difficult to correctly classify.

In [None]:
# each entry groups 3 closely related industry tags
related_tag_groups = [
    ['Health Care Equipment', 'Health Care Facilities', 'Health Care Technology'],
    ['Electric Utilities', 'Electronic Equipment & Instruments', 'Electrical Components & Equipment'],
    ['Oil & Gas Equipment & Services', 'Oil & Gas Exploration & Production', 'Oil & Gas Refining & Marketing'],
]

for label_list in related_tag_groups:
    plot_scatter(train_tsne, traindf, label_list)

# Preprocessing for Glove Embeddings

In [None]:
def load_embeddings(path):
    """
    Loads a pickled object (the embeddings) from disk

        Parameters:
            path (str): path of the pickle file
            
        Returns:
            the unpickled object
    """
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def check_coverage(vocab,embeddings_index):
    """
    Prints the share of the vocabulary and of the running text that has an embedding, and
    returns list of tuples. The first element of each tuple specifies the word present in the
    description but not in the embeddings and the second element specifies the count of that
    word in the descriptions. The tuples are sorted in the descending order of their count. 

        Parameters:
            vocab (dict): Dictionary with keys as words and values as their count of occurence
            embeddings_index (dict): Dictionary with keys as words and values as their embeddings
            
        Returns:
            List of tuples (empty when vocab is empty or fully covered)
    """
    oov = {}
    covered_words = 0   # number of distinct words with an embedding
    covered_count = 0   # occurrences of covered words
    missing_count = 0   # occurrences of uncovered words

    for word in tqdm(vocab):
        # Explicit membership test instead of a bare `except:`, so unrelated errors
        # (including KeyboardInterrupt) are no longer swallowed.
        if word in embeddings_index:
            covered_words += 1
            covered_count += vocab[word]
        else:
            oov[word] = vocab[word]
            missing_count += vocab[word]

    # Guard both ratios so an empty vocabulary reports 0% instead of dividing by zero
    total_count = covered_count + missing_count
    vocab_ratio = covered_words / len(vocab) if vocab else 0.0
    text_ratio = covered_count / total_count if total_count else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embeddings for  {:.2%} of all text'.format(text_ratio))

    # ascending sort followed by a reversal (keeps the original ordering of ties)
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

def build_vocab(sentences, verbose =  True):
    """
    Counts how often every word occurs in the given tokenized sentences

        Parameters:
            sentences (list of list): List of descriptions, each one a list of words
            verbose (bool): whether to show the progress bar
            
        Returns:
            dictionary with words as keys and their count of occurence as values
    """
    counts = {}
    for tokens in tqdm(sentences, disable = (not verbose)):
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts



Load GloVe embeddings of 50 dimensions

In [None]:
GLOVE_EMBEDDING_PATH = '../input/glove6b50dtxt/glove.6B.50d.txt'



# word -> embedding vector (list of floats)
word_embeddings = {}

# The encoding is given explicitly so reading does not depend on the platform default,
# and the file is streamed line by line instead of being loaded whole by readlines().
with open(GLOVE_EMBEDDING_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        data = line.split()
        if not data:
            # skip blank lines instead of failing on data[0]
            continue
        # first field is the word, the remaining fields are the vector components
        word_embeddings[data[0]] = list(map(float,data[1:]))

Check the overlap between the words in the GloVe embeddings and the words in the descriptions. We see that only around 65% of the unique words in the descriptions have GloVe embeddings. But those 65% of words constitute around 95% of the total words in the descriptions. Among the out-of-vocabulary (oov) words, we see that many contain a hyphen. Let us split those on the hyphen and check the overlap again. 

In [None]:
# word -> occurrence count over the current training descriptions
vocab = build_vocab(list(traindf['text'].apply(lambda x:x.split())))
# (word, count) pairs of the words without a GloVe vector, most frequent first
oov = check_coverage(vocab,word_embeddings)
# show the ten most frequent uncovered words
oov[:10]

In [None]:
# Replace every hyphen by a space so hyphenated compounds become separate words
traindf['text'] = traindf['text'].apply(lambda x: ' '.join(x.split('-')))

Now the coverage is around 71%. That is a significant improvement! This will allow us to use pre-trained embeddings for more words in the text. I don't see a straightforward way of increasing this coverage further. Now, lets build a graph. 

In [None]:
# Recompute the word counts and the coverage on the hyphen-split text.
# NOTE: the names `vocab` and `oov` are rebound to different objects (a list and a dict
# of random vectors) in the graph-building cell further down.
vocab = build_vocab(list(traindf['text'].apply(lambda x:x.split())))
oov = check_coverage(vocab,word_embeddings)
oov[:10]

# Build graphs

Following Zhang et al., 'Every Document Owns Its Structure: Inductive Text Classification via Graph Neural Networks', ACL 2020, I construct per sample graph by representing unique words as vertices and co-occurrences between words as edges. The co-occurrences describe the relationship of words that occur within a fixed size sliding window (length 3 at default) and they are undirected in the graph

In [None]:
def build_graph(start, end, truncate = False, weighted_graph = True):
    """
    Builds one word co-occurrence graph per document and returns list of adjacency matrix
    and list of node matrix.

    Every unique word of a document is a node. Two different words are connected when they
    appear together inside a sliding window of `window_size` consecutive words; the edge is
    stored in both directions.

    Reads the notebook-level names shuffle_doc_words_list (documents), window_size,
    word_embeddings (pre-trained vectors) and oov (fallback vectors).

        Parameters:
            start (int): start index of list
            end (int): end index of list (exclusive)
            truncate (bool): whether to truncate the text to MAX_TRUNC_LEN words
            weighted_graph (bool): whether to use word pair count as the weight in adjacency matrix or just 1.0
            
        Returns:
            list of adjacency matrices (scipy csr, one per document),
            list of node matrices (one embedding per node, in node-id order)
    """
    x_adj = []
    x_feature = []

    for i in tqdm(range(start, end)):

        doc_words = shuffle_doc_words_list[i].split()
        if truncate:
            # NOTE(review): MAX_TRUNC_LEN is not defined in the visible part of the
            # notebook; it must exist at notebook level before calling with truncate=True.
            doc_words = doc_words[:MAX_TRUNC_LEN]
        doc_len = len(doc_words)

        # node ids are local to the document: position of the word in doc_vocab
        doc_vocab = list(set(doc_words))
        doc_nodes = len(doc_vocab)
        doc_word_id_map = {word: j for j, word in enumerate(doc_vocab)}

        # sliding windows; a document shorter than the window is a single window
        if doc_len <= window_size:
            windows = [doc_words]
        else:
            windows = [doc_words[j: j + window_size] for j in range(doc_len - window_size + 1)]

        # Count co-occurrences directly on the document-local ids. Going through a global
        # word -> id table and back yields the same pairs, so that round trip is dropped.
        word_pair_count = {}
        for window in windows:
            for p in range(1, len(window)):
                p_id = doc_word_id_map[window[p]]
                for q in range(0, p):
                    q_id = doc_word_id_map[window[q]]
                    if p_id == q_id:
                        # same word twice in the window: no self loop
                        continue
                    # word co-occurrences as weights, stored in both directions
                    for pair in ((p_id, q_id), (q_id, p_id)):
                        word_pair_count[pair] = word_pair_count.get(pair, 0.) + 1.

        row = []
        col = []
        weight = []
        for (p_id, q_id), count in word_pair_count.items():
            row.append(p_id)
            col.append(q_id)
            weight.append(count if weighted_graph else 1.)
        adj = sp.csr_matrix((weight, (row, col)), shape=(doc_nodes, doc_nodes))

        # node features in node-id order: pre-trained vector when available, oov vector otherwise
        features = [word_embeddings[w] if w in word_embeddings else oov[w] for w in doc_vocab]

        x_adj.append(adj)
        x_feature.append(features)

    return x_adj, x_feature





Args for building graphs and training model

In [None]:
class args:
    # Plain namespace of settings, read as class attributes (e.g. args.n_folds)
    max_epochs = 20
    lr = 1e-3
    batch_size = 64
    embedding_dim = 50   # length of the random vectors generated for out-of-vocabulary words
    hidden_dim = 50
    num_heads = 8   # used for attention model
    n_folds = 5   # number of StratifiedKFold splits; also used by plot_results
    window_size = 3   # sliding window length used when building the graphs

Create a list of unique words in the whole dataset. Create a word to id dictionary and then a dictionary for out of vocabulary words as keys and their word embeddings following uniform distribution from -0.1 to 0.1

In [None]:
testdf = pd.read_excel('../input/hr-string/test.xlsx')
testdf.columns = ['company', 'text']

# Apply the cleaning steps in the same order as for traindf:
# filter_text -> remove_words_nums -> hyphen split.
# Splitting hyphens before remove_words_nums would also strip numbers out of tokens such
# as "2005-2006", which the training text keeps, so the two sets would be cleaned differently.
testdf['text'] = testdf['text'].apply(filter_text)
testdf['text'] = testdf['text'].apply(remove_words_nums)
testdf['text'] = testdf['text'].apply(lambda x: ' '.join(x.split('-')))

In [None]:
    
word_embeddings_dim = args.embedding_dim

# train documents first, test documents after them
shuffle_doc_words_list = list(traindf['text'].values) + list(testdf['text'].values)

word_set = set()

for doc_words in shuffle_doc_words_list:
    word_set.update(doc_words.split())

# Sorting gives every word the same id on every run; iterating a set of strings
# directly depends on Python's per-process hash randomization.
vocab = sorted(word_set)
vocab_size = len(vocab)

word_id_map = {word: i for i, word in enumerate(vocab)}

# Fallback vectors, uniform in [-0.1, 0.1), for words without a pre-trained embedding.
# They are drawn from a generator seeded with the notebook seed so they are reproducible.
oov_rng = np.random.RandomState(seed)
oov = {v: oov_rng.uniform(-0.1, 0.1, word_embeddings_dim) for v in vocab}


window_size = args.window_size

print('building graphs for training')
x_adj, x_feature = build_graph(start=0, end=len(traindf), weighted_graph = True)

# Make Splits and Train

In [None]:
def check_train_test_word_overlap():
    
    """
    This function prints the fraction of unique words in test set that also occur in train set. 

    The test file is re-read from disk and cleaned with the same steps, in the same order,
    that the notebook applies to traindf['text'] (filter_text, remove_words_nums, hyphen
    split). Comparing raw test text with cleaned train text would understate the overlap.
    Assumes traindf['text'] has already gone through those steps when this is called.
            
        Returns:
            Nothing
    
    """
    train_unique_words = set()
    for text in traindf['text']:
        train_unique_words.update(text.split())

    # local frame: the notebook-level testdf is left untouched
    testdf = pd.read_excel('../input/hr-string/test.xlsx')
    testdf.columns = ['company', 'text']

    test_unique_words = set()
    for text in testdf['text']:
        cleaned = ' '.join(remove_words_nums(filter_text(text)).split('-'))
        test_unique_words.update(cleaned.split())

    if not test_unique_words:
        # nothing to compare; avoid dividing by zero
        print('Intersection of words: ', 0.0)
        return

    # set intersection instead of a list membership test per word
    overlap = test_unique_words & train_unique_words
    print('Intersection of words: ', np.round(len(overlap)/len(test_unique_words), 3))
    
def plot_results(hist_list):

    """
    Plots evaluation curves
    
        Parameters:
            hist_list (list): list of dictionaries (one per fold) with metric names as keys and, as values,
                              lists of epochwise (train, val) tuples. The curves of all folds are drawn
                              one after another on the same axis.
            
        Returns:
            Nothing
            
    """

    if not hist_list:
        # No fold results to draw.
        return
    
    plt.figure(figsize = (20, 12))
    
    # The 2x2 grid has room for at most four metrics.
    for i, metric_name in enumerate(hist_list[0]):
        plt.subplot(2,2,i+1)
        train = [x[0] for hist in hist_list for x in hist[metric_name]]
        val = [x[1] for hist in hist_list for x in hist[metric_name]]

        # Size the x axis from the data itself so it always matches the number of points,
        # whatever the number of folds contained in hist_list.
        steps = np.arange(len(train))
        sns.lineplot(x = steps, y = train, label = 'train')
        sns.lineplot(x = steps, y = val, label = 'val')
        sns.despine(right = True, top = True)
        plt.title(metric_name)
        
        # Only the bottom row of the grid gets an x-axis label.
        if i > 1:
            plt.xlabel('Epochs')
        

Create a stratified 5-fold split so that every fold contains a similar proportion of samples from each label. Print the normalized value counts of the target in each fold to check that they are similar. A fixed random state is very important because we want to compare different models on the same splits.

In [None]:

# random_state only has an effect when shuffle=True; without it, recent scikit-learn
# versions raise a ValueError and older ones silently ignore the seed.
skf = StratifiedKFold(n_splits = args.n_folds, shuffle = True, random_state = 42)
traindf['fold'] = -1

# Each sample is assigned the id of the fold in which it belongs to the validation part.
# NOTE(review): split() yields positional indices while .loc is label-based, so this
# assumes traindf has a default 0..n-1 index -- confirm.
for idx, (_, val_idx) in enumerate(skf.split(traindf, traindf['target'])):
    traindf.loc[val_idx, 'fold'] = idx

In [None]:
# Show the normalized target distribution of every fold.
for fold in range(args.n_folds):
    fold_targets = traindf.loc[traindf['fold'] == fold, 'target']
    print(fold_targets.value_counts(normalize = True))

Create the dataset and model classes

In [None]:

class GraphDataset(DGLDataset):
    """
    Dataset that builds one DGL graph per sample on access

    ...

    Attributes
    ----------
    adj_matrix (list): list of scipy sparse adjacency matrices, one per sample
    
    node_matrix (list): list of node matrices, one per sample

    targets (list): list of industry tags, or None when no labels are available


    """
    def __init__(self, x_adj, x_feature, targets = None):

        self.adj_matrix, self.node_matrix, self.targets = x_adj, x_feature, targets

    def __len__(self):
        return len(self.adj_matrix)

    def __getitem__(self, idx):

        """
        Returns the graph of sample idx. When targets were given, returns a
        (graph, long tensor of label) pair instead.

        """

        graph = dgl.from_scipy(self.adj_matrix[idx])

        # Each entry of the node matrix becomes one row of the float feature tensor.
        node_rows = [torch.tensor(row, dtype = torch.float) for row in self.node_matrix[idx]]
        graph.ndata['feat'] = torch.stack(node_rows)

        if self.targets is None:
            return graph

        return graph, torch.tensor(self.targets[idx], dtype = torch.long)
    
    
import torch.nn.functional as F


# Graph Neural Network with normal Convolutional Layers
class Classifier(nn.Module):
    """
    Two GraphConv layers with ReLU, followed by an average-pooling readout
    and a linear layer that outputs one score per class.
    """
    def __init__(self, in_dim, hidden_dim, n_classes):
        super().__init__()
        self.conv1 = GraphConv(in_dim, hidden_dim)
        self.conv2 = GraphConv(hidden_dim, hidden_dim)
        self.avgpooling = AvgPooling()
        self.classify = nn.Linear(hidden_dim, n_classes)

    def forward(self, g, h):
        # Graph convolution + activation, applied twice in sequence.
        for conv in (self.conv1, self.conv2):
            h = F.relu(conv(g, h))

        # Readout over the nodes, then classification.
        pooled = self.avgpooling(g, h)
        return self.classify(pooled)
        
    

# Graph Neural Network with Attention Layers where the node features are concatenated for attention    
class GATClassifier(nn.Module):
    """
    Two GATConv layers (num_heads heads, then a single head) with ReLU, dropout (p = 0.3),
    an average-pooling readout and a linear layer that outputs one score per class.
    """
    def __init__(self, in_dim, hidden_dim, num_heads, n_classes):
        super().__init__()
        self.hid_dim = hidden_dim
        self.gat1 = GATConv(in_dim, hidden_dim, num_heads)
        self.gat2 = GATConv(hidden_dim*num_heads, hidden_dim, 1)
        self.avgpooling = AvgPooling()
        self.drop = nn.Dropout(p = 0.3)
        self.classify = nn.Linear(hidden_dim, n_classes)

    def forward(self, g, h):
        # First dimension of h is kept; everything after it is flattened after each layer,
        # which matches the hidden_dim*num_heads input size of the second layer.
        n_rows = h.shape[0]

        h = F.relu(self.gat1(g, h)).reshape(n_rows, -1)
        h = F.relu(self.gat2(g, h)).reshape(n_rows, -1)

        # Dropout is applied before the readout.
        pooled = self.avgpooling(g, self.drop(h))

        return self.classify(pooled)
    
    
# Graph Neural Network with Attention Layers where a dot product is performed between node features  
class GATDotClassifier(nn.Module):
    """
    Two DotGatConv layers (num_heads heads, then a single head) with ReLU,
    an average-pooling readout and a linear layer that outputs one score per class.
    """
    def __init__(self, in_dim, hidden_dim, num_heads, n_classes):
        super().__init__()
        self.hid_dim = hidden_dim
        self.gat1 = DotGatConv(in_dim, hidden_dim, num_heads)
        self.gat2 = DotGatConv(hidden_dim*num_heads, hidden_dim, 1)
        self.avgpooling = AvgPooling()
        self.classify = nn.Linear(hidden_dim, n_classes)

    def forward(self, g, h):
        # First dimension of h is kept; everything after it is flattened after each layer,
        # which matches the hidden_dim*num_heads input size of the second layer.
        n_rows = h.shape[0]

        h = F.relu(self.gat1(g, h)).reshape(n_rows, -1)
        h = F.relu(self.gat2(g, h)).reshape(n_rows, -1)

        pooled = self.avgpooling(g, h)

        return self.classify(pooled)
    

Use wandb to track different metrics. You can see all the logged curves over here: https://wandb.ai/vbagal/gnn?workspace=user-vbagal

In [None]:
# Start a wandb run in the 'gnn' project; log_results sends its metrics through wandb.log.
wandb.init(project = 'gnn')

Define functions for fold training, 1 epoch training and validation

In [None]:


def train_fold(args, adj_list, node_list, fold = 0):

    """
    Trains a GATClassifier on every sample outside `fold` and validates it on the samples of `fold`.
    After each epoch the metrics are logged, and the model weights are written to 'fold-{fold}.pt'
    whenever the validation mrr improves.

        Parameters:
            args (class): Class containing variables specifying values necessary for training model
            adj_list (list): list of adjacency matrices
            node_list (list): list of node matrices
            fold (int): fold to validate model on. 
            
        Returns:
            dictionary with 'loss', 'f1', 'auc' and 'mrr' as keys and, for each,
            a list of epochwise (train, val) tuples
    
    
    """

    in_train = traindf['fold'] != fold
    in_val = traindf['fold'] == fold

    # Index labels of traindf are used as positions in adj_list / node_list.
    train_idx = list(traindf.index[in_train])
    val_idx = list(traindf.index[in_val])

    print('Num train samples ', len(train_idx))
    print('Num val samples ', len(val_idx))

    num_classes = traindf['target'].nunique()

    train_adj_list = [adj_list[i] for i in train_idx]
    val_adj_list = [adj_list[i] for i in val_idx]
    train_node_list = [node_list[i] for i in train_idx]
    val_node_list = [node_list[i] for i in val_idx]
    train_label_list = traindf.loc[in_train, 'target'].values
    val_label_list = traindf.loc[in_val, 'target'].values

    traindataset = GraphDataset(train_adj_list, train_node_list, train_label_list)
    valdataset = GraphDataset(val_adj_list, val_node_list, val_label_list)

    trainloader = GraphDataLoader(traindataset, batch_size = args.batch_size, shuffle = True)
    valloader = GraphDataLoader(valdataset, batch_size = args.batch_size, shuffle = False)

    model = GATClassifier(args.embedding_dim, args.hidden_dim, args.num_heads, num_classes)
    criterion = CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = args.lr)
    scheduler = None

    best_val_mrr = 0

    # Key order matches the order in which train_one_epoch / validate return their metrics.
    history = {'loss': [], 'f1': [], 'auc': [], 'mrr': []}

    for epoch in range(args.max_epochs):
        print(f'Epoch {epoch + 1}/{args.max_epochs}')

        train_metrics = train_one_epoch(trainloader, model, criterion, optimizer, scheduler, num_classes)
        val_metrics = validate(valloader, model, criterion, num_classes)

        log_results(*train_metrics, *val_metrics, epoch)

        for metric_name, train_value, val_value in zip(history, train_metrics, val_metrics):
            history[metric_name].append((train_value, val_value))

        # Keep only the checkpoint with the best validation mrr seen so far.
        val_mrr = val_metrics[3]
        if val_mrr > best_val_mrr:
            torch.save(model.state_dict(), f'fold-{fold}.pt')
            best_val_mrr = val_mrr

    return history

In [None]:
def train_one_epoch(trainloader, model, criterion, optimizer, scheduler, num_classes):

    """
    Trains the model for 1 epoch and returns the training loss, f1, roc_auc and mrr scores.
    Loss and micro f1 are averaged over the batches; roc_auc and mrr are computed once
    over the predictions of the whole epoch.

        Parameters:
            trainloader (DataLoader/Iterable): dataloader that yields a batch for training
            model (nn.Module): model used for training
            criterion (nn.Module): loss function
            optimizer (Optimizer): used to optimize the loss function
            scheduler (Scheduler): used to change the learning rate; stepped after every batch when given
            num_classes (int): number of classes
            
        Returns:
            loss, f1, roc_auc and mrr floats
    
    
    """
    n_batches = len(trainloader)

    running_loss = 0
    running_f1 = 0

    epoch_labels = []
    epoch_probs = []

    model.train()
    for graphs, targets in tqdm(trainloader, total = n_batches):

        outputs = model(graphs, graphs.ndata['feat'].float())
        batch_loss = criterion(outputs, targets)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if scheduler:
            scheduler.step()

        targets_np = targets.detach().cpu().numpy()
        # Softmax probabilities, not raw logits, are kept for the metrics.
        probs_np = outputs.softmax(-1).detach().cpu().numpy()

        batch_f1 = sklearn.metrics.f1_score(targets_np, probs_np.argmax(-1), average = 'micro')
        running_loss += batch_loss.item()/n_batches
        running_f1 += batch_f1/n_batches

        epoch_labels.append(targets_np)
        epoch_probs.append(probs_np)

    epoch_labels = np.concatenate(epoch_labels)
    epoch_probs = np.concatenate(epoch_probs)

    # One-hot encode the labels for the ranking metric.
    one_hot_labels = np.eye(num_classes)[epoch_labels]

    epoch_auc = sklearn.metrics.roc_auc_score(epoch_labels, epoch_probs, multi_class = 'ovo', labels = np.arange(num_classes))
    epoch_mrr = sklearn.metrics.label_ranking_average_precision_score(one_hot_labels, epoch_probs)

    return running_loss, running_f1, epoch_auc, epoch_mrr


def validate(valloader, model, criterion, num_classes):

    """
    Validates the model and returns the validation loss, f1, roc_auc and mrr scores over 1 epoch.
    Loss and micro f1 are averaged over the batches; roc_auc and mrr are computed once
    over the predictions of the whole validation set. No gradients are computed.

        Parameters:
            valloader (DataLoader/Iterable): dataloader that yields a batch for validating
            model (nn.Module): model to be used for validation
            criterion (nn.Module): loss function
            num_classes (int): number of classes
            
        Returns:
            loss, f1, roc_auc and mrr floats
    
    
    """
    
    val_loss = 0
    val_f1 = 0
    
    all_labels = []
    all_logits = []
    
    total = len(valloader)
    model.eval()
    
    with torch.no_grad():
        for G, label in tqdm(valloader, total = total):

            h = G.ndata['feat'].float()
            logit = model(G, h)
            loss = criterion(logit, label)

            label_numpy = label.detach().cpu().numpy()
            # Softmax probabilities, not raw logits, are kept for the metrics.
            logit_numpy = logit.softmax(-1).detach().cpu().numpy()

            val_loss += loss.item()/total
            val_f1 += sklearn.metrics.f1_score(label_numpy, logit_numpy.argmax(-1), average = 'micro')/total

            all_labels.append(label_numpy)
            all_logits.append(logit_numpy)

    all_labels = np.concatenate(all_labels)
    all_logits = np.concatenate(all_logits)

    # One-hot encode the labels for the ranking metric.
    one_hot_labels = np.zeros((len(all_labels), num_classes))
    one_hot_labels[np.arange(len(all_labels)), all_labels] = 1.0

    val_auc = sklearn.metrics.roc_auc_score(all_labels, all_logits, multi_class = 'ovo', labels = np.arange(num_classes))
    val_mrr = sklearn.metrics.label_ranking_average_precision_score(one_hot_labels, all_logits)

    return val_loss, val_f1, val_auc, val_mrr


def log_results(train_loss, train_f1, train_auc, train_mrr, val_loss, val_f1, val_auc, val_mrr, idx):

    """
    Sends the train and validation metrics of one epoch to the wandb project in a single log call.

        Parameters:
            train_loss, train_f1, train_auc, train_mrr: training values to be logged by wandb logger
            val_loss, val_f1, val_auc, val_mrr: validation values to be logged by wandb logger
            idx: logged under the 'epoch' key
            
        Returns:
            Nothing
    
    
    """

    train_part = {'train_loss': train_loss, 'train_f1': train_f1, 'train_auc': train_auc, 'train_mrr': train_mrr}
    val_part = {'val_loss': val_loss, 'val_f1': val_f1, 'val_auc': val_auc, 'val_mrr': val_mrr}

    wandb.log({**train_part, **val_part, 'epoch': idx})

Train the model on all 5 folds

In [None]:
# One history dictionary per fold, in fold order.
result_list = []
for fold_id in range(args.n_folds):
    result_list.append(train_fold(args, x_adj, x_feature, fold = fold_id))

In [None]:
metrics = ['f1', 'auc', 'mrr']

for m in metrics:
    # Average, over the folds, of the best validation score reached in each fold.
    fold_average = 0
    for fold_history in result_list:
        best_val = max(val_score for _, val_score in fold_history[m])
        fold_average += best_val / args.n_folds

    print(f'Average of max validation {m} over all folds: ', np.round(fold_average, 3))

For interactive comparison between different runs, please see this link: https://wandb.ai/vbagal/gnn?workspace=user-vbagal

In [None]:
# Draw the train/val curves of every metric collected in result_list.
plot_results(result_list)

# Make predictions on test set

In [None]:
def test(args, n_classes):

    """
    Returns test dataframe with 'preds_list' and 'preds' as two new columns. 'preds_list' has, for each description, the string
    form of a list with predictions sorted in descending order of their blended softmax score. It can be used for test mrr evaluation.
    'preds' column has the label with the highest blended score for each sample.

    The module-level `testdf` is modified in place and returned.

        Parameters:
            args (class): Class containing variables specifying values necessary for training model
            n_classes (int): number of classes
            
        Returns:
            dataframe
    
    
    """

    print('building graphs for testing')
    # NOTE(review): presumably build_graph indexes the combined train+test document list,
    # where test documents come after the train ones -- confirm against build_graph.
    x_adj, x_feature = build_graph(start=len(traindf), end=len(traindf) + len(testdf), weighted_graph = True)
    
    testdataset = GraphDataset(x_adj, x_feature)
    testloader = GraphDataLoader(testdataset, batch_size = args.batch_size, shuffle = False)
    
    model = GATClassifier(args.embedding_dim, args.hidden_dim, args.num_heads, n_classes)
    model_list = load_models(model, args.n_folds)
    
    pred_list = []
    
    with torch.no_grad():
        for G in tqdm(testloader):
            h = G.ndata['feat'].float()
            blended = 0
            for mod in model_list:
                # Blend the softmax scores of all loaded models. Dividing by the number of
                # models actually loaded keeps the result a proper average.
                blended = blended + mod(G, h).softmax(-1) / len(model_list)

            pred_list.append(blended.detach().cpu().numpy())

    preds = np.concatenate(pred_list)
    
    tags = []

    for sample in preds:
        # Class indices ordered from highest to lowest score.
        ranked = sample.argsort(-1)[::-1]
        tags.append(str([idx2label[i] for i in ranked]))
    
    testdf['preds_list'] = tags
    testdf['preds'] = [idx2label[i] for i in preds.argmax(-1)]
    
    return testdf

In [None]:
def load_models(model, folds, weights_dir = '/kaggle/working'):

    """
    Returns list of models loaded with pre-trained weights, one independent model per fold.
    Every returned model is a deep copy of `model` put in eval mode; `model` itself is left untouched.

        Parameters:
            model (nn.Module): model instance used as a template for the architecture
            folds (int): number of folds. The number of models saved is equal to number of folds used for training. 
            weights_dir (str): directory containing the 'fold-{i}.pt' checkpoint files
            
        Returns:
            list 
    
    
    """
    import copy

    model_list = []
    
    for i in range(folds):
        print('Loading weights')
        # Copy the template first: loading every checkpoint into the same instance would leave
        # the list holding `folds` references to one model carrying only the last fold's weights.
        fold_model = copy.deepcopy(model)
        fold_model.load_state_dict(torch.load(f'{weights_dir}/fold-{i}.pt'))
        fold_model.eval()
        model_list.append(fold_model)
        
    return model_list

In [None]:
# Use the same class count as train_fold so the model matches the saved checkpoints.
df = test(args, traindf['target'].nunique())

**This number is quite low which means that the words in the test dataset description are significantly different from those in the train dataset. Indeed, there is a significant distribution shift.**

In [None]:
# Print the fraction of unique test-set words that also occur in the train set.
check_train_test_word_overlap()

In [None]:
# Write the test predictions to submission.csv without the index column.
df.to_csv('submission.csv', index = False)

In [None]:
# Show the first rows of the prediction dataframe.
df.head()

To get one prediction list, use the following sample code.

In [None]:
import ast

subdf = pd.read_csv('./submission.csv')
# 'preds_list' stores the string form of a Python list. literal_eval parses such a literal
# without executing arbitrary code, unlike eval.
one_predlist = ast.literal_eval(subdf['preds_list'][0])

In [None]:
# Show the first rows of the reloaded submission file.
subdf.head()