In [1]:
import pyLDAvis
import pyLDAvis.gensim
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

import nltk
from nltk.tokenize import word_tokenize
import string
import pandas as pd
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer

from cleanlab import Datalab

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /home/stud/t/ts218/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 1. Approach: LDA Topic modelling

In [None]:
df_full = pd.read_csv('data/small_labels/dataset_full.csv')
# topicmodelling
df_full['tokenized'] = df_full['description'].apply(word_tokenize)
df_full['lower'] = df_full['tokenized'].apply(lambda x: [word.lower() for word in x])
# remove punctuation and stopwords
punc = string.punctuation
df_full['no_punc'] = df_full['lower'].apply(lambda x: [word for word in x if word not in punc])
# additionaly remove aditional punctuation found in topic modelling
add = ['"', '``', "''", '’', "'s"]
df_full['no_punc'] = df_full['no_punc'].apply(lambda x: [word for word in x if word not in add])
stop_words = set(stopwords.words('english'))
df_full['stopwords_removed'] = df_full['no_punc'].apply(lambda x: [word for word in x if word not in stop_words])

df_full.head()

Unnamed: 0,description,industry,labels,len_description,tokenized,lower,no_punc,stopwords_removed
0,The company develops software for the analysis...,Production & Supply Chain,8,215,"[The, company, develops, software, for, the, a...","[the, company, develops, software, for, the, a...","[the, company, develops, software, for, the, a...","[company, develops, software, analysis, evalua..."
1,The company is developing an information and p...,People & Learning,7,160,"[The, company, is, developing, an, information...","[the, company, is, developing, an, information...","[the, company, is, developing, an, information...","[company, developing, information, placement, ..."
2,The company develops bicycle accessories that ...,Retail & Living,10,74,"[The, company, develops, bicycle, accessories,...","[the, company, develops, bicycle, accessories,...","[the, company, develops, bicycle, accessories,...","[company, develops, bicycle, accessories, yet,..."
3,The company is developing sensor technology th...,unknown,12,76,"[The, company, is, developing, sensor, technol...","[the, company, is, developing, sensor, technol...","[the, company, is, developing, sensor, technol...","[company, developing, sensor, technology, yet,..."
4,The company is developing an as yet unknown mo...,Retail & Living,10,124,"[The, company, is, developing, an, as, yet, un...","[the, company, is, developing, an, as, yet, un...","[the, company, is, developing, an, as, yet, un...","[company, developing, yet, unknown, mobile, ap..."


In [None]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(df_full['stopwords_removed'], min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[df_full['stopwords_removed']], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

# Form Bigrams
df_full['stopwords_removed_bigram'] = make_bigrams(df_full['stopwords_removed'])


In [None]:
# pos tagging
df_full['pos_tags'] = df_full['stopwords_removed_bigram'].apply(nltk.tag.pos_tag)

# transform pos tag to wordnet format for lemmatization
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
df_full['wordnet_pos'] = df_full['pos_tags'].apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])

# lemmatization with nltk lemmatizer
wnl = WordNetLemmatizer()
df_full['lemmatized'] = df_full['wordnet_pos'].apply(lambda x: [wnl.lemmatize(word, tag) for word, tag in x])

df_full.head()

Unnamed: 0,description,industry,labels,len_description,tokenized,lower,no_punc,stopwords_removed,stopwords_removed_bigram,pos_tags,wordnet_pos,lemmatized
0,The company develops software for the analysis...,Production & Supply Chain,8,215,"[The, company, develops, software, for, the, a...","[the, company, develops, software, for, the, a...","[the, company, develops, software, for, the, a...","[company, develops, software, analysis, evalua...","[company, develops, software, analysis, evalua...","[(company, NN), (develops, VBZ), (software, NN...","[(company, n), (develops, v), (software, n), (...","[company, develop, software, analysis, evaluat..."
1,The company is developing an information and p...,People & Learning,7,160,"[The, company, is, developing, an, information...","[the, company, is, developing, an, information...","[the, company, is, developing, an, information...","[company, developing, information, placement, ...","[company, developing, information, placement, ...","[(company, NN), (developing, VBG), (informatio...","[(company, n), (developing, v), (information, ...","[company, develop, information, placement, pla..."
2,The company develops bicycle accessories that ...,Retail & Living,10,74,"[The, company, develops, bicycle, accessories,...","[the, company, develops, bicycle, accessories,...","[the, company, develops, bicycle, accessories,...","[company, develops, bicycle, accessories, yet,...","[company, develops, bicycle, accessories, yet_...","[(company, NN), (develops, VBZ), (bicycle, NN)...","[(company, n), (develops, v), (bicycle, n), (a...","[company, develop, bicycle, accessory, yet_kno..."
3,The company is developing sensor technology th...,unknown,12,76,"[The, company, is, developing, sensor, technol...","[the, company, is, developing, sensor, technol...","[the, company, is, developing, sensor, technol...","[company, developing, sensor, technology, yet,...","[company, developing, sensor, technology, yet_...","[(company, NN), (developing, VBG), (sensor, NN...","[(company, n), (developing, v), (sensor, n), (...","[company, develop, sensor, technology, yet_kno..."
4,The company is developing an as yet unknown mo...,Retail & Living,10,124,"[The, company, is, developing, an, as, yet, un...","[the, company, is, developing, an, as, yet, un...","[the, company, is, developing, an, as, yet, un...","[company, developing, yet, unknown, mobile, ap...","[company, developing, yet, unknown, mobile, ap...","[(company, NN), (developing, VBG), (yet, RB), ...","[(company, n), (developing, v), (yet, r), (unk...","[company, develop, yet, unknown, mobile, app, ..."


In [None]:
# slim down dataframe for topic modelling
df_top = df_full[['description', 'industry', 'labels', 'lemmatized']]

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(df_top['lemmatized'])
# remove extremes from dictionary (very frequent and very rare words)
id2word.filter_extremes(no_below=5, no_above=0.2)
# Create Corpus
texts = df_top['lemmatized']
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 2), (9, 1), (10, 1), (11, 1)]]


In [None]:
# Build LDA model
lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=13, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=100,
                                           per_word_topics=True,
                                           workers=4)

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))
# a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.616690909397126

Coherence Score:  0.3958568121604771


In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [None]:
df_top['bow'] = df_top['lemmatized'].apply(lambda x: id2word.doc2bow(x))
df_top['gensim_topic_vectors'] = df_top['bow'].apply(lambda x: lda_model.get_document_topics(x))
df_top['gensim_topic'] = df_top['gensim_topic_vectors'].apply(lambda x: max(x, key=lambda item: item[1])[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top['bow'] = df_top['lemmatized'].apply(lambda x: id2word.doc2bow(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top['gensim_topic_vectors'] = df_top['bow'].apply(lambda x: lda_model.get_document_topics(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top['gensim_topic'] = df_top['ge

In [None]:
df_top.to_csv('data/exploration/lda_13_topics.csv')
df_top.head()

Unnamed: 0,description,industry,labels,lemmatized,bow,gensim_topic_vectors,gensim_topic
0,The company develops software for the analysis...,Production & Supply Chain,8,"[company, develop, software, analysis, evaluat...","[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[(5, 0.20314579), (6, 0.3538521), (8, 0.3917175)]",8
1,The company is developing an information and p...,People & Learning,7,"[company, develop, information, placement, pla...","[(12, 1), (13, 1), (14, 1), (15, 1), (16, 1), ...","[(0, 0.37856132), (9, 0.44602588), (10, 0.1054...",9
2,The company develops bicycle accessories that ...,Retail & Living,10,"[company, develop, bicycle, accessory, yet_kno...","[(22, 1), (23, 1), (24, 1), (25, 1)]","[(0, 0.015388162), (1, 0.015388162), (2, 0.015...",6
3,The company is developing sensor technology th...,unknown,12,"[company, develop, sensor, technology, yet_kno...","[(10, 1), (24, 1), (25, 1), (26, 1)]","[(0, 0.015384698), (1, 0.015384698), (2, 0.015...",6
4,The company is developing an as yet unknown mo...,Retail & Living,10,"[company, develop, yet, unknown, mobile, app, ...","[(7, 1), (27, 1), (28, 1), (29, 1), (30, 1), (...","[(3, 0.57014114), (9, 0.25808263), (10, 0.1018...",3


## 2. Approach: Cleanlab

following this example: https://docs.cleanlab.ai/stable/tutorials/datalab/text.html#

In [9]:
# read in data and store texts and labels as np array for cleanlab
data = pd.read_csv('../data/data_augmentation_sd/dataset_full.csv')
data = data.dropna()

raw_texts = data['description'].astype(str).to_list()
labels = data['labels'].to_list()
startup_ids = data['startup_ID'].to_list()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5343 entries, 0 to 5342
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   startup_ID   5343 non-null   int64 
 1   description  5343 non-null   object
 2   industry     5343 non-null   object
 3   labels       5343 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 167.1+ KB


In [3]:
num_classes = len(set(labels))
print(f"Classes: {set(labels)}")
i = 0
print(f"Example Label: {labels[i]}")
print(f"Example Text: {raw_texts[i]}")

Classes: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
Example Label: 8
Example Text: The company develops software for the analysis and evaluation of multi-stage processes at the sensor level in the field of industrial production. This is intended to optimize production processes and reduce rejects.


In [4]:
transformer = SentenceTransformer('all-MiniLM-L6-v2')
text_embeddings = transformer.encode(raw_texts)

In [5]:
len(text_embeddings)

5343

In [6]:
model = LogisticRegression(max_iter=400)

pred_probs = cross_val_predict(model, text_embeddings, labels, method="predict_proba")

data_dict = {"texts": raw_texts, "labels": labels}
lab = Datalab(data_dict, label_name="labels")
lab.find_issues(pred_probs=pred_probs, features=text_embeddings)

Finding label issues ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to

In [7]:
lab.report()

Here is a summary of the different kinds of issues found in the data:

    issue_type  num_issues
         label        1361
near_duplicate         417
       outlier           0

Dataset Information: num_examples: 5343, num_classes: 13


----------------------- label issues -----------------------

About this issue:
	Examples whose given label is estimated to be potentially incorrect
    (e.g. due to annotation error) are flagged as having label issues.
    

Number of examples with this issue: 1361
Overall dataset quality in terms of this issue: 0.7013

Examples representing most severe instances of this issue:
      is_label_issue  label_score  given_label  predicted_label
5098            True     0.001504            1               10
2604            True     0.001985           12                7
3393            True     0.002005           12                1
3466            True     0.002038            9               12
3602            True     0.002057            3             

In [8]:
label_issues = lab.get_issues("label")
label_issues.head()

Unnamed: 0,is_label_issue,label_score,given_label,predicted_label
0,False,0.804829,8,8
1,False,0.874361,7,7
2,False,0.439456,10,10
3,False,0.623914,8,8
4,False,0.547902,10,10


In [16]:
data_with_suggested_labels = pd.DataFrame(
    {"startup_ID": startup_ids, "is_label_issue": label_issues["is_label_issue"], "text": raw_texts, "given_label": labels, "suggested_label": label_issues["predicted_label"]}
)
data_with_suggested_labels.to_csv('../data/label_correction/label_correction_suggestion.csv', index=False)

In [17]:
# replace label number with label text
label_text_map= {
        "AR & VR": 0,
        "Data & Computing": 1,
        "Energy & Climate": 2,
        "FinTech & LegalTech": 3,
        "Materials & Deeptech": 4,
        "Med & Health": 5,
        "Mobility & Transportation": 6,
        "People & Learning": 7,
        "Production & Supply Chain": 8,
        "Property & Construction": 9,
        "Retail & Living": 10,
        "Sustainability & GreenTech": 11,
        "Unknown": 12
    }

def transform_label(label):
    return list(label_text_map.keys())[list(label_text_map.values()).index(label)]

data_with_suggested_labels['given_label'] = data_with_suggested_labels['given_label'].apply(lambda x: transform_label(x))
data_with_suggested_labels['suggested_label'] = data_with_suggested_labels['suggested_label'].apply(lambda x: transform_label(x))

data_with_suggested_labels.to_csv('../data/label_correction/label_correction_suggestion_readable.csv', index=False)
