In this notebook, I consolidated original training data and annotated data. In addition, I generated some non_label data from possible combination pairs. 

## Read in data



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Each row in the dataframe consists of a news article, and a sentence in which a certain relationship was found (just as "invested_in", or "founded_by"). There were some patterns used to gather the data, so it might contain some noise. 

In [2]:
from tqdm import tqdm, trange
import collections
from sklearn.preprocessing import OneHotEncoder

In [3]:
import pandas as pd
import numpy as np
import re
import itertools

In [4]:
df_original = pd.read_csv("/content/drive/MyDrive/capstone/data_relation_cleaned.csv")
df_original.head(1)

Unnamed: 0.1,Unnamed: 0,Company A,Company B,Sentence,Type,Degree,Url,a_start,a_end,b_start,b_end,words_start,words_end,type_cleaned
0,0,Fortino Capital,Newion,After its rapid expansion from Luxembourg into...,Investment,indirect,https://www.eu-startups.com/2021/07/luxembourg...,138,152,158,163,"[0, 5, 9, 15, 25, 30, 41, 46, 54, 58, 71, 80, ...","[4, 8, 14, 24, 29, 40, 45, 53, 57, 70, 79, 84,...",Financial


In [5]:
df_annotated = pd.read_csv("/content/drive/MyDrive/capstone/data_expansion_cleaned.csv")
df_annotated.head(1)


Unnamed: 0,index,article_index,url,sentence,organization_a,organization_b,Financial,Technical,People,Partner,a_start,a_end,b_start,b_end
0,0,0.0,https://shorttermrentalz.com/news/operto-staym...,"Canada: Operto Guest Technologies, a provider ...",Operto Guest Technologies,STAYmyway,1.0,0.0,0.0,0.0,8.0,32.0,228.0,236.0


## Data Cleaning and feature engineering

In [6]:
len(df_original)

327

In [7]:
df_original.isnull().sum()

Unnamed: 0      0
Company A       0
Company B       0
Sentence        0
Type            2
Degree          0
Url             9
a_start         0
a_end           0
b_start         0
b_end           0
words_start     0
words_end       0
type_cleaned    1
dtype: int64

In [8]:
df_original = df_original.dropna(subset = ['Type'])

In [9]:
ohe = OneHotEncoder()
transformed = ohe.fit_transform(df_original[['type_cleaned']])
df_original[ohe.categories_[0]] = transformed.toarray()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [10]:
df_original.head(1)

Unnamed: 0.1,Unnamed: 0,Company A,Company B,Sentence,Type,Degree,Url,a_start,a_end,b_start,b_end,words_start,words_end,type_cleaned,Financial,Partner,People,Technical
0,0,Fortino Capital,Newion,After its rapid expansion from Luxembourg into...,Investment,indirect,https://www.eu-startups.com/2021/07/luxembourg...,138,152,158,163,"[0, 5, 9, 15, 25, 30, 41, 46, 54, 58, 71, 80, ...","[4, 8, 14, 24, 29, 40, 45, 53, 57, 70, 79, 84,...",Financial,1.0,0.0,0.0,0.0


In [11]:
# fill the multi label data points
df_original.loc[df_original.Type == 'Investment/People', 'Financial'] = 1.0
df_original.loc[df_original.Type == 'Investment/People', 'Partner'] = 1.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [12]:
df_original.type_cleaned.value_counts().index

Index(['Financial', 'Technical', 'People', 'Partner'], dtype='object')

In [13]:
df = pd.DataFrame(columns = ["entity_a","entity_b","entity_spans","sentence",'Financial', 'Partner', 'People', 'Technical'])
df["entity_a"] = pd.concat([df_original['Company A'], df_annotated['organization_a']])
df["entity_b"] = pd.concat([df_original['Company B'], df_annotated['organization_b']])
df['sentence'] = pd.concat([df_original['Sentence'], df_annotated['sentence']])
df['Financial'] = pd.concat([df_original['Financial'], df_annotated['Financial']])
df['Partner'] =  pd.concat([df_original['Partner'], df_annotated['Partner']])
df['People'] =  pd.concat([df_original['People'], df_annotated['People']])
df['Technical'] =  pd.concat([df_original['Technical'], df_annotated['Technical']])

In [14]:
len(df)

719

In [15]:
df = df.reset_index(drop = True)

In [16]:
df.head()

Unnamed: 0,entity_a,entity_b,entity_spans,sentence,Financial,Partner,People,Technical
0,Fortino Capital,Newion,,After its rapid expansion from Luxembourg into...,1.0,0.0,0.0,0.0
1,Fortino Capital,Charles Souillard,,"As part of the transaction, Miguel Valdes and ...",0.0,0.0,1.0,0.0
2,Fortino Capital,Miguel Valdes,,"As part of the transaction, Miguel Valdes and ...",0.0,0.0,1.0,0.0
3,Fortino Capital,Autodesk,,Belgium's Oqton scores $40 million to 'disrupt...,0.0,0.0,1.0,0.0
4,Fortino Capital,SimplyDelivery,,"SimplyDelivery, the Berlin-based startup which...",1.0,0.0,0.0,0.0


In [17]:
from collections import Counter, defaultdict

## Generate no relationship pairs within sentences

In [18]:
distinct_sentences = df['sentence'].drop_duplicates().tolist()

In [19]:
sentence_dict = defaultdict(set)

In [20]:
for i in df.index:
    sentence = df['sentence'].iloc[i]
    entity_a = df['entity_a'].iloc[i]
    entity_b = df['entity_b'].iloc[i]
    sentence_dict[sentence].add(entity_a)
    sentence_dict[sentence].add(entity_b)


In [21]:
import itertools

In [22]:
pair_dict = {}
for key in sentence_dict:
    pairs = []
    for subset in itertools.combinations(sentence_dict[key], 2):
        pair_dict[subset] = key
    #     pairs.append(subset)
    # sentence_dict[key] = pairs

In [23]:
len(pair_dict)

1318

In [24]:
for i in df.index:
    
    pair_1 = (df['entity_a'].iloc[i], df['entity_b'].iloc[i])
    pair_2 = (df['entity_b'].iloc[i], df['entity_a'].iloc[i])
    if pair_1 in pair_dict:
        del pair_dict[pair_1] 
    if pair_2 in pair_dict:
        del pair_dict[pair_2]


In [25]:
len(pair_dict)

647

In [26]:
generated_entity_a = []
generated_entity_b = []
generated_sentences = []
for pair in pair_dict:
    generated_entity_a.append(pair[0])
    generated_entity_b.append(pair[1])
    generated_sentences.append(pair_dict[pair])

In [27]:
generated_df = pd.DataFrame(columns = ["entity_a","entity_b","entity_spans","sentence",'Financial', 'Partner', 'People', 'Technical'])
generated_df["entity_a"] = generated_entity_a
generated_df["entity_b"] = generated_entity_b
generated_df["sentence"] = generated_sentences 
generated_df['Financial'] = [0.0 for i in range(len(pair_dict))]
generated_df['Partner'] =  [0.0 for i in range(len(pair_dict))]
generated_df['People'] =  [0.0 for i in range(len(pair_dict))]
generated_df['Technical'] =  [0.0 for i in range(len(pair_dict))]


In [28]:
generated_df

Unnamed: 0,entity_a,entity_b,entity_spans,sentence,Financial,Partner,People,Technical
0,Miguel Valdes,Charles Souillard,,"As part of the transaction, Miguel Valdes and ...",0.0,0.0,0.0,0.0
1,Sandvik,Autodesk,,Belgium's Oqton scores $40 million to 'disrupt...,0.0,0.0,0.0,0.0
2,Pires Investments,Getvisibility,,"Certain existing investors, including Pires In...",0.0,0.0,0.0,0.0
3,Melita,EQT,,David and his team have done an excellent job ...,0.0,0.0,0.0,0.0
4,Blackrock Communications,MC Venture Partners,,The deal was announced in December between Mel...,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
642,Notion,Sentry,,"Akto, a Palo Alto, California-based startup bu...",0.0,0.0,0.0,0.0
643,Accel India,Tenable,,"Akto, a Palo Alto, California-based startup bu...",0.0,0.0,0.0,0.0
644,Accel India,Sentry,,"Akto, a Palo Alto, California-based startup bu...",0.0,0.0,0.0,0.0
645,Tenable,Sentry,,"Akto, a Palo Alto, California-based startup bu...",0.0,0.0,0.0,0.0


In [29]:
df

Unnamed: 0,entity_a,entity_b,entity_spans,sentence,Financial,Partner,People,Technical
0,Fortino Capital,Newion,,After its rapid expansion from Luxembourg into...,1.0,0.0,0.0,0.0
1,Fortino Capital,Charles Souillard,,"As part of the transaction, Miguel Valdes and ...",0.0,0.0,1.0,0.0
2,Fortino Capital,Miguel Valdes,,"As part of the transaction, Miguel Valdes and ...",0.0,0.0,1.0,0.0
3,Fortino Capital,Autodesk,,Belgium's Oqton scores $40 million to 'disrupt...,0.0,0.0,1.0,0.0
4,Fortino Capital,SimplyDelivery,,"SimplyDelivery, the Berlin-based startup which...",1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
714,Akto,Sentry,,"Akto, a Palo Alto, California-based startup bu...",0.0,0.0,1.0,0.0
715,Util,Interos,,"Earlier this year, Mike Goynes joined Util as ...",0.0,0.0,1.0,0.0
716,Revuze,PSG,,"Revuze, a Netanya, Israel-based provider of re...",1.0,0.0,0.0,0.0
717,Revuze,NPD Group,,"Revuze, a Netanya, Israel-based provider of re...",0.0,0.0,1.0,0.0


In [30]:
consolidated = pd.concat([df, generated_df])

## Calculate entity spans

In [37]:
df = consolidated.reset_index( drop = True)

In [38]:
drop_index = []
for ind in df.index:
    sentence = df.iloc[ind,3]
    wA = df.iloc[ind,0].strip()
    wB = df.iloc[ind,1].strip()
    if re.search(wA, sentence) is not None and re.search(wB, sentence) is not None:
      #re.search returns None if word not found
      startA = re.search(wA,sentence).start()
      startB = re.search(wB,sentence).start()
      endA = re.search(wA,sentence).end() - 1
      endB = re.search(wB,sentence).end() - 1

      df["entity_spans"].iloc[ind]=list([(startA, endA), (startB, endB)])
    else:
        drop_index.append(ind)

df = df.drop(index = drop_index)

    

In [39]:
df

Unnamed: 0,entity_a,entity_b,entity_spans,sentence,Financial,Partner,People,Technical
0,Fortino Capital,Newion,"[(138, 152), (158, 163)]",After its rapid expansion from Luxembourg into...,1.0,0.0,0.0,0.0
1,Fortino Capital,Charles Souillard,"[(128, 142), (46, 62)]","As part of the transaction, Miguel Valdes and ...",0.0,0.0,1.0,0.0
2,Fortino Capital,Miguel Valdes,"[(128, 142), (28, 40)]","As part of the transaction, Miguel Valdes and ...",0.0,0.0,1.0,0.0
3,Fortino Capital,Autodesk,"[(288, 302), (166, 173)]",Belgium's Oqton scores $40 million to 'disrupt...,0.0,0.0,1.0,0.0
4,Fortino Capital,SimplyDelivery,"[(230, 244), (0, 13)]","SimplyDelivery, the Berlin-based startup which...",1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
1361,Notion,Sentry,"[(233, 238), (304, 309)]","Akto, a Palo Alto, California-based startup bu...",0.0,0.0,0.0,0.0
1362,Accel India,Tenable,"[(143, 153), (271, 277)]","Akto, a Palo Alto, California-based startup bu...",0.0,0.0,0.0,0.0
1363,Accel India,Sentry,"[(143, 153), (304, 309)]","Akto, a Palo Alto, California-based startup bu...",0.0,0.0,0.0,0.0
1364,Tenable,Sentry,"[(271, 277), (304, 309)]","Akto, a Palo Alto, California-based startup bu...",0.0,0.0,0.0,0.0


In [51]:
df = df.reset_index(drop = True)

In [62]:
from sklearn.model_selection import GroupShuffleSplit 

splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state = 0)
split = splitter.split(df, groups=df['sentence'])
train_inds, valid_inds = next(split)

train = df.iloc[train_inds]
validation = df.iloc[valid_inds]

In [63]:
train

Unnamed: 0,entity_a,entity_b,entity_spans,sentence,Financial,Partner,People,Technical
0,Fortino Capital,Newion,"[(138, 152), (158, 163)]",After its rapid expansion from Luxembourg into...,1.0,0.0,0.0,0.0
1,Fortino Capital,Charles Souillard,"[(128, 142), (46, 62)]","As part of the transaction, Miguel Valdes and ...",0.0,0.0,1.0,0.0
2,Fortino Capital,Miguel Valdes,"[(128, 142), (28, 40)]","As part of the transaction, Miguel Valdes and ...",0.0,0.0,1.0,0.0
4,Fortino Capital,SimplyDelivery,"[(230, 244), (0, 13)]","SimplyDelivery, the Berlin-based startup which...",1.0,0.0,0.0,0.0
5,Fortino Capital,Melita,"[(362, 376), (332, 337)]",David and his team have done an excellent job ...,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
1271,Notion,Sentry,"[(233, 238), (304, 309)]","Akto, a Palo Alto, California-based startup bu...",0.0,0.0,0.0,0.0
1272,Accel India,Tenable,"[(143, 153), (271, 277)]","Akto, a Palo Alto, California-based startup bu...",0.0,0.0,0.0,0.0
1273,Accel India,Sentry,"[(143, 153), (304, 309)]","Akto, a Palo Alto, California-based startup bu...",0.0,0.0,0.0,0.0
1274,Tenable,Sentry,"[(271, 277), (304, 309)]","Akto, a Palo Alto, California-based startup bu...",0.0,0.0,0.0,0.0


In [55]:
print('train Financial ratio:',len(train[train['Financial'] == 1.0])/len(train))
print('train Partner ratio:',len(train[train['Partner'] == 1.0])/len(train))
print('train People ratio:',len(train[train['People'] == 1.0])/len(train))
print('train Technical ratio:',len(train[train['Technical'] == 1.0])/len(train))

train Financial ratio: 0.209009009009009
train Partner ratio: 0.09009009009009009
train People ratio: 0.06666666666666667
train Technical ratio: 0.15585585585585585


In [65]:
print('validation Financial ratio:',len(validation[validation['Financial'] == 1.0])/len(validation))
print('validation Partner ratio:',len(validation[validation['Partner'] == 1.0])/len(validation))
print('validation People ratio:',len(validation[validation['People'] == 1.0])/len(validation))
print('validation Technical ratio:',len(validation[validation['Technical'] == 1.0])/len(validation))

validation Financial ratio: 0.3433734939759036
validation Partner ratio: 0.09036144578313253
validation People ratio: 0.08433734939759036
validation Technical ratio: 0.19879518072289157


In [66]:
pd.Series(valid_inds).to_csv('/content/drive/MyDrive/capstone/valid_ids.csv', index = False, header = False)

In [69]:
df.to_csv('/content/drive/MyDrive/capstone/Cleaned_full_data.csv')

In [68]:
train.to_csv('/content/drive/MyDrive/capstone/Cleaned_train_data.csv')

In [67]:
validation.to_csv('/content/drive/MyDrive/capstone/Cleaned_valid_data.csv')