### **0. Set-up**

In [1]:
import pandas as pd

In [2]:
# Read both source files: the tab-separated word-pair list and the
# comma-separated questionnaire file with the per-pair statistics.
nl_simlex = pd.read_csv(
    "../data/dataset/nl-simlex-999.txt",
    sep="\t",
)
nl_simlex_questionnaire = pd.read_csv(
    "../data/dataset/nl-simlex-999-questionnaire.csv",
)

# Display the word-pair frame as the cell output
nl_simlex

Unnamed: 0,word1,word2,SimLex999,POS
0,oud,nieuw,1.94,A
1,slim,intelligent,8.19,A
2,hard,moeilijk,4.46,A
3,gelukkig,vrolijk,6.49,A
4,hard,stoer,5.69,A
...,...,...,...,...
994,samenvoegen,verwerven,3.89,V
995,sturen,bijwonen,1.85,V
996,verzamelen,bijwonen,1.06,V
997,opnemen,intrekken,2.29,V


### **1. Rename Columns and Values**

In [3]:
# Swap the exact value "taxi's" for "taxis" in the second-word column
# (per the original note, this prevents an error later on).
nl_simlex['word2'] = nl_simlex['word2'].replace(to_replace="taxi's", value='taxis')

# Display every pair whose first word is 'taxi' to confirm the replacement
nl_simlex.loc[nl_simlex['word1'].eq('taxi')]

Unnamed: 0,word1,word2,SimLex999,POS
143,taxi,taxis,8.17,N
553,taxi,bus,2.04,N


In [4]:
# Apply the same "taxi's" -> "taxis" fix to the questionnaire frame so its
# word2 values stay consistent with nl_simlex.
nl_simlex_questionnaire['word2'] = nl_simlex_questionnaire['word2'].replace(
    to_replace="taxi's",
    value='taxis',
)

# Display every questionnaire row whose first word is 'taxi'
nl_simlex_questionnaire.loc[nl_simlex_questionnaire['word1'].eq('taxi')]

Unnamed: 0,word1,word2,Simlex999,Std Deviation,Variance,Count,POS
259,taxi,taxis,8.17,2.5,6.27,14,N
887,taxi,bus,2.04,1.77,3.13,17,N


### **2. Duplicates**

In [5]:
# Build one "word1_word2" key per row so a word pair can be compared as a single value
nl_simlex['Combined_Columns'] = nl_simlex['word1'].str.cat(nl_simlex['word2'], sep='_')

# Flag every row whose key occurs more than once (keep=False marks all copies)
duplicate_combinations = nl_simlex.duplicated(subset=['Combined_Columns'], keep=False)

# Print the rows that belong to a duplicated word pair
print(nl_simlex.loc[duplicate_combinations])

     word1      word2  SimLex999 POS  Combined_Columns
11  slecht  vreselijk       5.91   A  slecht_vreselijk
13  slecht  vreselijk       7.53   A  slecht_vreselijk


In [6]:
# Collect the index labels of every row whose (word1, word2) pair occurs more
# than once, instead of hard-coding the single pair ('slecht', 'vreselijk')
# found above. keep=False flags all copies, so both rows of a duplicated pair
# are removed, exactly as before.
indices_to_drop = nl_simlex.index[nl_simlex.duplicated(subset=['word1', 'word2'], keep=False)]

# Drop those rows and renumber the index from 0. The frame is rebound rather
# than mutated in place, so the step reads as a plain transformation.
nl_simlex = nl_simlex.drop(index=indices_to_drop).reset_index(drop=True)

# Show results
nl_simlex

Unnamed: 0,word1,word2,SimLex999,POS,Combined_Columns
0,oud,nieuw,1.94,A,oud_nieuw
1,slim,intelligent,8.19,A,slim_intelligent
2,hard,moeilijk,4.46,A,hard_moeilijk
3,gelukkig,vrolijk,6.49,A,gelukkig_vrolijk
4,hard,stoer,5.69,A,hard_stoer
...,...,...,...,...,...
992,samenvoegen,verwerven,3.89,V,samenvoegen_verwerven
993,sturen,bijwonen,1.85,V,sturen_bijwonen
994,verzamelen,bijwonen,1.06,V,verzamelen_bijwonen
995,opnemen,intrekken,2.29,V,opnemen_intrekken


Double-check that no duplicate word pairs remain

In [7]:
# Rebuild the "word1_word2" key on the de-duplicated frame
nl_simlex['Combined_Columns'] = nl_simlex['word1'].str.cat(nl_simlex['word2'], sep='_')

# Mark all rows that share their key with at least one other row
duplicate_combinations = nl_simlex.duplicated(subset=['Combined_Columns'], keep=False)

# Print whatever is still flagged (an empty frame means no duplicates are left)
print(nl_simlex.loc[duplicate_combinations])

Empty DataFrame
Columns: [word1, word2, SimLex999, POS, Combined_Columns]
Index: []


In [8]:
# Drop the helper key column now that the duplicate checks are done.
# errors="ignore" keeps this cell re-runnable: on a second run the column is
# already gone and the drop would otherwise raise KeyError.
nl_simlex = nl_simlex.drop(columns="Combined_Columns", errors="ignore")

### **3. Standard Deviation**

In [9]:
# Align the score column name with nl_simlex ('Simlex999' -> 'SimLex999')
# so it can be used as a join key.
nl_simlex_questionnaire = nl_simlex_questionnaire.rename(columns={'Simlex999': 'SimLex999'})

# Drop the questionnaire's POS column; nl_simlex already carries POS, and
# keeping both would produce POS_x / POS_y after the merge.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
nl_simlex_questionnaire = nl_simlex_questionnaire.drop(columns="POS", errors="ignore")

# Enrich with standard deviation. SimLex999 is part of the join key because of
# duplicate word pairs (per the original note).
# NOTE: SimLex999 is a float key, so matching relies on both files storing
# identical score values.
merged = pd.merge(nl_simlex, nl_simlex_questionnaire, on=['word1', 'word2', 'SimLex999'], how='left')

# A left join must not add rows; a larger result means some key matched
# several questionnaire rows and the scores would be duplicated.
assert len(merged) == len(nl_simlex), "merge added rows: questionnaire join keys are not unique"

# Show results
merged

Unnamed: 0,word1,word2,SimLex999,POS,Std Deviation,Variance,Count
0,oud,nieuw,1.94,A,2.42,5.87,16
1,slim,intelligent,8.19,A,1.54,2.38,15
2,hard,moeilijk,4.46,A,3.18,10.14,10
3,gelukkig,vrolijk,6.49,A,2.15,4.62,17
4,hard,stoer,5.69,A,2.94,8.64,13
...,...,...,...,...,...,...,...
992,samenvoegen,verwerven,3.89,V,3.11,9.65,14
993,sturen,bijwonen,1.85,V,1.95,3.79,16
994,verzamelen,bijwonen,1.06,V,1.71,2.93,15
995,opnemen,intrekken,2.29,V,2.67,7.13,11


### **4. Export**

In [10]:
# Rename 'Std Deviation' to 'std_dev' first, then select the export columns.
# rename is a no-op when the old name is already gone, so re-running this cell
# no longer raises KeyError (selecting 'Std Deviation' before renaming did).
merged = (
    merged
    .rename(columns={'Std Deviation': 'std_dev'})
    .loc[:, ['word1', 'word2', 'SimLex999', 'std_dev', 'POS']]
)

# Show results
merged

Unnamed: 0,word1,word2,SimLex999,std_dev,POS
0,oud,nieuw,1.94,2.42,A
1,slim,intelligent,8.19,1.54,A
2,hard,moeilijk,4.46,3.18,A
3,gelukkig,vrolijk,6.49,2.15,A
4,hard,stoer,5.69,2.94,A
...,...,...,...,...,...
992,samenvoegen,verwerven,3.89,3.11,V
993,sturen,bijwonen,1.85,1.95,V
994,verzamelen,bijwonen,1.06,1.71,V
995,opnemen,intrekken,2.29,2.67,V


In [11]:
# Write the cleaned frame to disk, leaving the row index out of the file
merged.to_csv(
    '../data/dataset/cleaned-nl-simlex-999.csv',
    index=False,
)