In [None]:
# Mount Google Drive inside the Colab VM at /content/drive so the project
# files stored on Drive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/MyDrive/Colab\ Notebooks/Synset_Grouping

/content/drive/.shortcut-targets-by-id/1GSBPdsf5hLXUWvkP6mXrY40FQGH44Er5/Synset_Grouping


In [None]:
# List the working directory to check that the expected input files are present.
ls

1_prepare_synset_data.ipynb
2_prepare_synset_data_part2.ipynb
3_group_synset.ipynb
4_label_cluster_title_to_topic4.ipynb
cluster_matrix_topic4_5000sents.csv
preprocessed_topic4_corpus.csv
preprocessed_topic4_verb_list.csv
preprocessed_topic4_vp_synset.csv
topic4_dataset.csv
topic4_sent_corpus.csv
topic4_verb_cluster_data_5000sents.csv
topic4_verb_list_5000sents.csv
vp_synset_data_topic4_5000sents.csv
wrong_4_label_cluster_title_to_topic4.ipynb


- input: <unique,valid> lemmatized verb list with frequency and synset

- output: 
    - input file updated with cluster_id and cluster_title
    - cluster file with <cluster_id, cluster_title, verb_list, actual_data_list: [(lemma, freq)]>

In [None]:
import pandas as pd
import numpy as np
import time

In [None]:
# Wall-clock start of the run; the final cell subtracts it to report the total runtime.
start_time = time.time()

In [None]:
# Load the verb table from the working directory: one row per lemmatized verb
# with its frequency, synset text and a valid/invalid status flag.
df = pd.read_csv('preprocessed_topic4_vp_synset.csv')
print('total data: ', len(df))
df.head()

total data:  2830


Unnamed: 0.1,Unnamed: 0,Lemmatized Verb,Frequency,Synset,Status
0,0,-1130,2,,invalid
1,1,-2665,1,,invalid
2,2,-adjusted,1,,invalid
3,3,-appearing,1,,invalid
4,4,-approved,1,,invalid


### Consider only valid verbs

In [None]:
# Keep only the rows flagged as valid. The explicit .copy() gives valid_df its
# own data: later cells add columns to it, and doing that on a bare slice of
# df triggers SettingWithCopyWarning.
valid_df = df[df['Status'] == 'valid'].copy()
print('total data: ', len(valid_df))

# Positional rename: only the first column changes name (it becomes 'id').
valid_df.columns = ['id', 'Lemmatized Verb', 'Frequency', 'Synset', 'Status']
valid_df.head()

total data:  1530


Unnamed: 0,id,Lemmatized Verb,Frequency,Synset,Status
36,36,abandon,4,['give_up' 'vacate' 'abandon' 'forsake' 'deser...,valid
38,38,abate,3,['let_up' 'slake' 'die_away' 'slack' 'slack_of...,valid
39,39,abbreviate,1,['contract' 'shorten' 'cut' 'foreshorten' 'abr...,valid
40,40,abide,1,['digest' 'stick_out' 'support' 'suffer' 'tole...,valid
42,42,ablate,1,['ablate'],valid


In [None]:
# Disabled debugging aid: the code is wrapped in a string literal so it does
# not run. If enabled it would limit valid_df to its first `subset` rows.
'''
subset = 500
valid_df = valid_df[:subset]

valid_df.tail()
'''

'\nsubset = 500\nvalid_df = valid_df[:subset]\n\nvalid_df.tail()\n'

In [None]:
def str2list(data):
    """Turn the text form of a synset array back into a list of entries.

    Square brackets, single quotes and newline characters are deleted from
    ``data``; the remaining text is split on single space characters.

    Args:
        data: string such as "['give_up' 'vacate' 'abandon']".

    Returns:
        list of str. Two consecutive spaces produce an empty-string entry,
        and a string with nothing between the brackets yields [''].
    """
    # One translation table deletes all four unwanted characters in one pass.
    unwanted = str.maketrans('', '', "[]'\n")
    return data.translate(unwanted).split(' ')

In [None]:
# Greedy single-pass grouping of the valid verbs into synset clusters.
#
# For every verb, in row order:
#   * look for existing clusters whose verb_list already contains the lemma;
#   * if there is none, open a new cluster titled with this lemma;
#   * otherwise join the cluster that shares the most synset entries with the
#     verb (the earliest such cluster on ties) and fold the verb's synset into
#     that cluster's verb_list.
#
# Cluster state lives in plain Python lists/sets while looping and cluster_df
# is built once at the end. Growing and boolean-filtering a DataFrame on every
# iteration made the run take minutes and relied on writing whole arrays into
# single cells through .loc.


def _as_object_column(items):
    """Pack arbitrary objects (here numpy arrays) into a 1-D object array, one per cell."""
    column = np.empty(len(items), dtype=object)
    # Element-wise assignment keeps each array intact as a single cell value.
    for pos, item in enumerate(items):
        column[pos] = item
    return column


# Work on an independent copy so that adding columns below never writes into
# a slice of df (the cause of SettingWithCopyWarning).
valid_df = valid_df.copy()

# Parallel per-cluster lists; the position in each list is the cluster_id.
cluster_titles = []        # lemma that opened the cluster
cluster_verb_lists = []    # synset entries of the cluster, first-seen order
cluster_verb_sets = []     # same entries as a set, for membership/overlap tests
cluster_members = []       # [(lemma, freq), ...] of the verbs put in the cluster
assigned_cluster_ids = []  # cluster chosen for each row of valid_df, in row order

verb_rows = zip(valid_df['Lemmatized Verb'], valid_df['Synset'], valid_df['Frequency'])
for lemma, synset_text, lemma_freq in verb_rows:
    lemma_synset = str2list(synset_text)
    lemma_synset_set = set(lemma_synset)
    lemma_freq_tuple = (lemma, lemma_freq)

    # Among the clusters that already list this lemma, pick the one with the
    # largest synset overlap.
    best_match_cid = None
    best_overlap = -1
    for c_idx, verb_set in enumerate(cluster_verb_sets):
        if lemma in verb_set:
            overlap = len(lemma_synset_set & verb_set)
            # Strict '>' keeps the earliest cluster when overlaps tie.
            if overlap > best_overlap:
                best_match_cid, best_overlap = c_idx, overlap

    if best_match_cid is None:
        # The lemma is not in any existing cluster: open a new one.
        best_match_cid = len(cluster_titles)
        cluster_titles.append(lemma)
        cluster_verb_lists.append(list(lemma_synset))
        cluster_verb_sets.append(lemma_synset_set)
        cluster_members.append([lemma_freq_tuple])
    else:
        # Merge the verb's synset into the chosen cluster. dict.fromkeys
        # removes duplicates while keeping first-seen order, so the stored
        # verb_list is the same on every run.
        merged_synset = list(dict.fromkeys(cluster_verb_lists[best_match_cid] + lemma_synset))
        cluster_verb_lists[best_match_cid] = merged_synset
        cluster_verb_sets[best_match_cid].update(lemma_synset)
        cluster_members[best_match_cid].append(lemma_freq_tuple)

    assigned_cluster_ids.append(best_match_cid)

# Record the chosen cluster per verb; cluster_title stays blank in this cell.
valid_df['cluster_id'] = assigned_cluster_ids
valid_df['cluster_title'] = ''

# Total number of clusters created (also the next unused cluster id).
cluster_id = len(cluster_titles)

# One row per cluster, index == cluster_id. verb_list and actual_data_list
# cells hold numpy arrays; np.array() on the (lemma, freq) pairs gives a 2-D
# array with one row per member verb.
cluster_df = pd.DataFrame({
    'cluster_id': list(range(cluster_id)),
    'cluster_title': cluster_titles,
    'verb_list': _as_object_column([np.array(verbs) for verbs in cluster_verb_lists]),
    'actual_data_list': _as_object_column([np.array(members) for members in cluster_members]),
})
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


### assigning cluster_title according to freq

In [None]:

# Give every cluster that holds more than one verb the lemma of its most
# frequent member as title. Single-verb clusters keep their existing title.
cluster_rows = list(zip(cluster_df['cluster_id'], cluster_df['actual_data_list']))
for c_id, member_array in cluster_rows:
    members = member_array.tolist()
    if len(members) <= 1:
        continue

    lemmas = [lemma for lemma, _ in members]
    counts = [int(freq) for _, freq in members]

    # list.index returns the first position of the maximum, so when several
    # members share the top frequency the earliest one is used.
    top_pos = counts.index(max(counts))
    cluster_df.loc[c_id, 'cluster_title'] = lemmas[top_pos]

In [None]:
def _first_column(member_array):
    """Return column 0 of a cluster's 2-D actual_data_list array (the member lemmas)."""
    return member_array[:, 0]


cluster_df['member_lemma'] = cluster_df['actual_data_list'].apply(_first_column)

In [None]:
# Preview the first 20 clusters.
cluster_df.head(20)

Unnamed: 0,cluster_id,cluster_title,verb_list,actual_data_list,member_lemma
0,0,abandon,"[give_up, vacate, abandon, forsake, desert, em...","[[abandon, 4]]",[abandon]
1,1,abate,"[let_up, slake, die_away, slack, slack_off, ab...","[[abate, 3]]",[abate]
2,2,contract,"[get, urge_on, peg_down, subscribe, determine,...","[[abbreviate, 1], [abridge, 6], [contract, 55]...","[abbreviate, abridge, contract, narrow, press,..."
3,3,induce,"[furnish, get, substantiate, weather, verbalis...","[[abide, 1], [bear, 79], [behave, 2], [birth, ...","[abide, bear, behave, birth, carry, conduct, c..."
4,4,ablate,[ablate],"[[ablate, 1]]",[ablate]
5,5,abolish,"[abolish, get_rid_of]","[[abolish, 7]]",[abolish]
6,6,abort,[abort],"[[abort, 2]]",[abort]
7,7,abrogate,[abrogate],"[[abrogate, 11]]",[abrogate]
8,8,reverse,"[filch, get_up, wind, annul, turn_back, hook, ...","[[abstract, 8], [lift, 1], [reverse, 30]]","[abstract, lift, reverse]"
9,9,accelerate,"[quicken, speed_up, accelerate, speed]","[[accelerate, 66]]",[accelerate]


In [None]:
# cluster_df has one row per cluster, so its length is the number of clusters.
print("total clusters: ", len(cluster_df))

total clusters:  717


In [None]:
# Save the cluster table to the working directory. The verb_list,
# actual_data_list and member_lemma cells hold numpy arrays, so they are
# written out as text.
cluster_df.to_csv('preprocessed_topic4_cluster_matrix.csv')

In [None]:
# Report the elapsed wall-clock time since start_time was taken, in minutes.
end_time = time.time()
elapsed_minutes = (end_time - start_time) / 60

print('Estimate runtime: ', elapsed_minutes, ' minutes.')

Estimate runtime:  5.733248972892762  minutes.


In [None]:
# Estimate runtime:  1.5818639238675436  minutes.

In [None]:
# all
# Estimate runtime:  5.733248972892762  minutes.