In [122]:
import pandas as pd
import numpy as np
import plotly.express as px
import re
import matplotlib.pyplot as plt
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package punkt to /home/riya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/riya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/riya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [123]:
# Define a custom function to remove word before 1st space and excess white spaces
def process_string(input_string):
    # Find the index of the first space
    first_space_index = input_string.find(' ')
    
    # Remove the word before the first space
    if first_space_index != -1:
        input_string = input_string[first_space_index+1:]
    
    # Remove excess white spaces
    input_string = ' '.join(input_string.split())
    
    return input_string

In [124]:

# Creating our tokenizer function
def cleaning(sentence):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    wn = nltk.WordNetLemmatizer()

    cleaned = []
    for i in range(len(sentence)):
        letters = re.sub('[^a-zA-Z]', ' ', sentence[i])
        letters = letters.lower().split() 
        letters = [wn.lemmatize(w) for w in letters if w not in stopwords ]
        letters_list = list(set(letters))
        cleaned.append(letters_list)

    return cleaned

In [125]:

# Creating our tokenizer function
def concat_cleaning(sentence):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    wn = nltk.WordNetLemmatizer()

    cleaned = []
    for i in range(len(sentence)):
        letters = re.sub('[^a-zA-Z]', ' ', sentence[i])
        letters = letters.lower().split() 
        letters = [wn.lemmatize(w) for w in letters if w not in stopwords ]
        
        letters = ' '.join(letters)
        cleaned.append(letters)

    return cleaned

In [126]:
def concatenate_table_name(df):
    for index, row in df.iterrows():
        if ',' not in row['Description']:
            df.at[index, 'Description'] += ', ' + row['Table_name']
    return df


In [127]:
def count_common_words(row, df_other):
    set_1 = set(row)
    common_words = []
    for _, other_row in df_other.iterrows():
        set_2 = set(other_row['new_description'])
        common_words.append(len(set_1.intersection(set_2)))
    return common_words

In [128]:
def find_similarity_sdg(similarity_matrix, threshold, chapter_df, sdg_df):
    top_n_similarities = []
    top_n_attr = []
    top_n_chapter_id = []
    top_n_chapter_name = []
    top_n_table_id = []
    top_n_table_name = []
    top_n_count = []
    top_n_description = []
    
    n = threshold
    for index, row in similarity_matrix.iterrows():
        top_n_indices = sorted(range(len(row)), key=lambda i: row.values[i], reverse=True)[:n]
        
        # Get top n values, chapters, and chapter names
        values = [row.iloc[i] for i in top_n_indices]
        attr = [similarity_matrix.columns[i] for i in top_n_indices]
        attr_des = [chapter_df['Description'].loc[i] for i in top_n_indices]
        chapter_id = [chapter_df['Chapter_id'].loc[i] for i in top_n_indices]
        chapter_name = [chapter_df['Chapter_name'].loc[i] for i in top_n_indices]   
        table_id = [chapter_df['Table_id'].loc[i] for i in top_n_indices]
        table_name = [chapter_df['Table_name'].loc[i] for i in top_n_indices]   


        # Append to respective lists
        top_n_count.append(len(attr))
        top_n_similarities.append(values)
        top_n_attr.append(attr)
        top_n_description.append(attr_des)
        top_n_chapter_id.append(chapter_id)
        top_n_chapter_name.append(chapter_name)
        top_n_table_id.append(table_id)
        top_n_table_name.append(table_name)
    
    # Assign lists to the SDG DataFrame
    sdg_df['top_n_count'] = top_n_count
    sdg_df['top_n_attr'] = top_n_attr
    sdg_df['top_n_table_id'] = top_n_table_id
    sdg_df['top_n_table_name'] = top_n_table_name
    sdg_df['top_n_chapter_id'] = top_n_chapter_id
    sdg_df['top_n_chapter_name'] = top_n_chapter_name
    sdg_df['top_n_similarities'] = top_n_similarities
    sdg_df['top_n_description'] = top_n_description

    return sdg_df

In [129]:
def find_similarity_above_threshold(similarity_matrix, threshold, chapter_df, sdg_df):
    top_n_similarities = []
    top_n_attr = []
    top_n_chapter_id = []
    top_n_chapter_name = []
    top_n_table_id = []
    top_n_table_name = []
    top_n_count = []
    top_n_description = []
    
    for index, row in similarity_matrix.iterrows():
        # Filter indices based on threshold value
        top_n_indices = [i for i, value in enumerate(row) if value >= threshold]
        
        # Sort indices based on similarity values
        top_n_indices = sorted(top_n_indices, key=lambda i: row.iloc[i], reverse=True)
        
        # Get top n values, chapters, and chapter names
        values = [row.iloc[i] for i in top_n_indices]
        attr = [similarity_matrix.columns[i] for i in top_n_indices]
        attr_des = [chapter_df['Description'].loc[i] for i in top_n_indices]
        chapter_id = [chapter_df['Chapter_id'].loc[i] for i in top_n_indices]
        chapter_name = [chapter_df['Chapter_name'].loc[i] for i in top_n_indices]   
        table_id = [chapter_df['Table_id'].loc[i] for i in top_n_indices]
        table_name = [chapter_df['Table_name'].loc[i] for i in top_n_indices]   


        # Append to respective lists
        top_n_count.append(len(attr))
        top_n_similarities.append(values)
        top_n_attr.append(attr)
        top_n_description.append(attr_des)
        top_n_chapter_id.append(chapter_id)
        top_n_chapter_name.append(chapter_name)
        top_n_table_id.append(table_id)
        top_n_table_name.append(table_name)
    
    # Assign lists to the SDG DataFrame
    sdg_df['top_n_count'] = top_n_count
    sdg_df['top_n_attr'] = top_n_attr
    sdg_df['top_n_table_id'] = top_n_table_id
    sdg_df['top_n_table_name'] = top_n_table_name
    sdg_df['top_n_chapter_id'] = top_n_chapter_id
    sdg_df['top_n_chapter_name'] = top_n_chapter_name
    sdg_df['top_n_similarities'] = top_n_similarities
    sdg_df['top_n_description'] = top_n_description

    return sdg_df


In [130]:
def process_result(result_sdg, sdg_df):
    # Create a dictionary to store chapter details
    chapter_details = {}

    # Iterate over the rows of the SDG DataFrame
    for index, row in result_sdg.iterrows():
        attr_id = row['top_n_attr']
        chapter_id = row['top_n_chapter_id']
        table_id = row['top_n_table_id']
        values = row['top_n_similarities']
        attr_desc = row['top_n_description']
        chapter_name = row['top_n_chapter_name']
        table_name = row['top_n_table_name']
        goal_no = row['Goal No.']
        target_id = row['Target_id']
        indicator_id = row['Indicator_id']
        for ids in range(len(attr_id)):
            if attr_id[ids] not in chapter_details:
                chapter_details[attr_id[ids]] = {
                    'Attr_id' : attr_id[ids],
                    'Chapter_id': chapter_id[ids],
                    'Chapter_name': chapter_name[ids],
                    'Table_id': table_id[ids],
                    'Table_name': table_name[ids],
                    'Description': attr_desc[ids],
                    'top_n_goal_id': [goal_no],
                    'top_n_target_id': [target_id],
                    'top_n_indicator_id':[indicator_id],
                    'top_n_similarities': [values[ids]]
                }
            else:
                g_id = chapter_details[attr_id[ids]].get('top_n_goal_id')
                t_id = chapter_details[attr_id[ids]].get('top_n_target_id')
                i_id = chapter_details[attr_id[ids]].get('top_n_indicator_id')
                val = chapter_details[attr_id[ids]].get('top_n_similarities')
                g_id.append(goal_no)
                t_id.append(target_id)
                i_id.append(indicator_id)
                val.append(values[ids])
                chapter_details[attr_id[ids]].update({'top_n_goal_id': g_id})
                chapter_details[attr_id[ids]].update({'top_n_target_id': t_id})
                chapter_details[attr_id[ids]].update({'top_n_indicator_id': i_id})
                chapter_details[attr_id[ids]].update({'top_n_similarities': val})

    # Convert dictionary to list of dictionaries
    rows = []
    for a_id, values in chapter_details.items():
        row = {
            'Attr_id': a_id,
            'Chapter_id': values['Chapter_id'],
            'Chapter_name': values['Chapter_name'],
            'Table_id': values['Table_id'],
            'Table_name': values['Table_name'],
            'Description': values['Description'],
            'top_n_goal_id': values['top_n_goal_id'],
            'top_n_target_id': values['top_n_target_id'],
            'top_n_indicator_id': values['top_n_indicator_id'],
            'top_n_similarities': values['top_n_similarities']
        }
        rows.append(row)

    # Create DataFrame
    goals = pd.DataFrame(rows)
    
    # Convert dictionary to list of dictionaries
    all_rows = []
    for a_id, values in chapter_details.items():
        for i in range(len(values['top_n_indicator_id'])):
            row = {
                'Attr_id': values['Attr_id'],
                'Chapter_id': values['Chapter_id'],
                'Chapter_name': values['Chapter_name'],
                'Table_id': values['Table_id'],
                'Table_name': values['Table_name'],
                'Description': values['Description'],
                'top_n_goal_id': values['top_n_goal_id'][i],
                'top_n_target_id': values['top_n_target_id'][i],
                'top_n_indicator_id': values['top_n_indicator_id'][i],
                'top_n_similarities': values['top_n_similarities'][i]}
            all_rows.append(row)
            
    # Create DataFrame
    all_goals = pd.DataFrame(all_rows)
    goal_name = dict(zip(sdg_df['Goal No.'], sdg_df['Goal']))
    target_name = dict(zip(sdg_df['Target_id'], sdg_df['Targets']))
    indicator_name = dict(zip(sdg_df['Indicator_id'], sdg_df['Tentative Indicators']))
    all_goals['Goal'] = all_goals['top_n_goal_id'].map(goal_name.get)
    all_goals['Targets'] = all_goals['top_n_target_id'].map(target_name.get)
    all_goals['Tentative Indicators'] = all_goals['top_n_indicator_id'].map(indicator_name.get)
    all_goals = all_goals.loc[:,['Attr_id','Chapter_id', 'Chapter_name', 'Table_id', 'Table_name', 'Description', 'Goal', 'Targets', 'Tentative Indicators', 'top_n_goal_id', 'top_n_target_id', 'top_n_indicator_id', 'top_n_similarities']]
    
    return goals, all_goals


In [131]:
sdg = pd.read_csv("sdg_data_excel.csv")

In [132]:
sdg

Unnamed: 0,Goal No.,Goal,Nodal Department,Targets,Other Related Major Departments,Tentative Indicators,Target_id,Indicator_id
0,1,End poverty in all its forms everywhere,Rural Development,"1.1 By 2030, eradicate extreme poverty for ...","Urban Development, Agriculture, Horticulture, ...",1.1.1 Proportion of the population below...,1.1,1.1.1
1,1,End poverty in all its forms everywhere,Rural Development,"1.2 By 2030, reduce at least by ...","Urban Development, Agriculture, Horticulture, ...",1.2.1 Proportion of the population livin...,1.2,1.2.1
2,1,End poverty in all its forms everywhere,Rural Development,"1.2 By 2030, reduce at least by ...","Urban Development, Agriculture, Horticulture, ...","1.2.2 Proportion of men, women and...",1.2,1.2.2
3,1,End poverty in all its forms everywhere,Rural Development,1.3 Implement nationally appropriate so...,"Urban Development, Agriculture, Horticulture, ...",1.3.1 Percentage of the populati...,1.3,1.3.1
4,1,End poverty in all its forms everywhere,Rural Development,"1.4 By 2030, ensure that all men and women,...","Urban Development, Agriculture, Horticulture, ...",1.4.1 Proportion of the population living in ...,1.4,1.4.1
...,...,...,...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,Finance,"16.4 Promote the development, tran...",Environment and Scientific Technology.,16.4.1 Total amount of approved funding t...,16.4,16.4.1
165,16,Strengthen the means of implementation and rev...,Finance,16.5 Fully operationalize the technology ...,"Environment and Scientific Technology, Inform...",16.5.1 Proportion of individuals using th...,16.5,16.5.1
166,17,"Data, monitoring and accountability",Finance,"17.1 By 2020, enhance capacity-buil...","Planning, Finance, Economic and Statistics.",17.1.1 Proportion of sustain...,17.1,17.1.1
167,17,"Data, monitoring and accountability",Finance,"17.2 By 2030, build on existing initiati...","Planning, Finance, Economic and Statistics.",17.2.1 Dollar value of all resources made avai...,17.2,17.2.1


In [133]:
sdg.columns

Index(['Goal No.', 'Goal', 'Nodal Department', 'Targets',
       'Other Related Major Departments', 'Tentative Indicators', 'Target_id',
       'Indicator_id'],
      dtype='object')

In [134]:
sdg.drop(['Nodal Department',
       'Other Related Major Departments'],axis=1,inplace=True)

In [135]:
# Remove duplicate rows
sdg = sdg.drop_duplicates()

In [136]:
sdg.reset_index(inplace = True, drop = True)

In [137]:
sdg

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id,Indicator_id
0,1,End poverty in all its forms everywhere,"1.1 By 2030, eradicate extreme poverty for ...",1.1.1 Proportion of the population below...,1.1,1.1.1
1,1,End poverty in all its forms everywhere,"1.2 By 2030, reduce at least by ...",1.2.1 Proportion of the population livin...,1.2,1.2.1
2,1,End poverty in all its forms everywhere,"1.2 By 2030, reduce at least by ...","1.2.2 Proportion of men, women and...",1.2,1.2.2
3,1,End poverty in all its forms everywhere,1.3 Implement nationally appropriate so...,1.3.1 Percentage of the populati...,1.3,1.3.1
4,1,End poverty in all its forms everywhere,"1.4 By 2030, ensure that all men and women,...",1.4.1 Proportion of the population living in ...,1.4,1.4.1
...,...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,"16.4 Promote the development, tran...",16.4.1 Total amount of approved funding t...,16.4,16.4.1
165,16,Strengthen the means of implementation and rev...,16.5 Fully operationalize the technology ...,16.5.1 Proportion of individuals using th...,16.5,16.5.1
166,17,"Data, monitoring and accountability","17.1 By 2020, enhance capacity-buil...",17.1.1 Proportion of sustain...,17.1,17.1.1
167,17,"Data, monitoring and accountability","17.2 By 2030, build on existing initiati...",17.2.1 Dollar value of all resources made avai...,17.2,17.2.1


In [138]:
# Apply the custom function to each cell in the 'Column' column
sdg['Tentative Indicators'] = sdg['Tentative Indicators'].apply(process_string)
sdg['Targets'] = sdg['Targets'].apply(process_string) 
sdg

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id,Indicator_id
0,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1.1,1.1.1
1,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1.2,1.2.1
2,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...","Proportion of men, women and children of all a...",1.2,1.2.2
3,1,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1.3,1.3.1
4,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1.4,1.4.1
...,...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",Total amount of approved funding to promote th...,16.4,16.4.1
165,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,Proportion of individuals using the Internet.,16.5,16.5.1
166,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",Proportion of sustainable development indicato...,17.1,17.1.1
167,17,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Dollar value of all resources made available t...,17.2,17.2.1


In [139]:
sdg_new = sdg.copy()

In [140]:
#sdg_new = sdg.groupby(['Goal No.', 'Goal', 'Targets',  'Target_id'])['Tentative Indicators'].apply(lambda x: ', '.join(x)).reset_index()
#sdg_new

In [141]:
#sdg_new['concat_value'] = sdg_new['Targets'] + ' ' + sdg_new['Tentative Indicators']
#sdg_new.drop(['Tentative Indicators'], axis=1,inplace=True)

#sdg_new.rename(columns = {'concat_value':'Tentative Indicators'}, inplace = True) 
#sdg_new

In [142]:
texto_data=sdg_new['Tentative Indicators'].tolist()
process_text=cleaning(texto_data)

cleaned_sent = pd.DataFrame({'new_indicators': process_text})
sdg_new['new_indicators']=cleaned_sent

In [143]:
texto_data0=sdg_new['Tentative Indicators'].tolist()
process_text0=concat_cleaning(texto_data0)

cleaned_sent0 = pd.DataFrame({'new_indicators_str': process_text0})
sdg_new['new_indicators_str']=cleaned_sent0

In [144]:
type(sdg_new['new_indicators'].loc[0])

list

In [145]:
sdg_new

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id,Indicator_id,new_indicators,new_indicators_str
0,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1.1,1.1.1,"[age, urban, international, population, employ...",proportion population international poverty li...
1,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1.2,1.2.1,"[age, population, sex, living, proportion, dis...",proportion population living national poverty ...
2,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...","Proportion of men, women and children of all a...",1.2,1.2.2,"[child, age, dimension, men, according, living...",proportion men woman child age living poverty ...
3,1,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1.3,1.3.1,"[population, disability, percentage, covered, ...",percentage population covered social protectio...
4,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1.4,1.4.1,"[population, living, access, proportion, servi...",proportion population living household access ...
...,...,...,...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",Total amount of approved funding to promote th...,16.4,16.4.1,"[approved, sound, development, dissemination, ...",total amount approved funding promote developm...
165,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,Proportion of individuals using the Internet.,16.5,16.5.1,"[internet, using, proportion, individual]",proportion individual using internet
166,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",Proportion of sustainable development indicato...,17.1,17.1.1,"[disaggregation, state, development, principle...",proportion sustainable development indicator p...
167,17,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Dollar value of all resources made available t...,17.2,17.2.1,"[available, statistical, strengthen, made, val...",dollar value resource made available strengthe...


In [146]:
data = pd.read_csv("Attributes_2016-17.csv", sep=";")

In [147]:
data.columns

Index(['Attr_id', 'Chapter_id', 'Chapter_name', 'Table_id', 'Table_name',
       'Description'],
      dtype='object')

In [148]:
#data.drop(['Attr_id', 'Table_id'],axis=1,inplace=True)

In [149]:
data

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description
0,3.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,Nada Offices
1,4.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,Va Circles
2,5.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,Hoblies
3,6.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,Grama Panchayaths
4,7.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,No.of Taluks
...,...,...,...,...,...,...
1162,1169.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Total,Number,Total Agrl. Land Holder Total"
1163,1170.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Male,Area,Total Agrl. Land Holder Total"
1164,1171.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Female,Area,Total Agrl. Land Holder Total"
1165,1172.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Institutions,Area,Total Agrl. Land Holder Total"


In [150]:
data_new = concatenate_table_name(data)
data_new

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description
0,3.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"Nada Offices, Nada Offices Village Accountant ..."
1,4.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"Va Circles, Nada Offices Village Accountant Ci..."
2,5.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"Hoblies, Nada Offices Village Accountant Circl..."
3,6.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"Grama Panchayaths, Nada Offices Village Accoun..."
4,7.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"No.of Taluks, Nada Offices Village Accountant ..."
...,...,...,...,...,...,...
1162,1169.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Total,Number,Total Agrl. Land Holder Total"
1163,1170.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Male,Area,Total Agrl. Land Holder Total"
1164,1171.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Female,Area,Total Agrl. Land Holder Total"
1165,1172.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Institutions,Area,Total Agrl. Land Holder Total"


In [151]:
#data_new['concat_value'] = data_new['Table_name'] + ' ' + data_new['Description']

# Create a new DataFrame with selected columns
new_df = data_new.copy()
new_df

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description
0,3.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"Nada Offices, Nada Offices Village Accountant ..."
1,4.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"Va Circles, Nada Offices Village Accountant Ci..."
2,5.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"Hoblies, Nada Offices Village Accountant Circl..."
3,6.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"Grama Panchayaths, Nada Offices Village Accoun..."
4,7.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"No.of Taluks, Nada Offices Village Accountant ..."
...,...,...,...,...,...,...
1162,1169.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Total,Number,Total Agrl. Land Holder Total"
1163,1170.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Male,Area,Total Agrl. Land Holder Total"
1164,1171.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Female,Area,Total Agrl. Land Holder Total"
1165,1172.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Institutions,Area,Total Agrl. Land Holder Total"


In [152]:
texto_data1=new_df['Description'].tolist()
process_text1=cleaning(texto_data1)

cleaned_sent1 = pd.DataFrame({'new_description': process_text1})
new_df['new_description']=cleaned_sent1

In [153]:
texto_data11=new_df['Description'].tolist()
process_text11=concat_cleaning(texto_data11)

cleaned_sent11 = pd.DataFrame({'new_description_str': process_text11})
new_df['new_description_str']=cleaned_sent11

In [154]:
new_df

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description,new_description,new_description_str
0,3.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"Nada Offices, Nada Offices Village Accountant ...","[circle, hoblies, taluks, grama, accountant, p...",nada office nada office village accountant cir...
1,4.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"Va Circles, Nada Offices Village Accountant Ci...","[circle, va, hoblies, taluks, grama, accountan...",va circle nada office village accountant circl...
2,5.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"Hoblies, Nada Offices Village Accountant Circl...","[circle, hoblies, taluks, grama, accountant, p...",hoblies nada office village accountant circle ...
3,6.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"Grama Panchayaths, Nada Offices Village Accoun...","[circle, hoblies, taluks, grama, accountant, p...",grama panchayaths nada office village accounta...
4,7.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"No.of Taluks, Nada Offices Village Accountant ...","[circle, hoblies, taluks, grama, accountant, p...",taluks nada office village accountant circle h...
...,...,...,...,...,...,...,...,...
1162,1169.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Total,Number,Total Agrl. Land Holder Total","[total, land, agrl, holder, number]",total number total agrl land holder total
1163,1170.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Male,Area,Total Agrl. Land Holder Total","[area, total, land, agrl, holder, male]",male area total agrl land holder total
1164,1171.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Female,Area,Total Agrl. Land Holder Total","[area, female, total, land, agrl, holder]",female area total agrl land holder total
1165,1172.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Institutions,Area,Total Agrl. Land Holder Total","[area, total, institution, land, agrl, holder]",institution area total agrl land holder total


In [155]:
new_df.loc[55]

Attr_id                                                             58.0
Chapter_id                                                             2
Chapter_name                                         Area and Population
Table_id                                                             2.1
Table_name             Population and percentage share to total Popul...
Description            Geographical Area Sq.Kms, Population and perce...
new_description        [area, population, sq, total, km, percentage, ...
new_description_str    geographical area sq km population percentage ...
Name: 55, dtype: object

In [156]:
cleaned_sent

Unnamed: 0,new_indicators
0,"[age, urban, international, population, employ..."
1,"[age, population, sex, living, proportion, dis..."
2,"[child, age, dimension, men, according, living..."
3,"[population, disability, percentage, covered, ..."
4,"[population, living, access, proportion, servi..."
...,...
164,"[approved, sound, development, dissemination, ..."
165,"[internet, using, proportion, individual]"
166,"[disaggregation, state, development, principle..."
167,"[available, statistical, strengthen, made, val..."


In [157]:
common_word_matrix = sdg_new['new_indicators'].apply(count_common_words, args=(new_df,))
common_word_matrix

0      [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...
1      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                             ...                        
164    [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, ...
165    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
166    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
167    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
168    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: new_indicators, Length: 169, dtype: object

In [158]:
type(common_word_matrix)

pandas.core.series.Series

In [159]:
attr_id = new_df['Attr_id'].tolist()
ind_id = sdg_new['Indicator_id'].tolist()

In [160]:
# Determine the maximum number of words in any list
max_words = max(common_word_matrix, key=lambda x: len(x))

# Create DataFrame with columns Word_1, Word_2, ..., Word_n
common_word_df = pd.DataFrame(common_word_matrix.tolist(), columns=attr_id, index=ind_id)

In [161]:
common_word_df

Unnamed: 0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,...,1164.0,1165.0,1166.0,1167.0,1168.0,1169.0,1170.0,1171.0,1172.0,1173.0
1.1.1,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1.2.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1.2.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1.3.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1.4.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16.4.1,0,0,0,0,0,1,0,0,1,0,...,1,1,1,1,1,1,1,1,1,1
16.5.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17.1.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17.2.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [162]:
# Example usage:
threshold_value = 1  # Define the threshold value

# Call the function to find chapter_ids with similarity greater than the threshold
result_sdg = find_similarity_above_threshold(common_word_df, threshold_value, new_df, sdg_new)


In [163]:
result_sdg

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id,Indicator_id,new_indicators,new_indicators_str,top_n_count,top_n_attr,top_n_table_id,top_n_table_name,top_n_chapter_id,top_n_chapter_name,top_n_similarities,top_n_description
0,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1.1,1.1.1,"[age, urban, international, population, employ...",proportion population international poverty li...,140,"[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77....","[2.1, 2.2, 2.2, 2.3, 2.3, 2.3, 2.3, 2.4, 2.4, ...",[Population and percentage share to total Popu...,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[Area and Population, Area and Population, Are...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[Geographical Area Sq.Kms, Population and perc..."
1,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1.2,1.2.1,"[age, population, sex, living, proportion, dis...",proportion population living national poverty ...,70,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[10.8, 10.8, 10.8, 10.8, 10.8, 10.8, 17.2, 2.1...",[No .of Children in the age group 6 to 14 year...,"[10, 10, 10, 10, 10, 10, 17, 2, 2, 2, 2, 2, 2,...","[Education, Education, Education, Education, E...","[2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...","[Boys,No.of Children Enrolment age group betwe..."
2,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...","Proportion of men, women and children of all a...",1.2,1.2.2,"[child, age, dimension, men, according, living...",proportion men woman child age living poverty ...,41,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[10.8, 10.8, 10.8, 10.8, 10.8, 10.8, 17.2, 2.4...",[No .of Children in the age group 6 to 14 year...,"[10, 10, 10, 10, 10, 10, 17, 2, 2, 2, 2, 2, 2,...","[Education, Education, Education, Education, E...","[2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...","[Boys,No.of Children Enrolment age group betwe..."
3,1,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1.3,1.3.1,"[population, disability, percentage, covered, ...",percentage population covered social protectio...,84,"[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7...","[17.2, 17.2, 2.1, 2.1, 2.1, 2.4, 2.4, 2.4, 2.4...",[No.of Pensioners under Social Schemes As on 3...,"[17, 17, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...","[Miscellaneous, Miscellaneous, Area and Popula...","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[Old Age Pensioners, No.of Pensioners under So..."
4,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1.4,1.4.1,"[population, living, access, proportion, servi...",proportion population living household access ...,62,"[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65....","[2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, ...",[Population and percentage share to total Popu...,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[Area and Population, Area and Population, Are...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[Geographical Area Sq.Kms, Population and perc..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",Total amount of approved funding to promote th...,16.4,16.4.1,"[approved, sound, development, dissemination, ...",total amount approved funding promote developm...,303,"[8.0, 11.0, 17.0, 30.0, 33.0, 36.0, 39.0, 42.0...","[1.2, 1.2, 1.3, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, ...",[No.of Taluks Inhabited Un-inhabited and Total...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, ...","[General Information, General Information, Gen...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[Cities Towns Urban, No.of Taluks Inhabited Un..."
165,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,Proportion of individuals using the Internet.,16.5,16.5.1,"[internet, using, proportion, individual]",proportion individual using internet,2,"[533.0, 872.0]","[9.4, 14.2]",[Post Office Telephones Exchanges Telephones. ...,"[9, 14]","[TRANSPORT AND COMMUNICATION, Rural Developmen...","[1, 1]","[Internet Connections,Communication In Numbers..."
166,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",Proportion of sustainable development indicato...,17.1,17.1.1,"[disaggregation, state, development, principle...",proportion sustainable development indicator p...,8,"[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874...","[4.18, 5.4, 6.5, 9.2, 14.2, 14.2, 14.2, 14.2]","[Sericulture Year 2016-17 in Nos., Fisheries Y...","[4, 5, 6, 9, 14, 14, 14, 14]","[Agriculture, Horticulture and Sericulture, An...","[1, 1, 1, 1, 1, 1, 1, 1]","[Value of Silk Produced Rs. in lakhs, Sericult..."
167,17,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Dollar value of all resources made available t...,17.2,17.2.1,"[available, statistical, strengthen, made, val...",dollar value resource made available strengthe...,4,"[27.0, 339.0, 384.0, 386.0]","[1.5, 4.18, 5.4, 5.4]","[No.of Fire Stations on In Nos, Sericulture Ye...","[1, 4, 5, 5]","[General Information, Agriculture, Horticultur...","[1, 1, 1, 1]","[Value of the property protected Rs.in Crores,..."


In [164]:
goals, all_goals = process_result(result_sdg, sdg_new)

In [165]:
all_goals

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description,Goal,Targets,Tentative Indicators,top_n_goal_id,top_n_target_id,top_n_indicator_id,top_n_similarities
0,58.0,2,Area and Population,2.1,Population and percentage share to total Popul...,"Geographical Area Sq.Kms, Population and perce...",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1,2
1,58.0,2,Area and Population,2.1,Population and percentage share to total Popul...,"Geographical Area Sq.Kms, Population and perce...",End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1,1.2,1.2.1,1
2,58.0,2,Area and Population,2.1,Population and percentage share to total Popul...,"Geographical Area Sq.Kms, Population and perce...",End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1,1.3,1.3.1,2
3,58.0,2,Area and Population,2.1,Population and percentage share to total Popul...,"Geographical Area Sq.Kms, Population and perce...",End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1,1.4,1.4.1,1
4,58.0,2,Area and Population,2.1,Population and percentage share to total Popul...,"Geographical Area Sq.Kms, Population and perce...","End hunger, achieve food security and improved...","By 2030, end hunger and ensure access by all p...",Prevalence of moderate or severe food insecuri...,2,2.1,\n2.1.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19236,21.0,1,General Information,1.4,Cinema Theatres Police Station Prisons and Pri...,"Prisons,Prisons and Prisoners 31-3-2017",Promote peaceful and inclusive societies for s...,Promote the rule of law and ensure equal acces...,Unsentenced detainees as a percentage of overa...,15,15.3,15.3.2,1
19237,660.0,11,Health & Family welfare Services,11.2,No.of Hospitals Doctors Beds As on 31-3-2017 I...,"Govt.,No.of Hospitals",Promote peaceful and inclusive societies for s...,Ensure public access to information and protec...,Percentage of Govt. Departments / Organization...,15,15.9,15.9.2,1
19238,663.0,11,Health & Family welfare Services,11.2,No.of Hospitals Doctors Beds As on 31-3-2017 I...,"No.of Govt. Doctors, No.of Hospitals Doctors B...",Promote peaceful and inclusive societies for s...,Ensure public access to information and protec...,Percentage of Govt. Departments / Organization...,15,15.9,15.9.2,1
19239,664.0,11,Health & Family welfare Services,11.2,No.of Hospitals Doctors Beds As on 31-3-2017 I...,"No.of beds in Govt. Hospitals, No.of Hospitals...",Promote peaceful and inclusive societies for s...,Ensure public access to information and protec...,Percentage of Govt. Departments / Organization...,15,15.9,15.9.2,1


In [166]:
goals

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description,top_n_goal_id,top_n_target_id,top_n_indicator_id,top_n_similarities
0,58.0,2,Area and Population,2.1,Population and percentage share to total Popul...,"Geographical Area Sq.Kms, Population and perce...","[1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, ...","[1.1, 1.2, 1.3, 1.4, 2.1, 2.3, 2.4, 2.4, 2.5, ...","[1.1.1, 1.2.1, 1.3.1, 1.4.1, \n2.1.2, 2.3.2, 2...","[2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,65.0,2,Area and Population,2.2,Rural and Urban Population and Decadal change ...,"Rural,Population 2011","[1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 4, 4, 5, 6, 6, ...","[1.1, 1.2, 1.3, 1.4, 2.1, 3.3, 3.3, 3.5, 3.7, ...","[1.1.1, 1.2.1, 1.3.1, 1.4.1, \n2.1.2, 3.3.1, 3...","[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,66.0,2,Area and Population,2.2,Rural and Urban Population and Decadal change ...,"Urban,Population 2011","[1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 4, 4, 5, 6, 6, ...","[1.1, 1.2, 1.3, 1.4, 2.1, 3.3, 3.3, 3.5, 3.7, ...","[1.1.1, 1.2.1, 1.3.1, 1.4.1, \n2.1.2, 3.3.1, 3...","[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,71.0,2,Area and Population,2.3,Ratio No.of Females per 1000 Males 2001- 2011,"Rural,Sex Ratio 2001","[1, 1, 1, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, ...","[1.1, 1.2, 1.3, 3.3, 4.1, 4.2, 4.5, 4.6, 4.7, ...","[1.1.1, 1.2.1, 1.3.1, 3.3.1, 4.1.1, 4.2.1, 4.5...","[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,72.0,2,Area and Population,2.3,Ratio No.of Females per 1000 Males 2001- 2011,"Urban,Sex Ratio 2001","[1, 1, 1, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, ...","[1.1, 1.2, 1.3, 3.3, 4.1, 4.2, 4.5, 4.6, 4.7, ...","[1.1.1, 1.2.1, 1.3.1, 3.3.1, 4.1.1, 4.2.1, 4.5...","[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...,...,...,...,...,...
1068,21.0,1,General Information,1.4,Cinema Theatres Police Station Prisons and Pri...,"Prisons,Prisons and Prisoners 31-3-2017",[15],[15.3],[15.3.2],[1]
1069,660.0,11,Health & Family welfare Services,11.2,No.of Hospitals Doctors Beds As on 31-3-2017 I...,"Govt.,No.of Hospitals",[15],[15.9],[15.9.2],[1]
1070,663.0,11,Health & Family welfare Services,11.2,No.of Hospitals Doctors Beds As on 31-3-2017 I...,"No.of Govt. Doctors, No.of Hospitals Doctors B...",[15],[15.9],[15.9.2],[1]
1071,664.0,11,Health & Family welfare Services,11.2,No.of Hospitals Doctors Beds As on 31-3-2017 I...,"No.of beds in Govt. Hospitals, No.of Hospitals...",[15],[15.9],[15.9.2],[1]


In [167]:
result_sdg.to_csv("Att_Ind_String.csv", sep=";", index=False)

In [168]:
all_goals.to_csv("Att_Ind_String_goals.csv", sep=";", index=False)

In [169]:
cleaned_sent0['new_indicators_str'].tail()

164    total amount approved funding promote developm...
165                 proportion individual using internet
166    proportion sustainable development indicator p...
167    dollar value resource made available strengthe...
168                               inclusive wealth index
Name: new_indicators_str, dtype: object

In [170]:
cleaned_sent11['new_description_str'].head()

0    nada office nada office village accountant cir...
1    va circle nada office village accountant circl...
2    hoblies nada office village accountant circle ...
3    grama panchayaths nada office village accounta...
4    taluks nada office village accountant circle h...
Name: new_description_str, dtype: object

In [171]:
all_text = pd.DataFrame(pd.concat([cleaned_sent0['new_indicators_str'], cleaned_sent11['new_description_str']]))

In [172]:
all_text = all_text.reset_index(drop=True)

In [173]:
all_text

Unnamed: 0,0
0,proportion population international poverty li...
1,proportion population living national poverty ...
2,proportion men woman child age living poverty ...
3,percentage population covered social protectio...
4,proportion population living household access ...
...,...
1331,total number total agrl land holder total
1332,male area total agrl land holder total
1333,female area total agrl land holder total
1334,institution area total agrl land holder total


In [174]:
# Create a TfidfVectorizer instance
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data from df_1 and df_2
tfidf_matrix_1 = tfidf_vectorizer.fit_transform(all_text[0])
tfidf_matrix_2 = tfidf_vectorizer.transform(new_df['new_description_str'])
print(tfidf_matrix_1.shape)
print(tfidf_matrix_2.shape)


(1336, 1220)
(1167, 1220)


In [175]:
tfidf_matrix_0 = tfidf_matrix_1[0:169, :] 
tfidf_matrix_0.shape

(169, 1220)

In [176]:
# Compute cosine similarity between vectors of df_1 and df_2
similarity_matrix_sdg = cosine_similarity(tfidf_matrix_0, tfidf_matrix_2 )

vec_val = new_df['Attr_id'].tolist()
sdg_val = sdg_new['Indicator_id'].tolist()

# Convert cosine similarities to DataFrame
similarity_df_sdg = pd.DataFrame(similarity_matrix_sdg, columns=vec_val, index=sdg_val)


In [177]:
similarity_df_sdg

Unnamed: 0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,...,1164.0,1165.0,1166.0,1167.0,1168.0,1169.0,1170.0,1171.0,1172.0,1173.0
1.1.1,0.0,0.0,0.0,0.0,0.0,0.041324,0.0,0.0,0.000000,0.073461,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1.2.1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1.2.2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1.3.1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1.4.1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16.4.1,0.0,0.0,0.0,0.0,0.0,0.011898,0.0,0.0,0.023902,0.000000,...,0.02849,0.056517,0.056933,0.056966,0.056212,0.078284,0.056988,0.057021,0.056265,0.078347
16.5.1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
17.1.1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
17.2.1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [178]:
# Example usage:
threshold = 10  # Define the threshold value

# Call the function to find chapter_ids with similarity greater than the threshold
result_sdg_sim = find_similarity_sdg(similarity_df_sdg, threshold, new_df, sdg_new)


In [179]:
result_sdg_sim

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id,Indicator_id,new_indicators,new_indicators_str,top_n_count,top_n_attr,top_n_table_id,top_n_table_name,top_n_chapter_id,top_n_chapter_name,top_n_similarities,top_n_description
0,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1.1,1.1.1,"[age, urban, international, population, employ...",proportion population international poverty li...,10,"[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[2.2, 2.2, 2.3, 2.3, 2.1, 4.1, 2.3, 2.3, 10.8,...",[Rural and Urban Population and Decadal change...,"[2, 2, 2, 2, 2, 4, 2, 2, 10, 2]","[Area and Population, Area and Population, Are...","[0.2747968656671603, 0.268150135206932, 0.2240...","[Urban,Population 2011, Rural,Population 2011,..."
1,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1.2,1.2.1,"[age, population, sex, living, proportion, dis...",proportion population living national poverty ...,10,"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[10.8, 10.8, 10.8, 10.8, 10.8, 10.8, 2.1, 2.2,...",[No .of Children in the age group 6 to 14 year...,"[10, 10, 10, 10, 10, 10, 2, 2, 2, 2]","[Education, Education, Education, Education, E...","[0.26541303169689845, 0.24930426228926683, 0.2...","[Total,No.of Children out of school age group ..."
2,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...","Proportion of men, women and children of all a...",1.2,1.2.2,"[child, age, dimension, men, according, living...",proportion men woman child age living poverty ...,10,"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[10.8, 10.11, 10.8, 10.8, 10.8, 10.8, 10.8, 14...",[No .of Children in the age group 6 to 14 year...,"[10, 10, 10, 10, 10, 10, 10, 14, 10, 2]","[Education, Education, Education, Education, E...","[0.22885060867758872, 0.22697420351274686, 0.2...","[Total,No.of Children out of school age group ..."
3,1,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1.3,1.3.1,"[population, disability, percentage, covered, ...",percentage population covered social protectio...,10,"[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[17.2, 13.5, 2.4, 17.2, 2.4, 2.4, 10.8, 2.5, 2...",[No.of Pensioners under Social Schemes As on 3...,"[17, 13, 2, 17, 2, 2, 10, 2, 2, 10]","[Miscellaneous, Women & Child Development, Are...","[0.1807125515613829, 0.16616236668605194, 0.16...","[Old Age Pensioners, No.of Pensioners under So..."
4,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1.4,1.4.1,"[population, living, access, proportion, servi...",proportion population living household access ...,10,"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[6.4, 2.1, 2.2, 15.2, 15.3, 6.4, 2.1, 2.1, 9.1...",[Small Scale units registered in District Indu...,"[6, 2, 2, 15, 15, 6, 2, 2, 9, 2]","[Industries, Area and Population, Area and Pop...","[0.24260399548371742, 0.23157152877495513, 0.2...","[No.,Other Service Activities, Total,Populatio..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",Total amount of approved funding to promote th...,16.4,16.4.1,"[approved, sound, development, dissemination, ...",total amount approved funding promote developm...,10,"[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[2.11, 2.11, 2.11, 6.4, 18.1, 18.1, 6.4, 11.1,...",[Districtwise Population By Age Groups And Sex...,"[2, 2, 2, 6, 18, 18, 6, 11, 14, 14]","[Area and Population, Area and Population, Are...","[0.10471104174147035, 0.10471104174147035, 0.1...","[Total,0-14, Total,15-59, Total,60, No.,Total,..."
165,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,Proportion of individuals using the Internet.,16.5,16.5.1,"[internet, using, proportion, individual]",proportion individual using internet,10,"[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9...","[9.4, 14.2, 1.1, 1.1, 1.1, 1.1, 1.1, 1.2, 1.2,...",[Post Office Telephones Exchanges Telephones. ...,"[9, 14, 1, 1, 1, 1, 1, 1, 1, 1]","[TRANSPORT AND COMMUNICATION, Rural Developmen...","[0.2825673271164115, 0.17691126490402242, 0.0,...","[Internet Connections,Communication In Numbers..."
166,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",Proportion of sustainable development indicato...,17.1,17.1.1,"[disaggregation, state, development, principle...",proportion sustainable development indicator p...,10,"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[4.18, 5.4, 9.2, 6.5, 14.2, 14.2, 14.2, 14.2, ...","[Sericulture Year 2016-17 in Nos., Fisheries Y...","[4, 5, 9, 6, 14, 14, 14, 14, 1, 1]","[Agriculture, Horticulture and Sericulture, An...","[0.11681628897132912, 0.11403210208777811, 0.0...","[Value of Silk Produced Rs. in lakhs, Sericult..."
167,17,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Dollar value of all resources made available t...,17.2,17.2.1,"[available, statistical, strengthen, made, val...",dollar value resource made available strengthe...,10,"[386.0, 384.0, 339.0, 27.0, 3.0, 4.0, 5.0, 6.0...","[5.4, 5.4, 4.18, 1.5, 1.1, 1.1, 1.1, 1.1, 1.1,...","[Fisheries Year 2016-17, Fisheries Year 2016-1...","[5, 5, 4, 1, 1, 1, 1, 1, 1, 1]","[Animal Husbandry, Animal Husbandry, Agricultu...","[0.1787601212421857, 0.1635585802785323, 0.122...","[Capacity Tonnes,Cold storages, Capacity Tonne..."


In [180]:
goals_sim, all_goals_sim = process_result(result_sdg_sim, sdg_new)

In [181]:
all_goals_sim

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description,Goal,Targets,Tentative Indicators,top_n_goal_id,top_n_target_id,top_n_indicator_id,top_n_similarities
0,66.0,2,Area and Population,2.20,Rural and Urban Population and Decadal change ...,"Urban,Population 2011",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1,0.274797
1,66.0,2,Area and Population,2.20,Rural and Urban Population and Decadal change ...,"Urban,Population 2011",Ensure healthy lives and promote well-being fo...,"Achieve universal health coverage, including f...",Fraction of the population protected against c...,3,3.7,\n3.7.2,0.143814
2,66.0,2,Area and Population,2.20,Rural and Urban Population and Decadal change ...,"Urban,Population 2011",Ensure healthy lives and promote well-being fo...,Support the research and development of vaccin...,Proportion of the population with access to af...,3,3.1,3.10.1,0.156146
3,66.0,2,Area and Population,2.20,Rural and Urban Population and Decadal change ...,"Urban,Population 2011",Reduce inequality within the State,"By 2030, progressively achieve and sustain inc...",rates of household expenditure or income per c...,10,10.1,10.1.1Growth,0.252191
4,66.0,2,Area and Population,2.20,Rural and Urban Population and Decadal change ...,"Urban,Population 2011","Make cities and human settlements inclusive, s...",Support through financial and technical assist...,Proportion of urban population living in slums...,11,11.1,11.1.1,0.310965
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1685,611.0,10,Education,10.11,No.of P U Colleges Students and Lecturers Year...,"Girls,Total Govt Private,No.Of Students in col...",Promote peaceful and inclusive societies for s...,Ensure public access to information and protec...,Percentage of Govt. Departments / Organization...,15,15.9,15.9.2,0.180987
1686,610.0,10,Education,10.11,No.of P U Colleges Students and Lecturers Year...,"Boys,Total Govt Private,No.Of Students in coll...",Promote peaceful and inclusive societies for s...,Ensure public access to information and protec...,Percentage of Govt. Departments / Organization...,15,15.9,15.9.2,0.180732
1687,187.0,4,"Agriculture, Horticulture and Sericulture",4.20,Gross and Net area Irrigated under Different S...,"Net Area Irrigated,Other Sources,area Irrigate...",Strengthen the means of implementation and rev...,"Strengthen domestic resource mobilization, inc...",Total government revenue (by source) as a perc...,16,16.1,16.1.1,0.225162
1688,186.0,4,"Agriculture, Horticulture and Sericulture",4.20,Gross and Net area Irrigated under Different S...,"Gross Irrigated Area,Other Sources,area Irriga...",Strengthen the means of implementation and rev...,"Strengthen domestic resource mobilization, inc...",Total government revenue (by source) as a perc...,16,16.1,16.1.1,0.222717


In [182]:
goals_sim

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description,top_n_goal_id,top_n_target_id,top_n_indicator_id,top_n_similarities
0,66.0,2,Area and Population,2.20,Rural and Urban Population and Decadal change ...,"Urban,Population 2011","[1, 3, 3, 10, 11, 11, 11, 11, 15]","[1.1, 3.7, 3.1, 10.1, 11.1, 11.3, 11.6, 11.8, ...","[1.1.1, \n3.7.2, 3.10.1, 10.1.1Growth, 11.1.1,...","[0.2747968656671603, 0.14381422312095726, 0.15..."
1,65.0,2,Area and Population,2.20,Rural and Urban Population and Decadal change ...,"Rural,Population 2011","[1, 1, 2, 3, 3, 3, 9, 9, 10, 15, 15]","[1.1, 1.4, 2.1, 3.3, 3.7, 3.1, 9.1, 9.8, 10.1,...","[1.1.1, 1.4.1, \n2.1.2, 3.3.1, \n3.7.2, 3.10.1...","[0.268150135206932, 0.18766250598087164, 0.101..."
2,72.0,2,Area and Population,2.30,Ratio No.of Females per 1000 Males 2001- 2011,"Urban,Sex Ratio 2001","[1, 5, 5, 7, 11]","[1.1, 5.1, 5.8, 7.5, 11.3]","[1.1.1, 5.1.1, 5.8.1, 7.5.1, 11.3.1]","[0.2240861593673777, 0.10624658187699199, 0.18..."
3,75.0,2,Area and Population,2.30,Ratio No.of Females per 1000 Males 2001- 2011,"Urban,Sex Ratio 2011","[1, 5, 5, 7, 11]","[1.1, 5.1, 5.8, 7.5, 11.3]","[1.1.1, 5.1.1, 5.8.1, 7.5.1, 11.3.1]","[0.2240861593673777, 0.10624658187699199, 0.18..."
4,58.0,2,Area and Population,2.10,Population and percentage share to total Popul...,"Geographical Area Sq.Kms, Population and perce...","[1, 2, 3, 5, 6, 8, 9, 9, 9, 9, 10, 10, 11, 11,...","[1.1, 2.4, 3.12, 5.7, 6.2, 8.6, 9.1, 9.3, 9.5,...","[1.1.1, 2.4.1, 3.12.1, 5.7.1, 6.2.1, 8.6.1, 9....","[0.22379074708357471, 0.10189694619073142, 0.0..."
...,...,...,...,...,...,...,...,...,...,...
455,611.0,10,Education,10.11,No.of P U Colleges Students and Lecturers Year...,"Girls,Total Govt Private,No.Of Students in col...",[15],[15.9],[15.9.2],[0.18098725989659864]
456,610.0,10,Education,10.11,No.of P U Colleges Students and Lecturers Year...,"Boys,Total Govt Private,No.Of Students in coll...",[15],[15.9],[15.9.2],[0.18073249276967818]
457,187.0,4,"Agriculture, Horticulture and Sericulture",4.20,Gross and Net area Irrigated under Different S...,"Net Area Irrigated,Other Sources,area Irrigate...",[16],[16.1],[16.1.1],[0.22516183912896037]
458,186.0,4,"Agriculture, Horticulture and Sericulture",4.20,Gross and Net area Irrigated under Different S...,"Gross Irrigated Area,Other Sources,area Irriga...",[16],[16.1],[16.1.1],[0.2227174507914614]


In [183]:
result_sdg_sim.to_csv("Att_Ind_Sim.csv", sep=";", index=False)

In [184]:
all_goals_sim.to_csv("Att_Ind_Sim_goals.csv", sep=";", index=False)

In [185]:
# Example usage:
threshold_val = 0.05  # Define the threshold value

# Call the function to find chapter_ids with similarity greater than the threshold
result_sdg_threshold = find_similarity_above_threshold(similarity_df_sdg, threshold_val, new_df, sdg_new)


In [186]:
result_sdg_threshold

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id,Indicator_id,new_indicators,new_indicators_str,top_n_count,top_n_attr,top_n_table_id,top_n_table_name,top_n_chapter_id,top_n_chapter_name,top_n_similarities,top_n_description
0,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1.1,1.1.1,"[age, urban, international, population, employ...",proportion population international poverty li...,128,"[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[2.2, 2.2, 2.3, 2.3, 2.1, 4.1, 2.3, 2.3, 10.8,...",[Rural and Urban Population and Decadal change...,"[2, 2, 2, 2, 2, 4, 2, 2, 10, 2, 10, 10, 10, 2,...","[Area and Population, Area and Population, Are...","[0.2747968656671603, 0.268150135206932, 0.2240...","[Urban,Population 2011, Rural,Population 2011,..."
1,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1.2,1.2.1,"[age, population, sex, living, proportion, dis...",proportion population living national poverty ...,70,"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[10.8, 10.8, 10.8, 10.8, 10.8, 10.8, 2.1, 2.2,...",[No .of Children in the age group 6 to 14 year...,"[10, 10, 10, 10, 10, 10, 2, 2, 2, 2, 2, 2, 2, ...","[Education, Education, Education, Education, E...","[0.26541303169689845, 0.24930426228926683, 0.2...","[Total,No.of Children out of school age group ..."
2,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...","Proportion of men, women and children of all a...",1.2,1.2.2,"[child, age, dimension, men, according, living...",proportion men woman child age living poverty ...,41,"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[10.8, 10.11, 10.8, 10.8, 10.8, 10.8, 10.8, 14...",[No .of Children in the age group 6 to 14 year...,"[10, 10, 10, 10, 10, 10, 10, 14, 10, 2, 2, 2, ...","[Education, Education, Education, Education, E...","[0.22885060867758872, 0.22697420351274686, 0.2...","[Total,No.of Children out of school age group ..."
3,1,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1.3,1.3.1,"[population, disability, percentage, covered, ...",percentage population covered social protectio...,78,"[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[17.2, 13.5, 2.4, 17.2, 2.4, 2.4, 10.8, 2.5, 2...",[No.of Pensioners under Social Schemes As on 3...,"[17, 13, 2, 17, 2, 2, 10, 2, 2, 10, 10, 2, 2, ...","[Miscellaneous, Women & Child Development, Are...","[0.1807125515613829, 0.16616236668605194, 0.16...","[Old Age Pensioners, No.of Pensioners under So..."
4,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1.4,1.4.1,"[population, living, access, proportion, servi...",proportion population living household access ...,62,"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[6.4, 2.1, 2.2, 15.2, 15.3, 6.4, 2.1, 2.1, 9.1...",[Small Scale units registered in District Indu...,"[6, 2, 2, 15, 15, 6, 2, 2, 9, 2, 15, 2, 2, 9, ...","[Industries, Area and Population, Area and Pop...","[0.24260399548371742, 0.23157152877495513, 0.2...","[No.,Other Service Activities, Total,Populatio..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",Total amount of approved funding to promote th...,16.4,16.4.1,"[approved, sound, development, dissemination, ...",total amount approved funding promote developm...,33,"[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[2.11, 2.11, 2.11, 6.4, 18.1, 18.1, 6.4, 11.1,...",[Districtwise Population By Age Groups And Sex...,"[2, 2, 2, 6, 18, 18, 6, 11, 14, 14, 14, 14, 18...","[Area and Population, Area and Population, Are...","[0.10471104174147035, 0.10471104174147035, 0.1...","[Total,0-14, Total,15-59, Total,60, No.,Total,..."
165,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,Proportion of individuals using the Internet.,16.5,16.5.1,"[internet, using, proportion, individual]",proportion individual using internet,2,"[533.0, 872.0]","[9.4, 14.2]",[Post Office Telephones Exchanges Telephones. ...,"[9, 14]","[TRANSPORT AND COMMUNICATION, Rural Developmen...","[0.2825673271164115, 0.17691126490402242]","[Internet Connections,Communication In Numbers..."
166,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",Proportion of sustainable development indicato...,17.1,17.1.1,"[disaggregation, state, development, principle...",proportion sustainable development indicator p...,8,"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[4.18, 5.4, 9.2, 6.5, 14.2, 14.2, 14.2, 14.2]","[Sericulture Year 2016-17 in Nos., Fisheries Y...","[4, 5, 9, 6, 14, 14, 14, 14]","[Agriculture, Horticulture and Sericulture, An...","[0.11681628897132912, 0.11403210208777811, 0.0...","[Value of Silk Produced Rs. in lakhs, Sericult..."
167,17,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Dollar value of all resources made available t...,17.2,17.2.1,"[available, statistical, strengthen, made, val...",dollar value resource made available strengthe...,4,"[386.0, 384.0, 339.0, 27.0]","[5.4, 5.4, 4.18, 1.5]","[Fisheries Year 2016-17, Fisheries Year 2016-1...","[5, 5, 4, 1]","[Animal Husbandry, Animal Husbandry, Agricultu...","[0.1787601212421857, 0.1635585802785323, 0.122...","[Capacity Tonnes,Cold storages, Capacity Tonne..."


In [187]:
goals_threshold, all_goals_threshold = process_result(result_sdg_threshold, sdg_new)

In [188]:
all_goals_threshold

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description,Goal,Targets,Tentative Indicators,top_n_goal_id,top_n_target_id,top_n_indicator_id,top_n_similarities
0,66.0,2,Area and Population,2.20,Rural and Urban Population and Decadal change ...,"Urban,Population 2011",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1,0.274797
1,66.0,2,Area and Population,2.20,Rural and Urban Population and Decadal change ...,"Urban,Population 2011",End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1,1.2,1.2.1,0.157674
2,66.0,2,Area and Population,2.20,Rural and Urban Population and Decadal change ...,"Urban,Population 2011",End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1,1.3,1.3.1,0.084333
3,66.0,2,Area and Population,2.20,Rural and Urban Population and Decadal change ...,"Urban,Population 2011",End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1,1.4,1.4.1,0.183123
4,66.0,2,Area and Population,2.20,Rural and Urban Population and Decadal change ...,"Urban,Population 2011","End hunger, achieve food security and improved...","By 2030, end hunger and ensure access by all p...",Prevalence of moderate or severe food insecuri...,2,2.1,\n2.1.2,0.099181
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10316,660.0,11,Health & Family welfare Services,11.20,No.of Hospitals Doctors Beds As on 31-3-2017 I...,"Govt.,No.of Hospitals",Promote peaceful and inclusive societies for s...,Ensure public access to information and protec...,Percentage of Govt. Departments / Organization...,15,15.9,15.9.2,0.299276
10317,610.0,10,Education,10.11,No.of P U Colleges Students and Lecturers Year...,"Boys,Total Govt Private,No.Of Students in coll...",Promote peaceful and inclusive societies for s...,Ensure public access to information and protec...,Percentage of Govt. Departments / Organization...,15,15.9,15.9.2,0.180732
10318,663.0,11,Health & Family welfare Services,11.20,No.of Hospitals Doctors Beds As on 31-3-2017 I...,"No.of Govt. Doctors, No.of Hospitals Doctors B...",Promote peaceful and inclusive societies for s...,Ensure public access to information and protec...,Percentage of Govt. Departments / Organization...,15,15.9,15.9.2,0.139811
10319,664.0,11,Health & Family welfare Services,11.20,No.of Hospitals Doctors Beds As on 31-3-2017 I...,"No.of beds in Govt. Hospitals, No.of Hospitals...",Promote peaceful and inclusive societies for s...,Ensure public access to information and protec...,Percentage of Govt. Departments / Organization...,15,15.9,15.9.2,0.131506


In [189]:
goals_threshold

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description,top_n_goal_id,top_n_target_id,top_n_indicator_id,top_n_similarities
0,66.0,2,Area and Population,2.20,Rural and Urban Population and Decadal change ...,"Urban,Population 2011","[1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 4, 4, 5, 6, 6, ...","[1.1, 1.2, 1.3, 1.4, 2.1, 3.3, 3.3, 3.5, 3.7, ...","[1.1.1, 1.2.1, 1.3.1, 1.4.1, \n2.1.2, 3.3.1, 3...","[0.2747968656671603, 0.15767429911675504, 0.08..."
1,65.0,2,Area and Population,2.20,Rural and Urban Population and Decadal change ...,"Rural,Population 2011","[1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 4, 4, 5, 6, 6, ...","[1.1, 1.2, 1.3, 1.4, 2.1, 3.3, 3.3, 3.5, 3.7, ...","[1.1.1, 1.2.1, 1.3.1, 1.4.1, \n2.1.2, 3.3.1, 3...","[0.268150135206932, 0.1615826266882691, 0.0864..."
2,72.0,2,Area and Population,2.30,Ratio No.of Females per 1000 Males 2001- 2011,"Urban,Sex Ratio 2001","[1, 1, 1, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, ...","[1.1, 1.2, 1.3, 3.3, 4.1, 4.2, 4.5, 4.6, 4.7, ...","[1.1.1, 1.2.1, 1.3.1, 3.3.1, 4.1.1, 4.2.1, 4.5...","[0.2240861593673777, 0.14589300298194208, 0.07..."
3,75.0,2,Area and Population,2.30,Ratio No.of Females per 1000 Males 2001- 2011,"Urban,Sex Ratio 2011","[1, 1, 1, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, ...","[1.1, 1.2, 1.3, 3.3, 4.1, 4.2, 4.5, 4.6, 4.7, ...","[1.1.1, 1.2.1, 1.3.1, 3.3.1, 4.1.1, 4.2.1, 4.5...","[0.2240861593673777, 0.14589300298194208, 0.07..."
4,58.0,2,Area and Population,2.10,Population and percentage share to total Popul...,"Geographical Area Sq.Kms, Population and perce...","[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, ...","[1.1, 1.2, 1.3, 1.4, 2.1, 2.4, 2.4, 2.7, 3.3, ...","[1.1.1, 1.2.1, 1.3.1, 1.4.1, \n2.1.2, 2.4.1, \...","[0.22379074708357471, 0.11497480818128972, 0.0..."
...,...,...,...,...,...,...,...,...,...,...
1023,660.0,11,Health & Family welfare Services,11.20,No.of Hospitals Doctors Beds As on 31-3-2017 I...,"Govt.,No.of Hospitals",[15],[15.9],[15.9.2],[0.2992760597259916]
1024,610.0,10,Education,10.11,No.of P U Colleges Students and Lecturers Year...,"Boys,Total Govt Private,No.Of Students in coll...",[15],[15.9],[15.9.2],[0.18073249276967818]
1025,663.0,11,Health & Family welfare Services,11.20,No.of Hospitals Doctors Beds As on 31-3-2017 I...,"No.of Govt. Doctors, No.of Hospitals Doctors B...",[15],[15.9],[15.9.2],[0.13981106672614296]
1026,664.0,11,Health & Family welfare Services,11.20,No.of Hospitals Doctors Beds As on 31-3-2017 I...,"No.of beds in Govt. Hospitals, No.of Hospitals...",[15],[15.9],[15.9.2],[0.13150646288571774]


In [190]:
result_sdg_threshold.to_csv("Att_Ind_Threshold.csv", sep=";", index=False)

In [191]:
all_goals_threshold.to_csv("Att_Ind_Threshold_goals.csv", sep=";", index=False)

In [65]:
result_sdg_new =result_sdg[['Goal No.', 'Target_id', 'Indicator_id', 'top_n_attr']]
result_sdg_sim_new =result_sdg_sim[['Goal No.', 'Target_id', 'Indicator_id', 'top_n_attr']]
result_sdg_threshold_new = result_sdg_threshold[['Goal No.', 'Target_id', 'Indicator_id', 'top_n_attr']]

In [66]:
# Merge the dataframes based on columns 'Goal No.', 'Target_id', and 'Indicator_id'
merged_df = pd.merge(result_sdg_threshold_new,result_sdg_sim_new, on=['Goal No.', 'Target_id', 'Indicator_id'], suffixes=('_B', '_C'))

# Function to combine and remove duplicates from lists while maintaining order
def combine_lists(row):
    list_B = row['top_n_attr_B']
    list_C = row['top_n_attr_C']
    
    # Combine lists and remove duplicates while maintaining order
    combined_top_n_attr = sorted(set(list_B + list_C), key=lambda x: (list_B + list_C).index(x))
    
    return combined_top_n_attr

# Apply the function to the merged dataframe
merged_df['BC'] = merged_df.apply(combine_lists, axis=1)

In [67]:
merged_df

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_B,top_n_attr_C,BC
0,1,1.1,1.1.1,"[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74..."
1,1,1.2,1.2.1,"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59...."
2,1,1.2,1.2.2,"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575..."
3,1,1.3,1.3.1,"[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,..."
4,1,1.4,1.4.1,"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,..."
...,...,...,...,...,...,...
164,16,16.4,16.4.1,"[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4..."
165,16,16.5,16.5.1,"[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]"
166,17,17.1,17.1.1,"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875..."
167,17,17.2,17.2.1,"[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]"


In [68]:
result_sdg_new['top_n_attr_B'] = result_sdg_threshold['top_n_attr']
result_sdg_new['top_n_attr_C'] = result_sdg_sim['top_n_attr']
result_sdg_new['BC'] = merged_df['BC']
result_sdg_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_sdg_new['top_n_attr_B'] = result_sdg_threshold['top_n_attr']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_sdg_new['top_n_attr_c'] = result_sdg_sim['top_n_attr']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_sdg_new['BC'] = merged_df['BC']


Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr,top_n_attr_B,top_n_attr_c,BC
0,1,1.1,1.1.1,"[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74..."
1,1,1.2,1.2.1,"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59...."
2,1,1.2,1.2.2,"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575..."
3,1,1.3,1.3.1,"[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,..."
4,1,1.4,1.4.1,"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,..."
...,...,...,...,...,...,...,...
164,16,16.4,16.4.1,"[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4..."
165,16,16.5,16.5.1,"[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]"
166,17,17.1,17.1.1,"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875..."
167,17,17.2,17.2.1,"[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]"


In [69]:
# Function to find intersection while maintaining order
def find_intersection(row):
    intersection = [x for x in row['top_n_attr'] if x in row['BC']]
    return intersection


# Apply the function to the merged dataframe
result_sdg_new['A_and_BC'] = result_sdg_new.apply(find_intersection, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_sdg_new['A_and_BC'] = result_sdg_new.apply(find_intersection, axis=1)


In [70]:
result_sdg_new

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr,top_n_attr_B,top_n_attr_c,BC,A_and_BC
0,1,1.1,1.1.1,"[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74..."
1,1,1.2,1.2.1,"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59...."
2,1,1.2,1.2.2,"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575..."
3,1,1.3,1.3.1,"[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,..."
4,1,1.4,1.4.1,"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,..."
...,...,...,...,...,...,...,...,...
164,16,16.4,16.4.1,"[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4..."
165,16,16.5,16.5.1,"[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]"
166,17,17.1,17.1.1,"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875..."
167,17,17.2,17.2.1,"[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]"


In [71]:
def find_difference(row):
    top_n_attr = row['top_n_attr']
    intersection = row['A_and_BC']
    
    # Remove elements in BC and intersection from top_n_attr while preserving order
    difference = [x for x in top_n_attr if x not in intersection]
    
    return difference
# Apply the function to the merged dataframe
result_sdg_new['A'] = result_sdg_new.apply(find_difference, axis=1)
result_sdg_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_sdg_new['A'] = result_sdg_new.apply(find_difference, axis=1)


Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr,top_n_attr_B,top_n_attr_c,BC,A_and_BC,A
0,1,1.1,1.1.1,"[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...",[]
1,1,1.2,1.2.1,"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....",[]
2,1,1.2,1.2.2,"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...",[]
3,1,1.3,1.3.1,"[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...",[]
4,1,1.4,1.4.1,"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...",[]
...,...,...,...,...,...,...,...,...,...
164,16,16.4,16.4.1,"[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...",[]
165,16,16.5,16.5.1,"[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]",[]
166,17,17.1,17.1.1,"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...",[]
167,17,17.2,17.2.1,"[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]",[]


In [72]:
# Function to find intersection while maintaining order
def find_intersection_BC(row):
    intersection = [x for x in row['top_n_attr_B'] if x in row['top_n_attr_c']]
    return intersection


# Apply the function to the merged dataframe
result_sdg_new['B_and_C'] = result_sdg_new.apply(find_intersection_BC, axis=1)
result_sdg_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_sdg_new['B_and_C'] = result_sdg_new.apply(find_intersection_BC, axis=1)


Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr,top_n_attr_B,top_n_attr_c,BC,A_and_BC,A,B_and_C
0,1,1.1,1.1.1,"[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...",[],"[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74..."
1,1,1.2,1.2.1,"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....",[],"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59...."
2,1,1.2,1.2.2,"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...",[],"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575..."
3,1,1.3,1.3.1,"[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...",[],"[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,..."
4,1,1.4,1.4.1,"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...",[],"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,..."
...,...,...,...,...,...,...,...,...,...,...
164,16,16.4,16.4.1,"[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...",[],"[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4..."
165,16,16.5,16.5.1,"[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]",[],"[533.0, 872.0]"
166,17,17.1,17.1.1,"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...",[],"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875..."
167,17,17.2,17.2.1,"[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]",[],"[386.0, 384.0, 339.0, 27.0]"


In [73]:
def find_difference_B(row):
    top_n_attr = row['top_n_attr_B']
    intersection = set(row['A'] + row['BC'] + row['B_and_C'] + row['A_and_BC'])
    
    # Remove elements in BC and intersection from top_n_attr while preserving order
    difference = [x for x in top_n_attr if x not in intersection]
    
    return difference
# Apply the function to the merged dataframe
result_sdg_new['B_minus_all'] = result_sdg_new.apply(find_difference_B, axis=1)
result_sdg_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_sdg_new['B_minus_all'] = result_sdg_new.apply(find_difference_B, axis=1)


Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr,top_n_attr_B,top_n_attr_c,BC,A_and_BC,A,B_and_C,B_minus_all
0,1,1.1,1.1.1,"[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...",[],"[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...",[]
1,1,1.2,1.2.1,"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....",[],"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....",[]
2,1,1.2,1.2.2,"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...",[],"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...",[]
3,1,1.3,1.3.1,"[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...",[],"[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...",[]
4,1,1.4,1.4.1,"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...",[],"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...",[]
...,...,...,...,...,...,...,...,...,...,...,...
164,16,16.4,16.4.1,"[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...",[],"[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...",[]
165,16,16.5,16.5.1,"[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]",[],"[533.0, 872.0]",[]
166,17,17.1,17.1.1,"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...",[],"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...",[]
167,17,17.2,17.2.1,"[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]",[],"[386.0, 384.0, 339.0, 27.0]",[]


In [74]:
def find_difference_C(row):
    top_n_attr = row['top_n_attr_c']
    intersection = set(row['A'] + row['BC'] + row['B_and_C'] + row['A_and_BC'] + row['B_minus_all'])
    
    # Remove elements in BC and intersection from top_n_attr while preserving order
    difference = [x for x in top_n_attr if x not in intersection]
    
    return difference
# Apply the function to the merged dataframe
result_sdg_new['C_minus_all'] = result_sdg_new.apply(find_difference_C, axis=1)
result_sdg_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_sdg_new['C_minus_all'] = result_sdg_new.apply(find_difference_C, axis=1)


Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr,top_n_attr_B,top_n_attr_c,BC,A_and_BC,A,B_and_C,B_minus_all,C_minus_all
0,1,1.1,1.1.1,"[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...",[],"[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...",[],[]
1,1,1.2,1.2.1,"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....",[],"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....",[],[]
2,1,1.2,1.2.2,"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...",[],"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...",[],[]
3,1,1.3,1.3.1,"[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...",[],"[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...",[],[]
4,1,1.4,1.4.1,"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...",[],"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...",[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...
164,16,16.4,16.4.1,"[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...",[],"[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...",[],[]
165,16,16.5,16.5.1,"[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0]",[],"[533.0, 872.0]",[],[]
166,17,17.1,17.1.1,"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...",[],"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...",[],[]
167,17,17.2,17.2.1,"[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0]",[],"[386.0, 384.0, 339.0, 27.0]",[],[]


In [94]:
sim_threshold = pd.concat([result_sdg_sim,result_sdg_threshold])
sim_threshold

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id,Indicator_id,new_indicators,new_indicators_str,top_n_count,top_n_attr,top_n_table_id,top_n_table_name,top_n_chapter_id,top_n_chapter_name,top_n_similarities,top_n_description
0,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1.1,1.1.1,"[age, geographical, proportion, employment, po...",proportion population international poverty li...,128,"[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[2.2, 2.2, 2.3, 2.3, 2.1, 4.1, 2.3, 2.3, 10.8,...",[Rural and Urban Population and Decadal change...,"[2, 2, 2, 2, 2, 4, 2, 2, 10, 2, 10, 10, 10, 2,...","[Area and Population, Area and Population, Are...","[0.2747968656671603, 0.268150135206932, 0.2240...","[Urban,Population 2011, Rural,Population 2011,..."
1,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1.2,1.2.1,"[age, national, proportion, population, povert...",proportion population living national poverty ...,70,"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[10.8, 10.8, 10.8, 10.8, 10.8, 10.8, 2.1, 2.2,...",[No .of Children in the age group 6 to 14 year...,"[10, 10, 10, 10, 10, 10, 2, 2, 2, 2, 2, 2, 2, ...","[Education, Education, Education, Education, E...","[0.26541303169689845, 0.24930426228926683, 0.2...","[Total,No.of Children out of school age group ..."
2,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...","Proportion of men, women and children of all a...",1.2,1.2.2,"[age, living, dimension, proportion, national,...",proportion men woman child age living poverty ...,41,"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[10.8, 10.11, 10.8, 10.8, 10.8, 10.8, 10.8, 14...",[No .of Children in the age group 6 to 14 year...,"[10, 10, 10, 10, 10, 10, 10, 14, 10, 2, 2, 2, ...","[Education, Education, Education, Education, E...","[0.22885060867758872, 0.22697420351274686, 0.2...","[Total,No.of Children out of school age group ..."
3,1,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1.3,1.3.1,"[child, floor, social, system, distinguishing,...",percentage population covered social protectio...,78,"[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[17.2, 13.5, 2.4, 17.2, 2.4, 2.4, 10.8, 2.5, 2...",[No.of Pensioners under Social Schemes As on 3...,"[17, 13, 2, 17, 2, 2, 10, 2, 2, 10, 10, 2, 2, ...","[Miscellaneous, Women & Child Development, Are...","[0.1807125515613829, 0.16616236668605194, 0.16...","[Old Age Pensioners, No.of Pensioners under So..."
4,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1.4,1.4.1,"[proportion, population, household, service, a...",proportion population living household access ...,62,"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[6.4, 2.1, 2.2, 15.2, 15.3, 6.4, 2.1, 2.1, 9.1...",[Small Scale units registered in District Indu...,"[6, 2, 2, 15, 15, 6, 2, 2, 9, 2, 15, 2, 2, 9, ...","[Industries, Area and Population, Area and Pop...","[0.24260399548371742, 0.23157152877495513, 0.2...","[No.,Other Service Activities, Total,Populatio..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",Total amount of approved funding to promote th...,16.4,16.4.1,"[funding, diffusion, amount, approved, total, ...",total amount approved funding promote developm...,33,"[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[2.11, 2.11, 2.11, 6.4, 18.1, 18.1, 6.4, 11.1,...",[Districtwise Population By Age Groups And Sex...,"[2, 2, 2, 6, 18, 18, 6, 11, 14, 14, 14, 14, 18...","[Area and Population, Area and Population, Are...","[0.10471104174147035, 0.10471104174147035, 0.1...","[Total,0-14, Total,15-59, Total,60, No.,Total,..."
165,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,Proportion of individuals using the Internet.,16.5,16.5.1,"[individual, internet, proportion, using]",proportion individual using internet,2,"[533.0, 872.0]","[9.4, 14.2]",[Post Office Telephones Exchanges Telephones. ...,"[9, 14]","[TRANSPORT AND COMMUNICATION, Rural Developmen...","[0.2825673271164115, 0.17691126490402242]","[Internet Connections,Communication In Numbers..."
166,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",Proportion of sustainable development indicato...,17.1,17.1.1,"[accordance, principle, sustainable, proportio...",proportion sustainable development indicator p...,8,"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[4.18, 5.4, 9.2, 6.5, 14.2, 14.2, 14.2, 14.2]","[Sericulture Year 2016-17 in Nos., Fisheries Y...","[4, 5, 9, 6, 14, 14, 14, 14]","[Agriculture, Horticulture and Sericulture, An...","[0.11681628897132912, 0.11403210208777811, 0.0...","[Value of Silk Produced Rs. in lakhs, Sericult..."
167,17,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Dollar value of all resources made available t...,17.2,17.2.1,"[dollar, strengthen, capacity, statistical, va...",dollar value resource made available strengthe...,4,"[386.0, 384.0, 339.0, 27.0]","[5.4, 5.4, 4.18, 1.5]","[Fisheries Year 2016-17, Fisheries Year 2016-1...","[5, 5, 4, 1]","[Animal Husbandry, Animal Husbandry, Agricultu...","[0.1787601212421857, 0.1635585802785323, 0.122...","[Capacity Tonnes,Cold storages, Capacity Tonne..."


In [95]:
sim_threshold.columns

Index(['Goal No.', 'Goal', 'Targets', 'Tentative Indicators', 'Target_id',
       'Indicator_id', 'new_indicators', 'new_indicators_str', 'top_n_count',
       'top_n_attr', 'top_n_table_id', 'top_n_table_name', 'top_n_chapter_id',
       'top_n_chapter_name', 'top_n_similarities', 'top_n_description'],
      dtype='object')

In [97]:
def concat_lists(series):
    if isinstance(series.iloc[0], list):
        return ', '.join(series.iloc[0])
    else:
        return series.iloc[0]

# Assuming 'sim_threshold' is your DataFrame
# Group by the desired columns and apply the concatenation operation using the custom function
sim_threshold_new = sim_threshold.groupby(['Goal No.', 'Goal', 'Targets', 'Tentative Indicators', 'Target_id', 'Indicator_id', 'new_indicators', 'new_indicators_str']).agg({'top_n_attr': concat_lists,
                                                                                                                                          'top_n_table_id': concat_lists,
                                                                                                                                          'top_n_table_name': concat_lists,
                                                                                                                                          'top_n_chapter_id': concat_lists,
                                                                                                                                          'top_n_chapter_name': concat_lists,
                                                                                                                                          'top_n_description': concat_lists}).reset_index()

print(sim_threshold_new)

TypeError: unhashable type: 'list'

In [90]:
# Group by 'Chapter_id' and 'Chapter_name' and concatenate the list items in 'Description'
sim_threashold_new = sim_threashold.groupby(['Goal No.', 'Goal', 'Targets', 'Tentative Indicators', 'Target_id','Indicator_id', 'new_indicators', 'new_indicators_str'])[['top_n_attr', 'top_n_table_id', 'top_n_table_name', 'top_n_chapter_id','top_n_chapter_name', 'top_n_description']].apply(lambda x: ', '.join([item for sublist in x for item in sublist])).reset_index()


TypeError: unhashable type: 'list'

In [122]:
import pandas as pd

# Sample DataFrame (replace with your actual data)
data = {
    'Goal No.': [1, 1, 2, 2],
    'Target_id': [101, 102, 201, 202],
    'Indicator_id': [1001, 1002, 2001, 2002],
    'top_n_attr_sim': [['A', 'B', 'C'], ['B', 'C', 'D'], ['C', 'D', 'E'], ['D', 'E', 'F']],
    'top_n_attr_threshold': [['B', 'C', 'D'], ['C', 'D', 'E'], ['D', 'E', 'F'], ['E', 'F', 'G']],
    'count': [3, 3, 3, 3]  # Sample count column for demonstration
}
df = pd.DataFrame(data)

# Function to find intersection between list elements
def find_intersection(row):
    top_n_attr_sim = set(row['top_n_attr_sim'])
    top_n_attr_threshold = set(row['top_n_attr_threshold'])
    intersection = list(top_n_attr_sim.intersection(top_n_attr_threshold))
    return intersection

# Apply the function to the DataFrame
df['intersection'] = df.apply(find_intersection, axis=1)

df

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_sim,top_n_attr_threshold,count,intersection
0,1,101,1001,"[A, B, C]","[B, C, D]",3,"[B, C]"
1,1,102,1002,"[B, C, D]","[C, D, E]",3,"[D, C]"
2,2,201,2001,"[C, D, E]","[D, E, F]",3,"[D, E]"
3,2,202,2002,"[D, E, F]","[E, F, G]",3,"[F, E]"


In [125]:
import pandas as pd

# Sample DataFrames (replace with your actual data)
data_sim = {'Goal No.': [1, 1], 'Target_id': [1.1, 1.1], 'Indicator_id': [1.1, 1.1], 'top_n_attr_sim': [[1, 2, 3, 4], [3, 4, 5, 6]]}
data_threshold = {'Goal No.': [1, 1], 'Target_id': [1.1, 1.1], 'Indicator_id': [1.1, 1.1], 'top_n_attr_threshold': [[3, 4, 5, 6], [7, 8, 9, 10]]}

result_sdg_sim_new = pd.DataFrame(data_sim)
result_sdg_threshold_new = pd.DataFrame(data_threshold)

# Merge the dataframes based on columns 'Goal No.', 'Target_id', and 'Indicator_id'
merged_df = pd.merge(result_sdg_sim_new, result_sdg_threshold_new, on=['Goal No.', 'Target_id', 'Indicator_id'])

# Function to find the intersection of two lists
def find_intersection(row):
    intersection = list(set(row['top_n_attr_sim']) & set(row['top_n_attr_threshold']))
    return intersection

# Apply the function to the merged dataframe
merged_df['intersection'] = merged_df.apply(find_intersection, axis=1)

merged_df

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_sim,top_n_attr_threshold,intersection
0,1,1.1,1.1,"[1, 2, 3, 4]","[3, 4, 5, 6]","[3, 4]"
1,1,1.1,1.1,"[1, 2, 3, 4]","[7, 8, 9, 10]",[]
2,1,1.1,1.1,"[3, 4, 5, 6]","[3, 4, 5, 6]","[3, 4, 5, 6]"
3,1,1.1,1.1,"[3, 4, 5, 6]","[7, 8, 9, 10]",[]
