In [43]:
import pandas as pd
import numpy as np
import plotly.express as px
import re
import matplotlib.pyplot as plt
import nltk
from collections import OrderedDict

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package punkt to /home/riya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/riya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/riya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Define a custom function to remove word before 1st space and excess white spaces
def process_string(input_string):
    """Drop the leading token of a string and collapse its whitespace.

    The text up to and including the first whitespace character is removed
    and every remaining run of whitespace is replaced by a single space.
    Leading whitespace is ignored first, so the token that is removed is
    always the first visible word rather than an empty prefix.

    Parameters
    ----------
    input_string : str
        Text whose first token should be discarded.

    Returns
    -------
    str
        The remaining text with normalised whitespace. A string that
        contains no whitespace after its first word is returned unchanged
        (apart from stripping); a single word followed only by whitespace
        yields an empty string.
    """
    # Without this, a leading blank would be taken as "the first space" and
    # the first word would survive.
    text = input_string.lstrip()

    # Split once on the first whitespace character (space, tab, newline, ...).
    pieces = re.split(r'\s', text, maxsplit=1)
    if len(pieces) == 2:
        text = pieces[1]

    # Collapse any remaining runs of whitespace into single spaces.
    return ' '.join(text.split())

In [3]:

# Tokenizer: turns each text into a list of unique, lemmatized, non-stopword tokens
def cleaning(sentence):
    """Turn each text into a list of unique, lemmatised, non-stopword tokens.

    For every text: non-letter characters are replaced by spaces, the text
    is lower-cased and split on whitespace, English stopwords are dropped,
    the remaining words are lemmatised with WordNet, and duplicates are
    removed.

    Parameters
    ----------
    sentence : iterable of str
        The texts to clean (e.g. a list or a pandas Series).

    Returns
    -------
    list of list of str
        One token list per input text, in input order. Tokens keep the
        order of their first occurrence in the text.
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    wn = nltk.WordNetLemmatizer()

    cleaned = []
    # Iterate over the values themselves: `sentence[i]` is a *label* lookup
    # on a pandas Series and fails (or picks the wrong row) as soon as the
    # index is not exactly 0..n-1, e.g. after filtering or drop_duplicates.
    for text in sentence:
        words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
        lemmas = [wn.lemmatize(w) for w in words if w not in stop_words]
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result is the same on every run (the order of a set of strings is not).
        cleaned.append(list(dict.fromkeys(lemmas)))

    return cleaned

In [4]:

# Variant of the tokenizer that returns each cleaned text as one space-joined string (duplicates kept)
def concat_cleaning(sentence):
    """Clean each text and return it as a single space-joined string.

    For every text: non-letter characters are replaced by spaces, the text
    is lower-cased and split on whitespace, English stopwords are dropped
    and the remaining words are lemmatised with WordNet. Unlike a token
    list with duplicates removed, repeated words are kept and word order is
    preserved.

    Parameters
    ----------
    sentence : iterable of str
        The texts to clean (e.g. a list or a pandas Series).

    Returns
    -------
    list of str
        One cleaned string per input text, in input order.
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    wn = nltk.WordNetLemmatizer()

    cleaned = []
    # Iterate over the values themselves: `sentence[i]` is a *label* lookup
    # on a pandas Series and fails (or picks the wrong row) as soon as the
    # index is not exactly 0..n-1, e.g. after filtering or drop_duplicates.
    for text in sentence:
        words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
        lemmas = [wn.lemmatize(w) for w in words if w not in stop_words]
        cleaned.append(' '.join(lemmas))

    return cleaned

In [5]:
def concatenate_table_name(df):
    """Append the table name to every description that contains no comma.

    Rows whose ``Description`` has no ``','`` get ``', ' + Table_name``
    appended. Rows that already contain a comma are left as they are, which
    also makes a second call a no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns ``Description`` and ``Table_name``.
        The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    Rows with a missing ``Description`` or a missing ``Table_name`` are left
    unchanged instead of raising. The work is done column-wise, so it does
    not depend on the index labels being unique.
    """
    if df.empty:
        return df

    description = df['Description']
    # na=True: a missing description counts as "nothing to extend".
    lacks_comma = ~description.str.contains(',', regex=False, na=True)
    needs_suffix = lacks_comma & df['Table_name'].notna()

    extended = description + ', ' + df['Table_name']
    # Keep the original text wherever no suffix is needed.
    df['Description'] = description.where(~needs_suffix, extended)
    return df


In [6]:
def count_common_words(row, df_other):
    """Count, for each row of ``df_other``, the distinct items shared with ``row``.

    Parameters
    ----------
    row : iterable
        Items to compare; duplicates are ignored.
    df_other : pandas.DataFrame
        Frame with a ``new_description`` column whose cells are iterables.

    Returns
    -------
    list of int
        One overlap count per row of ``df_other``, in row order.
    """
    reference = set(row)
    return [
        len(reference & set(candidate))
        for candidate in df_other['new_description']
    ]

In [7]:
def find_similarity_sdg(similarity_matrix, threshold, chapter_df, sdg_df):
    """Attach the best-scoring attributes of every similarity row to ``sdg_df``.

    Each row of ``similarity_matrix`` is ranked by score in descending order
    (ties keep their column order) and the first ``threshold`` column
    positions are kept. For those positions the column label, the score and
    the matching ``chapter_df`` fields are collected into lists.

    Parameters
    ----------
    similarity_matrix : pandas.DataFrame
        Scores; column ``i`` is paired with row ``i`` of ``chapter_df`` by
        position, and row ``j`` supplies the values for row ``j`` of
        ``sdg_df``.
    threshold : int
        Number of top-scoring columns to keep per row. Despite the name it
        is a count, not a score cut-off.
    chapter_df : pandas.DataFrame
        Must contain ``Description``, ``Chapter_id``, ``Chapter_name``,
        ``Table_id`` and ``Table_name``.
    sdg_df : pandas.DataFrame
        Receives the ``top_n_*`` columns; modified in place.

    Returns
    -------
    pandas.DataFrame
        ``sdg_df`` with the columns ``top_n_count``, ``top_n_attr``,
        ``top_n_table_id``, ``top_n_table_name``, ``top_n_chapter_id``,
        ``top_n_chapter_name``, ``top_n_similarities`` and
        ``top_n_description``, each cell holding a list (or the list length
        for ``top_n_count``).
    """
    attr_labels = similarity_matrix.columns
    # Result column -> chapter_df column it is filled from.
    chapter_sources = {
        'top_n_table_id': chapter_df['Table_id'],
        'top_n_table_name': chapter_df['Table_name'],
        'top_n_chapter_id': chapter_df['Chapter_id'],
        'top_n_chapter_name': chapter_df['Chapter_name'],
        'top_n_description': chapter_df['Description'],
    }
    # Key order is the order in which the columns are added to sdg_df.
    collected = {
        'top_n_count': [],
        'top_n_attr': [],
        'top_n_table_id': [],
        'top_n_table_name': [],
        'top_n_chapter_id': [],
        'top_n_chapter_name': [],
        'top_n_similarities': [],
        'top_n_description': [],
    }

    for _, row in similarity_matrix.iterrows():
        scores = row.to_numpy()
        # sorted() is stable, so equal scores stay in column order.
        ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
        top = ranked[:threshold]

        collected['top_n_count'].append(len(top))
        collected['top_n_attr'].append([attr_labels[i] for i in top])
        collected['top_n_similarities'].append([scores[i] for i in top])
        for column, source in chapter_sources.items():
            # `top` holds column *positions*, so chapter rows must be read
            # positionally (.iloc); .loc would look the numbers up as index
            # labels and only works for a default 0..n-1 index.
            collected[column].append([source.iloc[i] for i in top])

    for column, values in collected.items():
        sdg_df[column] = values

    return sdg_df

In [8]:
def find_similarity_above_threshold(similarity_matrix, threshold, chapter_df, sdg_df):
    """Attach every attribute scoring at least ``threshold`` to ``sdg_df``.

    For each row of ``similarity_matrix`` the column positions whose score
    is ``>= threshold`` are kept and ordered by score, highest first (ties
    keep their column order). For those positions the column label, the
    score and the matching ``chapter_df`` fields are collected into lists,
    so the lists may have a different length for every row.

    Parameters
    ----------
    similarity_matrix : pandas.DataFrame
        Scores; column ``i`` is paired with row ``i`` of ``chapter_df`` by
        position, and row ``j`` supplies the values for row ``j`` of
        ``sdg_df``.
    threshold : number
        Minimum score (inclusive) for a column to be kept.
    chapter_df : pandas.DataFrame
        Must contain ``Description``, ``Chapter_id``, ``Chapter_name``,
        ``Table_id`` and ``Table_name``.
    sdg_df : pandas.DataFrame
        Receives the ``top_n_*`` columns; modified in place.

    Returns
    -------
    pandas.DataFrame
        ``sdg_df`` with the columns ``top_n_count``, ``top_n_attr``,
        ``top_n_table_id``, ``top_n_table_name``, ``top_n_chapter_id``,
        ``top_n_chapter_name``, ``top_n_similarities`` and
        ``top_n_description``.
    """
    attr_labels = similarity_matrix.columns
    # Result column -> chapter_df column it is filled from.
    chapter_sources = {
        'top_n_table_id': chapter_df['Table_id'],
        'top_n_table_name': chapter_df['Table_name'],
        'top_n_chapter_id': chapter_df['Chapter_id'],
        'top_n_chapter_name': chapter_df['Chapter_name'],
        'top_n_description': chapter_df['Description'],
    }
    # Key order is the order in which the columns are added to sdg_df.
    collected = {
        'top_n_count': [],
        'top_n_attr': [],
        'top_n_table_id': [],
        'top_n_table_name': [],
        'top_n_chapter_id': [],
        'top_n_chapter_name': [],
        'top_n_similarities': [],
        'top_n_description': [],
    }

    for _, row in similarity_matrix.iterrows():
        scores = row.to_numpy()
        hits = [i for i, value in enumerate(scores) if value >= threshold]
        # list.sort() is stable, so equal scores stay in column order.
        hits.sort(key=scores.__getitem__, reverse=True)

        collected['top_n_count'].append(len(hits))
        collected['top_n_attr'].append([attr_labels[i] for i in hits])
        collected['top_n_similarities'].append([scores[i] for i in hits])
        for column, source in chapter_sources.items():
            # `hits` holds column *positions*, so chapter rows must be read
            # positionally (.iloc); .loc would look the numbers up as index
            # labels and only works for a default 0..n-1 index.
            collected[column].append([source.iloc[i] for i in hits])

    for column, values in collected.items():
        sdg_df[column] = values

    return sdg_df


In [106]:
def process_result(result_sdg, sdg_df, chapter_df):
    """Invert an indicator -> attributes mapping into attribute -> indicators.

    Parameters
    ----------
    result_sdg : pandas.DataFrame
        One row per indicator with the columns ``Goal No.``, ``Target_id``,
        ``Indicator_id`` and ``Rank``. ``Rank`` is iterated as a collection
        of attribute ids (presumably the ids found in
        ``chapter_df['Attr_id']`` -- confirm against the caller).
    sdg_df : pandas.DataFrame
        Source of the goal, target and indicator texts (``Goal No.``/``Goal``,
        ``Target_id``/``Targets``, ``Indicator_id``/``Tentative Indicators``).
    chapter_df : pandas.DataFrame
        Source of the attribute metadata (``Attr_id``, ``Chapter_id``,
        ``Chapter_name``, ``Table_id``, ``Table_name``, ``Description``).

    Returns
    -------
    goals : pandas.DataFrame
        One row per attribute id with list-valued columns ``top_n_goal_id``,
        ``top_n_target_id`` and ``top_n_indicator_id``.
    all_goals : pandas.DataFrame
        One row per (attribute, indicator) pair, enriched with the chapter,
        table and SDG texts. Ids that cannot be looked up give ``None``.

    Both frames are returned empty, but with their columns present, when
    ``result_sdg`` yields no attribute at all.
    """
    id_columns = ['top_n_goal_id', 'top_n_target_id', 'top_n_indicator_id']
    base_columns = ['Attr_id'] + id_columns

    # attribute id -> {'Attr_id': id, 'top_n_goal_id': [...], ...}
    chapter_details = {}
    for _, row in result_sdg.iterrows():
        linked_ids = (row['Goal No.'], row['Target_id'], row['Indicator_id'])
        for attr_id in row['Rank']:
            details = chapter_details.setdefault(
                attr_id,
                {'Attr_id': attr_id, **{column: [] for column in id_columns}},
            )
            # The three lists grow in lock-step, one entry per indicator.
            for column, value in zip(id_columns, linked_ids):
                details[column].append(value)

    # One row per attribute, ids kept as lists. Explicit columns keep the
    # frame well-formed even when nothing was collected.
    goals = pd.DataFrame(
        [
            {'Attr_id': attr_id, **{column: details[column] for column in id_columns}}
            for attr_id, details in chapter_details.items()
        ],
        columns=base_columns,
    )

    # One row per (attribute, indicator) pair.
    all_rows = [
        {
            'Attr_id': details['Attr_id'],
            'top_n_goal_id': goal_id,
            'top_n_target_id': target_id,
            'top_n_indicator_id': indicator_id,
        }
        for details in chapter_details.values()
        for goal_id, target_id, indicator_id in zip(
            details['top_n_goal_id'],
            details['top_n_target_id'],
            details['top_n_indicator_id'],
        )
    ]
    all_goals = pd.DataFrame(all_rows, columns=base_columns)

    def lookup(frame, key_column, value_column):
        # dict.get as the mapper: unknown keys become None instead of raising.
        return dict(zip(frame[key_column], frame[value_column])).get

    all_goals['Goal'] = all_goals['top_n_goal_id'].map(lookup(sdg_df, 'Goal No.', 'Goal'))
    all_goals['Targets'] = all_goals['top_n_target_id'].map(lookup(sdg_df, 'Target_id', 'Targets'))
    all_goals['Tentative Indicators'] = all_goals['top_n_indicator_id'].map(
        lookup(sdg_df, 'Indicator_id', 'Tentative Indicators'))

    # Chapter and table names are resolved in two hops: attribute -> id -> name.
    all_goals['Chapter_id'] = all_goals['Attr_id'].map(lookup(chapter_df, 'Attr_id', 'Chapter_id'))
    all_goals['Chapter_name'] = all_goals['Chapter_id'].map(
        lookup(chapter_df, 'Chapter_id', 'Chapter_name'))
    all_goals['Table_id'] = all_goals['Attr_id'].map(lookup(chapter_df, 'Attr_id', 'Table_id'))
    all_goals['Table_name'] = all_goals['Table_id'].map(lookup(chapter_df, 'Table_id', 'Table_name'))
    all_goals['Description'] = all_goals['Attr_id'].map(lookup(chapter_df, 'Attr_id', 'Description'))

    all_goals = all_goals.loc[:, ['Attr_id', 'Chapter_id', 'Chapter_name', 'Table_id', 'Table_name',
                                  'Description', 'Goal', 'Targets', 'Tentative Indicators',
                                  'top_n_goal_id', 'top_n_target_id', 'top_n_indicator_id']]

    return goals, all_goals


In [93]:
# Load the SDG goal / target / indicator table (path is relative to the working directory).
sdg = pd.read_csv("sdg_data_excel.csv")

In [94]:
# Discard the two department columns in place.
sdg.drop(
    columns=['Nodal Department', 'Other Related Major Departments'],
    inplace=True,
)

In [95]:
# Remove rows that are exact duplicates across all columns (pandas keeps the
# first occurrence by default); the surviving rows keep their old index labels.
sdg = sdg.drop_duplicates()

In [96]:
# Renumber the rows 0..n-1 in place; drop=True discards the old index
# instead of keeping it as a new column.
sdg.reset_index(inplace = True, drop = True)

In [97]:
# Run process_string over every indicator and target text: it removes the
# text before the first space (presumably the indicator/target number --
# confirm against the CSV) and collapses repeated whitespace.
sdg['Tentative Indicators'] = sdg['Tentative Indicators'].apply(process_string)
sdg['Targets'] = sdg['Targets'].apply(process_string) 
sdg

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id,Indicator_id
0,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1.1,1.1.1
1,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1.2,1.2.1
2,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...","Proportion of men, women and children of all a...",1.2,1.2.2
3,1,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1.3,1.3.1
4,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1.4,1.4.1
...,...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",Total amount of approved funding to promote th...,16.4,16.4.1
165,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,Proportion of individuals using the Internet.,16.5,16.5.1
166,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",Proportion of sustainable development indicato...,17.1,17.1.1
167,17,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Dollar value of all resources made available t...,17.2,17.2.1


In [58]:
# Load the saved attribute/indicator matches; the columns named in `converters`
# are turned back from their text form into Python objects.
# NOTE(review): pd.eval evaluates the cell text as an expression. For data that
# is not fully trusted, ast.literal_eval is the safer way to parse list literals.
sdg_string = pd.read_csv("Att_Ind_String.csv", sep=";", converters={'top_n_attr': pd.eval, 'top_n_table_id': pd.eval, 'top_n_chapter_id': pd.eval,'top_n_similarities': pd.eval})

In [59]:
sdg_string

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id,Indicator_id,new_indicators,new_indicators_str,top_n_count,top_n_attr,top_n_table_id,top_n_table_name,top_n_chapter_id,top_n_chapter_name,top_n_similarities,top_n_description
0,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1.1,1.1.1,"['age', 'urban', 'international', 'population'...",proportion population international poverty li...,140,"[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77....","[2.1, 2.2, 2.2, 2.3, 2.3, 2.3, 2.3, 2.4, 2.4, ...",['Population and percentage share to total Pop...,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","['Area and Population', 'Area and Population',...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","['Geographical Area Sq.Kms, Population and per..."
1,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1.2,1.2.1,"['age', 'population', 'sex', 'living', 'propor...",proportion population living national poverty ...,70,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[10.8, 10.8, 10.8, 10.8, 10.8, 10.8, 17.2, 2.1...",['No .of Children in the age group 6 to 14 yea...,"[10, 10, 10, 10, 10, 10, 17, 2, 2, 2, 2, 2, 2,...","['Education', 'Education', 'Education', 'Educa...","[2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...","['Boys,No.of Children Enrolment age group betw..."
2,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...","Proportion of men, women and children of all a...",1.2,1.2.2,"['child', 'age', 'dimension', 'men', 'accordin...",proportion men woman child age living poverty ...,41,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[10.8, 10.8, 10.8, 10.8, 10.8, 10.8, 17.2, 2.4...",['No .of Children in the age group 6 to 14 yea...,"[10, 10, 10, 10, 10, 10, 17, 2, 2, 2, 2, 2, 2,...","['Education', 'Education', 'Education', 'Educa...","[2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...","['Boys,No.of Children Enrolment age group betw..."
3,1,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1.3,1.3.1,"['population', 'disability', 'percentage', 'co...",percentage population covered social protectio...,84,"[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7...","[17.2, 17.2, 2.1, 2.1, 2.1, 2.4, 2.4, 2.4, 2.4...",['No.of Pensioners under Social Schemes As on ...,"[17, 17, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...","['Miscellaneous', 'Miscellaneous', 'Area and P...","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","['Old Age Pensioners, No.of Pensioners under S..."
4,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1.4,1.4.1,"['population', 'living', 'access', 'proportion...",proportion population living household access ...,62,"[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65....","[2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, ...",['Population and percentage share to total Pop...,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","['Area and Population', 'Area and Population',...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","['Geographical Area Sq.Kms, Population and per..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",Total amount of approved funding to promote th...,16.4,16.4.1,"['approved', 'sound', 'development', 'dissemin...",total amount approved funding promote developm...,303,"[8.0, 11.0, 17.0, 30.0, 33.0, 36.0, 39.0, 42.0...","[1.2, 1.2, 1.3, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, ...",['No.of Taluks Inhabited Un-inhabited and Tota...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, ...","['General Information', 'General Information',...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","['Cities Towns Urban, No.of Taluks Inhabited U..."
165,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,Proportion of individuals using the Internet.,16.5,16.5.1,"['internet', 'using', 'proportion', 'individual']",proportion individual using internet,2,"[533.0, 872.0]","[9.4, 14.2]",['Post Office Telephones Exchanges Telephones....,"[9, 14]","['TRANSPORT AND COMMUNICATION', 'Rural Develop...","[1, 1]","['Internet Connections,Communication In Number..."
166,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",Proportion of sustainable development indicato...,17.1,17.1.1,"['disaggregation', 'state', 'development', 'pr...",proportion sustainable development indicator p...,8,"[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874...","[4.18, 5.4, 6.5, 9.2, 14.2, 14.2, 14.2, 14.2]","['Sericulture Year 2016-17 in Nos.', 'Fisherie...","[4, 5, 6, 9, 14, 14, 14, 14]","['Agriculture, Horticulture and Sericulture', ...","[1, 1, 1, 1, 1, 1, 1, 1]","['Value of Silk Produced Rs. in lakhs, Sericul..."
167,17,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Dollar value of all resources made available t...,17.2,17.2.1,"['available', 'statistical', 'strengthen', 'ma...",dollar value resource made available strengthe...,4,"[27.0, 339.0, 384.0, 386.0]","[1.5, 4.18, 5.4, 5.4]","['No.of Fire Stations on In Nos', 'Sericulture...","[1, 4, 5, 5]","['General Information', 'Agriculture, Horticul...","[1, 1, 1, 1]",['Value of the property protected Rs.in Crores...


In [100]:
# Load the attribute catalogue (semicolon-separated; path is relative to the working directory).
data = pd.read_csv("Attributes_2016-17.csv", sep=";")

In [101]:
# NOTE: concatenate_table_name modifies its argument in place and returns it,
# so `data_new` and `data` are the same, already-modified DataFrame.
data_new = concatenate_table_name(data)
data_new

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description
0,3.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"Nada Offices, Nada Offices Village Accountant ..."
1,4.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"Va Circles, Nada Offices Village Accountant Ci..."
2,5.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"Hoblies, Nada Offices Village Accountant Circl..."
3,6.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"Grama Panchayaths, Nada Offices Village Accoun..."
4,7.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,"No.of Taluks, Nada Offices Village Accountant ..."
...,...,...,...,...,...,...
1162,1169.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Total,Number,Total Agrl. Land Holder Total"
1163,1170.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Male,Area,Total Agrl. Land Holder Total"
1164,1171.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Female,Area,Total Agrl. Land Holder Total"
1165,1172.0,18,Additional information,18.1,Agricultural Land Holdings And Area per 2010-1...,"Institutions,Area,Total Agrl. Land Holder Total"


In [62]:
# Load the saved similarity-ranked matches; the columns named in `converters`
# are turned back from their text form into Python objects.
# NOTE(review): pd.eval evaluates the cell text as an expression. For data that
# is not fully trusted, ast.literal_eval is the safer way to parse list literals.
sdg_sim = pd.read_csv("Att_Ind_Sim.csv", sep=";", converters={'top_n_attr': pd.eval, 'top_n_table_id': pd.eval, 'top_n_chapter_id': pd.eval,'top_n_similarities': pd.eval})
sdg_sim

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id,Indicator_id,new_indicators,new_indicators_str,top_n_count,top_n_attr,top_n_table_id,top_n_table_name,top_n_chapter_id,top_n_chapter_name,top_n_similarities,top_n_description
0,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1.1,1.1.1,"['age', 'urban', 'international', 'population'...",proportion population international poverty li...,10,"[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[2.2, 2.2, 2.3, 2.3, 2.1, 4.1, 2.3, 2.3, 10.8,...",['Rural and Urban Population and Decadal chang...,"[2, 2, 2, 2, 2, 4, 2, 2, 10, 2]","['Area and Population', 'Area and Population',...","[0.2747968656671603, 0.268150135206932, 0.2240...","['Urban,Population 2011', 'Rural,Population 20..."
1,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1.2,1.2.1,"['age', 'population', 'sex', 'living', 'propor...",proportion population living national poverty ...,10,"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[10.8, 10.8, 10.8, 10.8, 10.8, 10.8, 2.1, 2.2,...",['No .of Children in the age group 6 to 14 yea...,"[10, 10, 10, 10, 10, 10, 2, 2, 2, 2]","['Education', 'Education', 'Education', 'Educa...","[0.26541303169689845, 0.24930426228926683, 0.2...","['Total,No.of Children out of school age group..."
2,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...","Proportion of men, women and children of all a...",1.2,1.2.2,"['child', 'age', 'dimension', 'men', 'accordin...",proportion men woman child age living poverty ...,10,"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[10.8, 10.11, 10.8, 10.8, 10.8, 10.8, 10.8, 14...",['No .of Children in the age group 6 to 14 yea...,"[10, 10, 10, 10, 10, 10, 10, 14, 10, 2]","['Education', 'Education', 'Education', 'Educa...","[0.22885060867758872, 0.22697420351274686, 0.2...","['Total,No.of Children out of school age group..."
3,1,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1.3,1.3.1,"['population', 'disability', 'percentage', 'co...",percentage population covered social protectio...,10,"[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[17.2, 13.5, 2.4, 17.2, 2.4, 2.4, 10.8, 2.5, 2...",['No.of Pensioners under Social Schemes As on ...,"[17, 13, 2, 17, 2, 2, 10, 2, 2, 10]","['Miscellaneous', 'Women & Child Development',...","[0.1807125515613829, 0.16616236668605194, 0.16...","['Old Age Pensioners, No.of Pensioners under S..."
4,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1.4,1.4.1,"['population', 'living', 'access', 'proportion...",proportion population living household access ...,10,"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[6.4, 2.1, 2.2, 15.2, 15.3, 6.4, 2.1, 2.1, 9.1...",['Small Scale units registered in District Ind...,"[6, 2, 2, 15, 15, 6, 2, 2, 9, 2]","['Industries', 'Area and Population', 'Area an...","[0.24260399548371742, 0.23157152877495513, 0.2...","['No.,Other Service Activities', 'Total,Popula..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",Total amount of approved funding to promote th...,16.4,16.4.1,"['approved', 'sound', 'development', 'dissemin...",total amount approved funding promote developm...,10,"[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[2.11, 2.11, 2.11, 6.4, 18.1, 18.1, 6.4, 11.1,...",['Districtwise Population By Age Groups And Se...,"[2, 2, 2, 6, 18, 18, 6, 11, 14, 14]","['Area and Population', 'Area and Population',...","[0.10471104174147035, 0.10471104174147035, 0.1...","['Total,0-14', 'Total,15-59', 'Total,60', 'No...."
165,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,Proportion of individuals using the Internet.,16.5,16.5.1,"['internet', 'using', 'proportion', 'individual']",proportion individual using internet,10,"[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9...","[9.4, 14.2, 1.1, 1.1, 1.1, 1.1, 1.1, 1.2, 1.2,...",['Post Office Telephones Exchanges Telephones....,"[9, 14, 1, 1, 1, 1, 1, 1, 1, 1]","['TRANSPORT AND COMMUNICATION', 'Rural Develop...","[0.2825673271164115, 0.17691126490402242, 0.0,...","['Internet Connections,Communication In Number..."
166,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",Proportion of sustainable development indicato...,17.1,17.1.1,"['disaggregation', 'state', 'development', 'pr...",proportion sustainable development indicator p...,10,"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[4.18, 5.4, 9.2, 6.5, 14.2, 14.2, 14.2, 14.2, ...","['Sericulture Year 2016-17 in Nos.', 'Fisherie...","[4, 5, 9, 6, 14, 14, 14, 14, 1, 1]","['Agriculture, Horticulture and Sericulture', ...","[0.11681628897132912, 0.11403210208777811, 0.0...","['Value of Silk Produced Rs. in lakhs, Sericul..."
167,17,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Dollar value of all resources made available t...,17.2,17.2.1,"['available', 'statistical', 'strengthen', 'ma...",dollar value resource made available strengthe...,10,"[386.0, 384.0, 339.0, 27.0, 3.0, 4.0, 5.0, 6.0...","[5.4, 5.4, 4.18, 1.5, 1.1, 1.1, 1.1, 1.1, 1.1,...","['Fisheries Year 2016-17', 'Fisheries Year 201...","[5, 5, 4, 1, 1, 1, 1, 1, 1, 1]","['Animal Husbandry', 'Animal Husbandry', 'Agri...","[0.1787601212421857, 0.1635585802785323, 0.122...","['Capacity Tonnes,Cold storages', 'Capacity To..."


In [63]:
# Load the saved threshold-filtered matches; the columns named in `converters`
# are turned back from their text form into Python objects.
# NOTE(review): pd.eval evaluates the cell text as an expression. For data that
# is not fully trusted, ast.literal_eval is the safer way to parse list literals.
sdg_threshold = pd.read_csv("Att_Ind_Threshold.csv", sep=";", converters={'top_n_attr': pd.eval, 'top_n_table_id': pd.eval, 'top_n_chapter_id': pd.eval,'top_n_similarities': pd.eval})
sdg_threshold

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id,Indicator_id,new_indicators,new_indicators_str,top_n_count,top_n_attr,top_n_table_id,top_n_table_name,top_n_chapter_id,top_n_chapter_name,top_n_similarities,top_n_description
0,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1.1,1.1.1,"['age', 'urban', 'international', 'population'...",proportion population international poverty li...,128,"[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[2.2, 2.2, 2.3, 2.3, 2.1, 4.1, 2.3, 2.3, 10.8,...",['Rural and Urban Population and Decadal chang...,"[2, 2, 2, 2, 2, 4, 2, 2, 10, 2, 10, 10, 10, 2,...","['Area and Population', 'Area and Population',...","[0.2747968656671603, 0.268150135206932, 0.2240...","['Urban,Population 2011', 'Rural,Population 20..."
1,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1.2,1.2.1,"['age', 'population', 'sex', 'living', 'propor...",proportion population living national poverty ...,70,"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[10.8, 10.8, 10.8, 10.8, 10.8, 10.8, 2.1, 2.2,...",['No .of Children in the age group 6 to 14 yea...,"[10, 10, 10, 10, 10, 10, 2, 2, 2, 2, 2, 2, 2, ...","['Education', 'Education', 'Education', 'Educa...","[0.26541303169689845, 0.24930426228926683, 0.2...","['Total,No.of Children out of school age group..."
2,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...","Proportion of men, women and children of all a...",1.2,1.2.2,"['child', 'age', 'dimension', 'men', 'accordin...",proportion men woman child age living poverty ...,41,"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[10.8, 10.11, 10.8, 10.8, 10.8, 10.8, 10.8, 14...",['No .of Children in the age group 6 to 14 yea...,"[10, 10, 10, 10, 10, 10, 10, 14, 10, 2, 2, 2, ...","['Education', 'Education', 'Education', 'Educa...","[0.22885060867758872, 0.22697420351274686, 0.2...","['Total,No.of Children out of school age group..."
3,1,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1.3,1.3.1,"['population', 'disability', 'percentage', 'co...",percentage population covered social protectio...,78,"[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[17.2, 13.5, 2.4, 17.2, 2.4, 2.4, 10.8, 2.5, 2...",['No.of Pensioners under Social Schemes As on ...,"[17, 13, 2, 17, 2, 2, 10, 2, 2, 10, 10, 2, 2, ...","['Miscellaneous', 'Women & Child Development',...","[0.1807125515613829, 0.16616236668605194, 0.16...","['Old Age Pensioners, No.of Pensioners under S..."
4,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1.4,1.4.1,"['population', 'living', 'access', 'proportion...",proportion population living household access ...,62,"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[6.4, 2.1, 2.2, 15.2, 15.3, 6.4, 2.1, 2.1, 9.1...",['Small Scale units registered in District Ind...,"[6, 2, 2, 15, 15, 6, 2, 2, 9, 2, 15, 2, 2, 9, ...","['Industries', 'Area and Population', 'Area an...","[0.24260399548371742, 0.23157152877495513, 0.2...","['No.,Other Service Activities', 'Total,Popula..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",Total amount of approved funding to promote th...,16.4,16.4.1,"['approved', 'sound', 'development', 'dissemin...",total amount approved funding promote developm...,33,"[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[2.11, 2.11, 2.11, 6.4, 18.1, 18.1, 6.4, 11.1,...",['Districtwise Population By Age Groups And Se...,"[2, 2, 2, 6, 18, 18, 6, 11, 14, 14, 14, 14, 18...","['Area and Population', 'Area and Population',...","[0.10471104174147035, 0.10471104174147035, 0.1...","['Total,0-14', 'Total,15-59', 'Total,60', 'No...."
165,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,Proportion of individuals using the Internet.,16.5,16.5.1,"['internet', 'using', 'proportion', 'individual']",proportion individual using internet,2,"[533.0, 872.0]","[9.4, 14.2]",['Post Office Telephones Exchanges Telephones....,"[9, 14]","['TRANSPORT AND COMMUNICATION', 'Rural Develop...","[0.2825673271164115, 0.17691126490402242]","['Internet Connections,Communication In Number..."
166,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",Proportion of sustainable development indicato...,17.1,17.1.1,"['disaggregation', 'state', 'development', 'pr...",proportion sustainable development indicator p...,8,"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[4.18, 5.4, 9.2, 6.5, 14.2, 14.2, 14.2, 14.2]","['Sericulture Year 2016-17 in Nos.', 'Fisherie...","[4, 5, 9, 6, 14, 14, 14, 14]","['Agriculture, Horticulture and Sericulture', ...","[0.11681628897132912, 0.11403210208777811, 0.0...","['Value of Silk Produced Rs. in lakhs, Sericul..."
167,17,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Dollar value of all resources made available t...,17.2,17.2.1,"['available', 'statistical', 'strengthen', 'ma...",dollar value resource made available strengthe...,4,"[386.0, 384.0, 339.0, 27.0]","[5.4, 5.4, 4.18, 1.5]","['Fisheries Year 2016-17', 'Fisheries Year 201...","[5, 5, 4, 1]","['Animal Husbandry', 'Animal Husbandry', 'Agri...","[0.1787601212421857, 0.1635585802785323, 0.122...","['Capacity Tonnes,Cold storages', 'Capacity To..."


In [64]:
# Work on copies so the loaded frames stay untouched by the steps below.
result_sdg_new =sdg_string.copy()
result_sdg_sim_new =sdg_sim.copy()
result_sdg_threshold_new = sdg_threshold.copy()

In [65]:
# Keep only the identifier columns and the matched attribute ids. The explicit
# .copy() makes this an independent frame, so the in-place rename and the
# column assignments that follow do not trigger SettingWithCopyWarning.
result_sdg_new = result_sdg_new[['Goal No.', 'Target_id', 'Indicator_id', 'top_n_attr']].copy()

In [66]:
result_sdg_new.head()

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr
0,1,1.1,1.1.1,"[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77...."
1,1,1.2,1.2.1,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963..."
2,1,1.2,1.2.2,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963..."
3,1,1.3,1.3.1,"[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7..."
4,1,1.4,1.4.1,"[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65...."


In [67]:
# Keep only the identifier columns and the matched attribute ids.
result_sdg_sim_new = result_sdg_sim_new[['Goal No.', 'Target_id', 'Indicator_id', 'top_n_attr']]

In [68]:
result_sdg_sim_new.head()

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr
0,1,1.1,1.1.1,"[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74..."
1,1,1.2,1.2.1,"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59...."
2,1,1.2,1.2.2,"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575..."
3,1,1.3,1.3.1,"[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,..."
4,1,1.4,1.4.1,"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,..."


In [69]:
# Keep only the identifier columns and the matched attribute ids.
result_sdg_threshold_new = result_sdg_threshold_new[['Goal No.', 'Target_id', 'Indicator_id', 'top_n_attr']]

In [70]:
result_sdg_threshold_new.head()

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr
0,1,1.1,1.1.1,"[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74..."
1,1,1.2,1.2.1,"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59...."
2,1,1.2,1.2.2,"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575..."
3,1,1.3,1.3.1,"[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,..."
4,1,1.4,1.4.1,"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,..."


In [71]:
# Rename into a new frame instead of mutating in place: result_sdg_new was
# produced by a column selection, and in-place edits on such a frame are what
# pandas reports as SettingWithCopyWarning.
result_sdg_new = result_sdg_new.rename(columns={'top_n_attr': 'top_n_attr_A'})

In [72]:
# Add the attribute lists of the threshold-based (B) and similarity-based (C)
# results as new columns; the assigned Series are aligned on the row index.
result_sdg_new['top_n_attr_B'] = result_sdg_threshold_new['top_n_attr']
result_sdg_new['top_n_attr_C'] = result_sdg_sim_new['top_n_attr']

In [73]:
result_sdg_new

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_A,top_n_attr_B,top_n_attr_C
0,1,1.1,1.1.1,"[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77....","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74..."
1,1,1.2,1.2.1,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59...."
2,1,1.2,1.2.2,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575..."
3,1,1.3,1.3.1,"[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,..."
4,1,1.4,1.4.1,"[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65....","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,..."
...,...,...,...,...,...,...
164,16,16.4,16.4.1,"[8.0, 11.0, 17.0, 30.0, 33.0, 36.0, 39.0, 42.0...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4..."
165,16,16.5,16.5.1,"[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9..."
166,17,17.1,17.1.1,"[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875..."
167,17,17.2,17.2.1,"[27.0, 339.0, 384.0, 386.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0, 3.0, 4.0, 5.0, 6.0..."


In [74]:
def combine_lists(row):
    """Merge a row's B and C attribute lists into one duplicate-free list.

    Parameters
    ----------
    row : mapping
        Row providing list values under 'top_n_attr_B' and 'top_n_attr_C'.

    Returns
    -------
    list
        All entries of B followed by all entries of C, keeping only the
        first occurrence of each value.
    """
    merged = row['top_n_attr_B'] + row['top_n_attr_C']
    # Dictionary keys are unique and keep insertion order, so building a dict
    # from the concatenation drops repeats without reordering the survivors.
    return [*dict.fromkeys(merged)]

# Run combine_lists once per row (axis=1) and store each returned list in a
# new 'B_plus_C' column.
result_sdg_new['B_plus_C'] = result_sdg_new.apply(combine_lists, axis=1)

In [75]:
# Display the frame to inspect the newly added 'B_plus_C' column.
result_sdg_new

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C
0,1,1.1,1.1.1,"[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77....","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74..."
1,1,1.2,1.2.1,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59...."
2,1,1.2,1.2.2,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575..."
3,1,1.3,1.3.1,"[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,..."
4,1,1.4,1.4.1,"[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65....","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,..."
...,...,...,...,...,...,...,...
164,16,16.4,16.4.1,"[8.0, 11.0, 17.0, 30.0, 33.0, 36.0, 39.0, 42.0...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4..."
165,16,16.5,16.5.1,"[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9...","[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9..."
166,17,17.1,17.1.1,"[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875..."
167,17,17.2,17.2.1,"[27.0, 339.0, 384.0, 386.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0, 3.0, 4.0, 5.0, 6.0...","[386.0, 384.0, 339.0, 27.0, 3.0, 4.0, 5.0, 6.0..."


In [76]:
# Take a separate copy of the frame as it stands at this point; columns added
# to result_sdg_new afterwards do not appear in this copy.
# NOTE(review): the variable shares its name with the 'B_plus_C' column.
B_plus_C = result_sdg_new.copy()

In [77]:
# Export the snapshot to "B_plus_C.csv" with ';' as the field separator.
# NOTE(review): index is not disabled here, so the row index is written as an
# extra leading column, unlike the later exports that pass index=False.
B_plus_C.to_csv("B_plus_C.csv", sep=';')

In [80]:
# Function to find intersection while maintaining order
def find_intersection(row):
    """Return the entries of 'top_n_attr_A' that also occur in 'B_plus_C'.

    Parameters
    ----------
    row : mapping
        Row providing list values under 'top_n_attr_A' and 'B_plus_C'.
        Entries must be hashable.

    Returns
    -------
    list
        The shared entries, in the order (and with the multiplicity) they
        have in 'top_n_attr_A'.
    """
    ordered = row['top_n_attr_A']
    # Fetch 'B_plus_C' once and hash it: membership tests become constant
    # time instead of re-reading the row and rescanning the list for every
    # entry of A.
    allowed = set(row['B_plus_C'])
    return [x for x in ordered if x in allowed]


# Run find_intersection once per row (axis=1) and store each returned list
# in a new 'A_and_BC' column.
result_sdg_new['A_and_BC'] = result_sdg_new.apply(find_intersection, axis=1)


In [81]:
# Display the frame to inspect the newly added 'A_and_BC' column.
result_sdg_new

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC
0,1,1.1,1.1.1,"[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77....","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77...."
1,1,1.2,1.2.1,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963..."
2,1,1.2,1.2.2,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963..."
3,1,1.3,1.3.1,"[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7..."
4,1,1.4,1.4.1,"[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65....","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65...."
...,...,...,...,...,...,...,...,...
164,16,16.4,16.4.1,"[8.0, 11.0, 17.0, 30.0, 33.0, 36.0, 39.0, 42.0...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[48.0, 59.0, 64.0, 122.0, 125.0, 128.0, 442.0,..."
165,16,16.5,16.5.1,"[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9...","[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9...","[533.0, 872.0]"
166,17,17.1,17.1.1,"[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874..."
167,17,17.2,17.2.1,"[27.0, 339.0, 384.0, 386.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0, 3.0, 4.0, 5.0, 6.0...","[386.0, 384.0, 339.0, 27.0, 3.0, 4.0, 5.0, 6.0...","[27.0, 339.0, 384.0, 386.0]"


In [82]:
def find_difference(row):
    """Return the entries of 'top_n_attr_A' that are not in 'A_and_BC'.

    Parameters
    ----------
    row : mapping
        Row providing list values under 'top_n_attr_A' and 'A_and_BC'.
        Entries must be hashable.

    Returns
    -------
    list
        Entries of 'top_n_attr_A' absent from 'A_and_BC', in their original
        order.
    """
    top_n_attr = row['top_n_attr_A']
    # Only 'A_and_BC' is consulted here. Hashing it once keeps each
    # membership test constant time instead of a scan of the whole list.
    shared = set(row['A_and_BC'])

    return [x for x in top_n_attr if x not in shared]
# Run find_difference once per row (axis=1); the new 'A' column holds the
# 'top_n_attr_A' entries that are absent from 'A_and_BC'.
result_sdg_new['A'] = result_sdg_new.apply(find_difference, axis=1)
# Bare last expression so the notebook renders the updated frame.
result_sdg_new

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC,A
0,1,1.1,1.1.1,"[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77....","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77....","[8.0, 523.0, 524.0, 525.0, 872.0, 873.0, 874.0..."
1,1,1.2,1.2.1,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...",[]
2,1,1.2,1.2.2,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...",[]
3,1,1.3,1.3.1,"[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7...","[624.0, 671.0, 840.0, 841.0, 842.0, 863.0]"
4,1,1.4,1.4.1,"[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65....","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65....",[]
...,...,...,...,...,...,...,...,...,...
164,16,16.4,16.4.1,"[8.0, 11.0, 17.0, 30.0, 33.0, 36.0, 39.0, 42.0...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[48.0, 59.0, 64.0, 122.0, 125.0, 128.0, 442.0,...","[8.0, 11.0, 17.0, 30.0, 33.0, 36.0, 39.0, 42.0..."
165,16,16.5,16.5.1,"[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9...","[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9...","[533.0, 872.0]",[]
166,17,17.1,17.1.1,"[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874...",[]
167,17,17.2,17.2.1,"[27.0, 339.0, 384.0, 386.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0, 3.0, 4.0, 5.0, 6.0...","[386.0, 384.0, 339.0, 27.0, 3.0, 4.0, 5.0, 6.0...","[27.0, 339.0, 384.0, 386.0]",[]


In [83]:
# Function to find intersection while maintaining order
def find_intersection_BC(row):
    """Return the entries of 'top_n_attr_B' that also occur in 'top_n_attr_C'.

    Parameters
    ----------
    row : mapping
        Row providing list values under 'top_n_attr_B' and 'top_n_attr_C'.
        Entries must be hashable.

    Returns
    -------
    list
        The shared entries, in the order (and with the multiplicity) they
        have in 'top_n_attr_B'.
    """
    ordered = row['top_n_attr_B']
    # Fetch 'top_n_attr_C' once and hash it: membership tests become constant
    # time instead of re-reading the row and rescanning the list for every
    # entry of B.
    allowed = set(row['top_n_attr_C'])
    return [x for x in ordered if x in allowed]


# Run find_intersection_BC once per row (axis=1) and store each returned
# list in a new 'B_and_C' column.
result_sdg_new['B_and_C'] = result_sdg_new.apply(find_intersection_BC, axis=1)
# Bare last expression so the notebook renders the updated frame.
result_sdg_new

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC,A,B_and_C
0,1,1.1,1.1.1,"[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77....","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77....","[8.0, 523.0, 524.0, 525.0, 872.0, 873.0, 874.0...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74..."
1,1,1.2,1.2.1,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...",[],"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59...."
2,1,1.2,1.2.2,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...",[],"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575..."
3,1,1.3,1.3.1,"[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7...","[624.0, 671.0, 840.0, 841.0, 842.0, 863.0]","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,..."
4,1,1.4,1.4.1,"[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65....","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65....",[],"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,..."
...,...,...,...,...,...,...,...,...,...,...
164,16,16.4,16.4.1,"[8.0, 11.0, 17.0, 30.0, 33.0, 36.0, 39.0, 42.0...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[48.0, 59.0, 64.0, 122.0, 125.0, 128.0, 442.0,...","[8.0, 11.0, 17.0, 30.0, 33.0, 36.0, 39.0, 42.0...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4..."
165,16,16.5,16.5.1,"[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9...","[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9...","[533.0, 872.0]",[],"[533.0, 872.0]"
166,17,17.1,17.1.1,"[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874...",[],"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875..."
167,17,17.2,17.2.1,"[27.0, 339.0, 384.0, 386.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0, 3.0, 4.0, 5.0, 6.0...","[386.0, 384.0, 339.0, 27.0, 3.0, 4.0, 5.0, 6.0...","[27.0, 339.0, 384.0, 386.0]",[],"[386.0, 384.0, 339.0, 27.0]"


In [89]:
def find_difference_B(row):
    """Return the entries of 'top_n_attr_B' not already held by other columns.

    An entry is dropped when it appears in any of the row's 'A', 'B_and_C'
    or 'A_and_BC' lists; the remaining entries keep their order from
    'top_n_attr_B'.
    """
    candidates = row['top_n_attr_B']
    # Union of everything the earlier columns already account for.
    already_covered = set(row['A'] + row['B_and_C'] + row['A_and_BC'])

    leftovers = []
    for attr in candidates:
        if attr not in already_covered:
            leftovers.append(attr)
    return leftovers
# Run find_difference_B once per row (axis=1) and store each returned list
# in a new 'B_minus_all' column.
# NOTE(review): the saved output of this cell already lists 'C_minus_all' and
# 'Rank', which are only created by later cells, so the output was captured
# on a re-run rather than in top-to-bottom order.
result_sdg_new['B_minus_all'] = result_sdg_new.apply(find_difference_B, axis=1)
# Bare last expression so the notebook renders the updated frame.
result_sdg_new

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC,A,B_and_C,B_minus_all,C_minus_all,Rank
0,1,1.1,1.1.1,"[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77....","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77....","[8.0, 523.0, 524.0, 525.0, 872.0, 873.0, 874.0...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...",[],[],"[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77...."
1,1,1.2,1.2.1,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...",[],"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....",[],[],"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963..."
2,1,1.2,1.2.2,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...",[],"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...",[],[],"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963..."
3,1,1.3,1.3.1,"[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7...","[624.0, 671.0, 840.0, 841.0, 842.0, 863.0]","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...",[],[],"[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7..."
4,1,1.4,1.4.1,"[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65....","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65....",[],"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...",[],[],"[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,16,16.4,16.4.1,"[8.0, 11.0, 17.0, 30.0, 33.0, 36.0, 39.0, 42.0...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[48.0, 59.0, 64.0, 122.0, 125.0, 128.0, 442.0,...","[8.0, 11.0, 17.0, 30.0, 33.0, 36.0, 39.0, 42.0...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...",[],[],"[48.0, 59.0, 64.0, 122.0, 125.0, 128.0, 442.0,..."
165,16,16.5,16.5.1,"[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9...","[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9...","[533.0, 872.0]",[],"[533.0, 872.0]",[],[],"[533.0, 872.0]"
166,17,17.1,17.1.1,"[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874...",[],"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...",[],[],"[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874..."
167,17,17.2,17.2.1,"[27.0, 339.0, 384.0, 386.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0, 3.0, 4.0, 5.0, 6.0...","[386.0, 384.0, 339.0, 27.0, 3.0, 4.0, 5.0, 6.0...","[27.0, 339.0, 384.0, 386.0]",[],"[386.0, 384.0, 339.0, 27.0]",[],[],"[27.0, 339.0, 384.0, 386.0]"


In [90]:
def find_difference_C(row):
    """Return the entries of 'top_n_attr_C' not already held by other columns.

    An entry is dropped when it appears in any of the row's 'A', 'B_and_C',
    'A_and_BC' or 'B_minus_all' lists; the remaining entries keep their
    order from 'top_n_attr_C'.
    """
    c_entries = row['top_n_attr_C']
    # Union of everything the earlier columns already account for.
    taken = set(
        row['A'] + row['B_and_C'] + row['A_and_BC'] + row['B_minus_all']
    )
    return [entry for entry in c_entries if entry not in taken]
# Run find_difference_C once per row (axis=1) and store each returned list
# in the 'C_minus_all' column.
# NOTE(review): the saved output already contains a 'Rank' column, which is
# only created by a later cell, so this output comes from a re-run.
result_sdg_new['C_minus_all'] = result_sdg_new.apply(find_difference_C, axis=1)
# Bare last expression so the notebook renders the updated frame.
result_sdg_new

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC,A,B_and_C,B_minus_all,C_minus_all,Rank
0,1,1.1,1.1.1,"[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77....","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77....","[8.0, 523.0, 524.0, 525.0, 872.0, 873.0, 874.0...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...",[],[],"[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77...."
1,1,1.2,1.2.1,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...",[],"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....",[],[],"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963..."
2,1,1.2,1.2.2,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...",[],"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...",[],[],"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963..."
3,1,1.3,1.3.1,"[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7...","[624.0, 671.0, 840.0, 841.0, 842.0, 863.0]","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...",[],[],"[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7..."
4,1,1.4,1.4.1,"[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65....","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65....",[],"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...",[],[],"[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,16,16.4,16.4.1,"[8.0, 11.0, 17.0, 30.0, 33.0, 36.0, 39.0, 42.0...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[48.0, 59.0, 64.0, 122.0, 125.0, 128.0, 442.0,...","[8.0, 11.0, 17.0, 30.0, 33.0, 36.0, 39.0, 42.0...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...",[],[],"[48.0, 59.0, 64.0, 122.0, 125.0, 128.0, 442.0,..."
165,16,16.5,16.5.1,"[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9...","[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9...","[533.0, 872.0]",[],"[533.0, 872.0]",[],"[3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]","[533.0, 872.0]"
166,17,17.1,17.1.1,"[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874...",[],"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...",[],"[3.0, 4.0]","[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874..."
167,17,17.2,17.2.1,"[27.0, 339.0, 384.0, 386.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0, 3.0, 4.0, 5.0, 6.0...","[386.0, 384.0, 339.0, 27.0, 3.0, 4.0, 5.0, 6.0...","[27.0, 339.0, 384.0, 386.0]",[],"[386.0, 384.0, 339.0, 27.0]",[],"[3.0, 4.0, 5.0, 6.0, 7.0, 8.0]","[27.0, 339.0, 384.0, 386.0]"


In [91]:
# Show the current column labels of result_sdg_new.
result_sdg_new.columns

Index(['Goal No.', 'Target_id', 'Indicator_id', 'top_n_attr_A', 'top_n_attr_B',
       'top_n_attr_C', 'B_plus_C', 'A_and_BC', 'A', 'B_and_C', 'B_minus_all',
       'C_minus_all', 'Rank'],
      dtype='object')

In [92]:
def final_rank(row):
    """Build the row's final ordered attribute list.

    The row's 'A_and_BC', 'A', 'B_and_C', 'B_minus_all' and 'C_minus_all'
    lists are concatenated in exactly that order, and only the first
    occurrence of each value is kept.
    """
    ordered_parts = (
        row['A_and_BC']
        + row['A']
        + row['B_and_C']
        + row['B_minus_all']
        + row['C_minus_all']
    )
    # Dictionary keys are unique and keep insertion order, which removes
    # repeats while leaving first occurrences in place.
    return [*dict.fromkeys(ordered_parts)]

# Run final_rank once per row (axis=1) and store each returned list in the
# 'Rank' column (an existing 'Rank' column is overwritten).
result_sdg_new['Rank'] = result_sdg_new.apply(final_rank, axis=1)
# Bare last expression so the notebook renders the updated frame.
result_sdg_new

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC,A,B_and_C,B_minus_all,C_minus_all,Rank
0,1,1.1,1.1.1,"[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77....","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...","[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77....","[8.0, 523.0, 524.0, 525.0, 872.0, 873.0, 874.0...","[66.0, 65.0, 72.0, 75.0, 58.0, 156.0, 71.0, 74...",[],[],"[58.0, 65.0, 66.0, 71.0, 72.0, 74.0, 75.0, 77...."
1,1,1.2,1.2.1,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....","[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...",[],"[580.0, 579.0, 578.0, 577.0, 576.0, 575.0, 59....",[],[],"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963..."
2,1,1.2,1.2.2,"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...","[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963...",[],"[580.0, 602.0, 579.0, 578.0, 577.0, 576.0, 575...",[],[],"[575.0, 576.0, 577.0, 578.0, 579.0, 580.0, 963..."
3,1,1.3,1.3.1,"[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...","[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7...","[624.0, 671.0, 840.0, 841.0, 842.0, 863.0]","[962.0, 857.0, 76.0, 963.0, 77.0, 78.0, 580.0,...",[],[],"[962.0, 963.0, 58.0, 62.0, 63.0, 76.0, 77.0, 7..."
4,1,1.4,1.4.1,"[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65....","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...","[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65....",[],"[440.0, 59.0, 64.0, 902.0, 908.0, 441.0, 61.0,...",[],[],"[58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,16,16.4,16.4.1,"[8.0, 11.0, 17.0, 30.0, 33.0, 36.0, 39.0, 42.0...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...","[48.0, 59.0, 64.0, 122.0, 125.0, 128.0, 442.0,...","[8.0, 11.0, 17.0, 30.0, 33.0, 36.0, 39.0, 42.0...","[122.0, 125.0, 128.0, 442.0, 1173.0, 1169.0, 4...",[],[],"[48.0, 59.0, 64.0, 122.0, 125.0, 128.0, 442.0,..."
165,16,16.5,16.5.1,"[533.0, 872.0]","[533.0, 872.0]","[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9...","[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9...","[533.0, 872.0]",[],"[533.0, 872.0]",[],"[3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]","[533.0, 872.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9..."
166,17,17.1,17.1.1,"[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...","[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874...",[],"[339.0, 389.0, 520.0, 446.0, 874.0, 873.0, 875...",[],"[3.0, 4.0]","[339.0, 389.0, 446.0, 520.0, 872.0, 873.0, 874..."
167,17,17.2,17.2.1,"[27.0, 339.0, 384.0, 386.0]","[386.0, 384.0, 339.0, 27.0]","[386.0, 384.0, 339.0, 27.0, 3.0, 4.0, 5.0, 6.0...","[386.0, 384.0, 339.0, 27.0, 3.0, 4.0, 5.0, 6.0...","[27.0, 339.0, 384.0, 386.0]",[],"[386.0, 384.0, 339.0, 27.0]",[],"[3.0, 4.0, 5.0, 6.0, 7.0, 8.0]","[27.0, 339.0, 384.0, 386.0, 3.0, 4.0, 5.0, 6.0..."


In [103]:
# Take a separate copy of the fully populated frame (including 'Rank') for
# the export and post-processing steps that follow.
final_df = result_sdg_new.copy()

In [104]:
# Export the ranking table to a file named "Ranking" (no extension),
# ';'-separated, without the row index.
# NOTE(review): the list-valued columns presumably end up as their string
# form in the file and need parsing when read back; verify with the consumer.
final_df.to_csv("Ranking", sep=';', index=False)

In [107]:
# process_result, sdg and data are defined outside this part of the notebook.
# NOTE(review): judging by the displayed outputs, all_goals looks like one row
# per attribute/indicator match and goals like per-Attr_id lists of matched
# goal, target and indicator ids; confirm against process_result.
# NOTE(review): some top_n_indicator_id values in those outputs carry a
# leading newline (e.g. '\n2.1.2'); presumably the indicator ids need
# stripping upstream.
goals, all_goals = process_result(final_df, sdg, data)

In [108]:
# Display all_goals, the second value returned by process_result.
all_goals

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description,Goal,Targets,Tentative Indicators,top_n_goal_id,top_n_target_id,top_n_indicator_id
0,58.0,2,Area and Population,2.1,Population and percentage share to total Popul...,"Geographical Area Sq.Kms, Population and perce...",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
1,58.0,2,Area and Population,2.1,Population and percentage share to total Popul...,"Geographical Area Sq.Kms, Population and perce...",End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1,1.2,1.2.1
2,58.0,2,Area and Population,2.1,Population and percentage share to total Popul...,"Geographical Area Sq.Kms, Population and perce...",End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1,1.3,1.3.1
3,58.0,2,Area and Population,2.1,Population and percentage share to total Popul...,"Geographical Area Sq.Kms, Population and perce...",End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1,1.4,1.4.1
4,58.0,2,Area and Population,2.1,Population and percentage share to total Popul...,"Geographical Area Sq.Kms, Population and perce...","End hunger, achieve food security and improved...","By 2030, end hunger and ensure access by all p...",Prevalence of moderate or severe food insecuri...,2,2.1,\n2.1.2
...,...,...,...,...,...,...,...,...,...,...,...,...
19384,21.0,1,General Information,1.4,Cinema Theatres Police Station Prisons and Pri...,"Prisons,Prisons and Prisoners 31-3-2017",Promote peaceful and inclusive societies for s...,Promote the rule of law and ensure equal acces...,Unsentenced detainees as a percentage of overa...,15,15.3,15.3.2
19385,660.0,11,Health & Family welfare Services,11.2,No.of Hospitals Doctors Beds As on 31-3-2017 I...,"Govt.,No.of Hospitals",Promote peaceful and inclusive societies for s...,Ensure public access to information and protec...,Percentage of Govt. Departments / Organization...,15,15.9,15.9.2
19386,663.0,11,Health & Family welfare Services,11.2,No.of Hospitals Doctors Beds As on 31-3-2017 I...,"No.of Govt. Doctors, No.of Hospitals Doctors B...",Promote peaceful and inclusive societies for s...,Ensure public access to information and protec...,Percentage of Govt. Departments / Organization...,15,15.9,15.9.2
19387,664.0,11,Health & Family welfare Services,11.2,No.of Hospitals Doctors Beds As on 31-3-2017 I...,"No.of beds in Govt. Hospitals, No.of Hospitals...",Promote peaceful and inclusive societies for s...,Ensure public access to information and protec...,Percentage of Govt. Departments / Organization...,15,15.9,15.9.2


In [109]:
# Display goals, the first value returned by process_result.
goals

Unnamed: 0,Attr_id,top_n_goal_id,top_n_target_id,top_n_indicator_id
0,58.0,"[1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, ...","[1.1, 1.2, 1.3, 1.4, 2.1, 2.3, 2.4, 2.4, 2.5, ...","[1.1.1, 1.2.1, 1.3.1, 1.4.1, \n2.1.2, 2.3.2, 2..."
1,65.0,"[1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 4, 4, 5, 6, 6, ...","[1.1, 1.2, 1.3, 1.4, 2.1, 3.3, 3.3, 3.5, 3.7, ...","[1.1.1, 1.2.1, 1.3.1, 1.4.1, \n2.1.2, 3.3.1, 3..."
2,66.0,"[1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 4, 4, 5, 6, 6, ...","[1.1, 1.2, 1.3, 1.4, 2.1, 3.3, 3.3, 3.5, 3.7, ...","[1.1.1, 1.2.1, 1.3.1, 1.4.1, \n2.1.2, 3.3.1, 3..."
3,71.0,"[1, 1, 1, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, ...","[1.1, 1.2, 1.3, 3.3, 4.1, 4.2, 4.5, 4.6, 4.7, ...","[1.1.1, 1.2.1, 1.3.1, 3.3.1, 4.1.1, 4.2.1, 4.5..."
4,72.0,"[1, 1, 1, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, ...","[1.1, 1.2, 1.3, 3.3, 4.1, 4.2, 4.5, 4.6, 4.7, ...","[1.1.1, 1.2.1, 1.3.1, 3.3.1, 4.1.1, 4.2.1, 4.5..."
...,...,...,...,...
1075,21.0,[15],[15.3],[15.3.2]
1076,660.0,[15],[15.9],[15.9.2]
1077,663.0,[15],[15.9],[15.9.2]
1078,664.0,[15],[15.9],[15.9.2]


In [110]:
# Export all_goals to a file named "Ranking_all_goals" (no extension),
# ';'-separated, without the row index.
all_goals.to_csv("Ranking_all_goals", sep=';', index=False)

In [111]:
# Export goals to a file named "Ranking_goals" (no extension), ';'-separated,
# without the row index.
goals.to_csv("Ranking_goals", sep=';', index=False)