In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import re
import matplotlib.pyplot as plt
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package punkt to /home/riya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/riya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/riya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  "class": algorithms.Blowfish,


In [2]:
# Define a custom function to remove word before 1st space and excess white spaces
def process_string(input_string):
    # Find the index of the first space
    first_space_index = input_string.find(' ')
    
    # Remove the word before the first space
    if first_space_index != -1:
        input_string = input_string[first_space_index+1:]
    
    # Remove excess white spaces
    input_string = ' '.join(input_string.split())
    
    return input_string

In [3]:
# Creating our tokenizer function
def cleaning(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    stopwords = nltk.corpus.stopwords.words('english')

    cleaned = []
    for i in range(len(sentence)):
        #non English character
        letters = re.sub('[^a-zA-Z]', ' ', sentence[i])
        #print("Non English : ",letters)

        #lowercase
        letters = letters.lower().split() 

        wn = nltk.WordNetLemmatizer()
        letters = [wn.lemmatize(w) for w in letters if w not in stopwords ]

        #ignore_words
        letters = ' '.join(letters)
        cleaned.append(letters)

    # return preprocessed list of tokens
    return cleaned

In [4]:
sdg = pd.read_csv("sdg_data_excel.csv")

In [5]:
sdg

Unnamed: 0,Goal No.,Goal,Nodal Department,Targets,Other Related Major Departments,Tentative Indicators,Target_id,Indicator_id
0,1,End poverty in all its forms everywhere,Rural Development,"1.1 By 2030, eradicate extreme poverty for ...","Urban Development, Agriculture, Horticulture, ...",1.1.1 Proportion of the population below...,1.1,1.1.1
1,1,End poverty in all its forms everywhere,Rural Development,"1.2 By 2030, reduce at least by ...","Urban Development, Agriculture, Horticulture, ...",1.2.1 Proportion of the population livin...,1.2,1.2.1
2,1,End poverty in all its forms everywhere,Rural Development,"1.2 By 2030, reduce at least by ...","Urban Development, Agriculture, Horticulture, ...","1.2.2 Proportion of men, women and...",1.2,1.2.2
3,1,End poverty in all its forms everywhere,Rural Development,1.3 Implement nationally appropriate so...,"Urban Development, Agriculture, Horticulture, ...",1.3.1 Percentage of the populati...,1.3,1.3.1
4,1,End poverty in all its forms everywhere,Rural Development,"1.4 By 2030, ensure that all men and women,...","Urban Development, Agriculture, Horticulture, ...",1.4.1 Proportion of the population living in ...,1.4,1.4.1
...,...,...,...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,Finance,"16.4 Promote the development, tran...",Environment and Scientific Technology.,16.4.1 Total amount of approved funding t...,16.4,16.4.1
165,16,Strengthen the means of implementation and rev...,Finance,16.5 Fully operationalize the technology ...,"Environment and Scientific Technology, Inform...",16.5.1 Proportion of individuals using th...,16.5,16.5.1
166,17,"Data, monitoring and accountability",Finance,"17.1 By 2020, enhance capacity-buil...","Planning, Finance, Economic and Statistics.",17.1.1 Proportion of sustain...,17.1,17.1.1
167,17,"Data, monitoring and accountability",Finance,"17.2 By 2030, build on existing initiati...","Planning, Finance, Economic and Statistics.",17.2.1 Dollar value of all resources made avai...,17.2,17.2.1


In [6]:
sdg.columns

Index(['Goal No.', 'Goal', 'Nodal Department', 'Targets',
       'Other Related Major Departments', 'Tentative Indicators', 'Target_id',
       'Indicator_id'],
      dtype='object')

In [7]:
sdg.drop(['Nodal Department',
       'Other Related Major Departments',
       'Indicator_id'],axis=1,inplace=True)

In [8]:
# Remove duplicate rows
sdg = sdg.drop_duplicates()

In [9]:
sdg.reset_index(inplace = True, drop = True)

In [10]:
sdg

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id
0,1,End poverty in all its forms everywhere,"1.1 By 2030, eradicate extreme poverty for ...",1.1.1 Proportion of the population below...,1.1
1,1,End poverty in all its forms everywhere,"1.2 By 2030, reduce at least by ...",1.2.1 Proportion of the population livin...,1.2
2,1,End poverty in all its forms everywhere,"1.2 By 2030, reduce at least by ...","1.2.2 Proportion of men, women and...",1.2
3,1,End poverty in all its forms everywhere,1.3 Implement nationally appropriate so...,1.3.1 Percentage of the populati...,1.3
4,1,End poverty in all its forms everywhere,"1.4 By 2030, ensure that all men and women,...",1.4.1 Proportion of the population living in ...,1.4
...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,"16.4 Promote the development, tran...",16.4.1 Total amount of approved funding t...,16.4
165,16,Strengthen the means of implementation and rev...,16.5 Fully operationalize the technology ...,16.5.1 Proportion of individuals using th...,16.5
166,17,"Data, monitoring and accountability","17.1 By 2020, enhance capacity-buil...",17.1.1 Proportion of sustain...,17.1
167,17,"Data, monitoring and accountability","17.2 By 2030, build on existing initiati...",17.2.1 Dollar value of all resources made avai...,17.2


In [11]:
# Apply the custom function to each cell in the 'Column' column
sdg['Tentative Indicators'] = sdg['Tentative Indicators'].apply(process_string)
sdg['Targets'] = sdg['Targets'].apply(process_string) 
sdg

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id
0,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1.1
1,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1.2
2,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...","Proportion of men, women and children of all a...",1.2
3,1,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1.3
4,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1.4
...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",Total amount of approved funding to promote th...,16.4
165,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,Proportion of individuals using the Internet.,16.5
166,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",Proportion of sustainable development indicato...,17.1
167,17,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Dollar value of all resources made available t...,17.2


In [12]:
sdg_new = sdg.groupby(['Goal No.', 'Goal', 'Targets',  'Target_id'])['Tentative Indicators'].apply(lambda x: ', '.join(x)).reset_index()
sdg_new

Unnamed: 0,Goal No.,Goal,Targets,Target_id,Tentative Indicators
0,1,End poverty in all its forms everywhere,"By 2030, build the resilience of the poor and ...",1.5,"Number of deaths, missing people, injured, rel..."
1,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",1.4,Proportion of the population living in househo...
2,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",1.1,Proportion of the population below the interna...
3,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",1.2,Proportion of the population living below the ...
4,1,End poverty in all its forms everywhere,Create sound policy frameworks at the state le...,1.6,Number of state action plans related to multil...
...,...,...,...,...,...
127,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,16.5,Proportion of individuals using the Internet.
128,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",16.4,Total amount of approved funding to promote th...
129,16,Strengthen the means of implementation and rev...,"Strengthen domestic resource mobilization, inc...",16.1,Total government revenue (by source) as a perc...
130,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",17.1,Proportion of sustainable development indicato...


In [13]:
sdg_new['concat_value'] = sdg_new['Targets'] + ' ' + sdg_new['Tentative Indicators']
sdg_new.drop(['Tentative Indicators'], axis=1,inplace=True)

sdg_new.rename(columns = {'concat_value':'Tentative Indicators'}, inplace = True) 
sdg_new

Unnamed: 0,Goal No.,Goal,Targets,Target_id,Tentative Indicators
0,1,End poverty in all its forms everywhere,"By 2030, build the resilience of the poor and ...",1.5,"By 2030, build the resilience of the poor and ..."
1,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",1.4,"By 2030, ensure that all men and women, in par..."
2,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",1.1,"By 2030, eradicate extreme poverty for all peo..."
3,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",1.2,"By 2030, reduce at least by half the proportio..."
4,1,End poverty in all its forms everywhere,Create sound policy frameworks at the state le...,1.6,Create sound policy frameworks at the state le...
...,...,...,...,...,...
127,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,16.5,Fully operationalize the technology bank and s...
128,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",16.4,"Promote the development, transfer, disseminati..."
129,16,Strengthen the means of implementation and rev...,"Strengthen domestic resource mobilization, inc...",16.1,"Strengthen domestic resource mobilization, inc..."
130,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",17.1,"By 2020, enhance capacity-building support to ..."


In [14]:
texto_data=sdg_new['Tentative Indicators'].tolist()
process_text=cleaning(texto_data)

cleaned_sent =pd.DataFrame(process_text)
sdg_new['new_targets']=cleaned_sent

In [15]:
type(sdg_new['new_targets'].loc[0])

str

In [16]:
sdg_new

Unnamed: 0,Goal No.,Goal,Targets,Target_id,Tentative Indicators,new_targets
0,1,End poverty in all its forms everywhere,"By 2030, build the resilience of the poor and ...",1.5,"By 2030, build the resilience of the poor and ...",build resilience poor vulnerable situation red...
1,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",1.4,"By 2030, ensure that all men and women, in par...",ensure men woman particular poor vulnerable eq...
2,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",1.1,"By 2030, eradicate extreme poverty for all peo...",eradicate extreme poverty people everywhere cu...
3,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",1.2,"By 2030, reduce at least by half the proportio...",reduce least half proportion men woman child a...
4,1,End poverty in all its forms everywhere,Create sound policy frameworks at the state le...,1.6,Create sound policy frameworks at the state le...,create sound policy framework state level base...
...,...,...,...,...,...,...
127,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,16.5,Fully operationalize the technology bank and s...,fully operationalize technology bank science t...
128,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",16.4,"Promote the development, transfer, disseminati...",promote development transfer dissemination dif...
129,16,Strengthen the means of implementation and rev...,"Strengthen domestic resource mobilization, inc...",16.1,"Strengthen domestic resource mobilization, inc...",strengthen domestic resource mobilization incl...
130,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",17.1,"By 2020, enhance capacity-building support to ...",enhance capacity building support increase sig...


In [17]:
data = pd.read_csv("Attributes_2016_17.csv", sep=";")

In [18]:
data.columns

Index(['Attr_id', 'Chapter_id', 'Chapter_name', 'Table_id', 'Table_name',
       'Description'],
      dtype='object')

In [19]:
data.drop(['Attr_id', 'Table_id'],axis=1,inplace=True)

In [20]:
data

Unnamed: 0,Chapter_id,Chapter_name,Table_name,Description
0,1,General Information,Nada Offices Village Accountant Circles Hoblie...,Nada Offices
1,1,General Information,Nada Offices Village Accountant Circles Hoblie...,Va Circles
2,1,General Information,Nada Offices Village Accountant Circles Hoblie...,Hoblies
3,1,General Information,Nada Offices Village Accountant Circles Hoblie...,Grama Panchayaths
4,1,General Information,Nada Offices Village Accountant Circles Hoblie...,No.of Taluks
...,...,...,...,...
1162,18,Additional information,Agricultural Land Holdings And Area per 2010-1...,"Total,Number,Total Agrl. Land Holder Total"
1163,18,Additional information,Agricultural Land Holdings And Area per 2010-1...,"Male,Area,Total Agrl. Land Holder Total"
1164,18,Additional information,Agricultural Land Holdings And Area per 2010-1...,"Female,Area,Total Agrl. Land Holder Total"
1165,18,Additional information,Agricultural Land Holdings And Area per 2010-1...,"Institutions,Area,Total Agrl. Land Holder Total"


In [21]:
data_new = data.groupby(['Chapter_id', 'Chapter_name', 'Table_name'])['Description'].apply(lambda x: ', '.join(x)).reset_index()
data_new

Unnamed: 0,Chapter_id,Chapter_name,Table_name,Description
0,1,General Information,Bpl Priority Card Holders on 31-3-2017 In Nos,"Urban,Akshaya with cylinder,Bpl Card Holders, ..."
1,1,General Information,Cinema Theatres Police Station Prisons and Pri...,"Permanent,Cinema Theatres, Temporary,Cinema Th..."
2,1,General Information,District Income Of Karnataka,Gross District Domestic Product Gddp Rs. in La...
3,1,General Information,Nada Offices Village Accountant Circles Hoblie...,"Nada Offices, Va Circles, Hoblies, Grama Panch..."
4,1,General Information,No.of A.p.l. Non-Priority Card Holders on 31-3...,"Urban,With Cylinder,Apl Card Holders, Rural,Wi..."
...,...,...,...,...
128,17,Miscellaneous,Registration of Birth and Death,"Registration Units Rural Urban, Birth,Register..."
129,18,Additional information,Agricultural Land Holdings And Area 2010-11,"Male,Number,Semi Medium Agril. Land Holder 2-4..."
130,18,Additional information,Agricultural Land Holdings And Area per 2010-1...,"Male,Number,Total Agrl. Land Holder Others, Fe..."
131,18,Additional information,Agricultural Land Holdings And Area per 2010-1...,"Male,Number,Marginal Agril. Land Holder Below ..."


In [22]:
data_new['concat_value'] = data_new['Table_name'] + ' ' + data_new['Description']

# Create a new DataFrame with selected columns
new_df = data_new[['Chapter_id', 'Chapter_name', 'Description']]
new_df

Unnamed: 0,Chapter_id,Chapter_name,Description
0,1,General Information,"Urban,Akshaya with cylinder,Bpl Card Holders, ..."
1,1,General Information,"Permanent,Cinema Theatres, Temporary,Cinema Th..."
2,1,General Information,Gross District Domestic Product Gddp Rs. in La...
3,1,General Information,"Nada Offices, Va Circles, Hoblies, Grama Panch..."
4,1,General Information,"Urban,With Cylinder,Apl Card Holders, Rural,Wi..."
...,...,...,...
128,17,Miscellaneous,"Registration Units Rural Urban, Birth,Register..."
129,18,Additional information,"Male,Number,Semi Medium Agril. Land Holder 2-4..."
130,18,Additional information,"Male,Number,Total Agrl. Land Holder Others, Fe..."
131,18,Additional information,"Male,Number,Marginal Agril. Land Holder Below ..."


In [23]:
new_df = new_df.groupby(['Chapter_id', 'Chapter_name',])['Description'].apply(lambda x: ', '.join(x)).reset_index()
new_df

Unnamed: 0,Chapter_id,Chapter_name,Description
0,1,General Information,"Urban,Akshaya with cylinder,Bpl Card Holders, ..."
1,2,Area and Population,"Total,Child Population 0-6 Years 2011, Rural,C..."
2,3,Rainfall,"2006,Actual Annual Rainfallfrom 2006 to 2016 m..."
3,4,"Agriculture, Horticulture and Sericulture","Paddy,Area under Cereals Hect.,Area under prin..."
4,5,Animal Husbandry,"Artificial Insemination Done Numbers, No.of an..."
5,6,Industries,"Factories,Readymade Garments,Factories, Male,E..."
6,7,Banks,"offices,Public Sector banks, Deposits,Public S..."
7,8,Co-operation and Agricultural Marketing,"Society,Agricultural,Credit Co-operative Socie..."
8,9,TRANSPORT AND COMMUNICATION,"Multiaxled Ariculated Vehicles,Goods vehicles,..."
9,10,Education,"Male,Rural,Literacy rate, Female,Rural,Literac..."


In [24]:
texto_data1=new_df['Description'].tolist()
process_text1=cleaning(texto_data1)

cleaned_sent1 =pd.DataFrame(process_text1)
new_df['new_description']=cleaned_sent1

In [25]:
new_df

Unnamed: 0,Chapter_id,Chapter_name,Description,new_description
0,1,General Information,"Urban,Akshaya with cylinder,Bpl Card Holders, ...",urban akshaya cylinder bpl card holder rural a...
1,2,Area and Population,"Total,Child Population 0-6 Years 2011, Rural,C...",total child population year rural child popula...
2,3,Rainfall,"2006,Actual Annual Rainfallfrom 2006 to 2016 m...",actual annual rainfallfrom mm rainfall mm actu...
3,4,"Agriculture, Horticulture and Sericulture","Paddy,Area under Cereals Hect.,Area under prin...",paddy area cereal hect area principal crop hec...
4,5,Animal Husbandry,"Artificial Insemination Done Numbers, No.of an...",artificial insemination done number animal tre...
5,6,Industries,"Factories,Readymade Garments,Factories, Male,E...",factory readymade garment factory male employe...
6,7,Banks,"offices,Public Sector banks, Deposits,Public S...",office public sector bank deposit public secto...
7,8,Co-operation and Agricultural Marketing,"Society,Agricultural,Credit Co-operative Socie...",society agricultural credit co operative socie...
8,9,TRANSPORT AND COMMUNICATION,"Multiaxled Ariculated Vehicles,Goods vehicles,...",multiaxled ariculated vehicle good vehicle tru...
9,10,Education,"Male,Rural,Literacy rate, Female,Rural,Literac...",male rural literacy rate female rural literacy...


In [26]:
# Create a TfidfVectorizer instance
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data from df_1 and df_2
tfidf_matrix_1 = tfidf_vectorizer.fit_transform(sdg_new['new_targets'])
tfidf_matrix_2 = tfidf_vectorizer.transform(new_df['new_description'])
print(tfidf_matrix_1.shape)
print(tfidf_matrix_2.shape)
# Compute cosine similarity between vectors of df_1 and df_2
similarity_matrix_sdg = cosine_similarity(tfidf_matrix_2, tfidf_matrix_1)

vec_val = new_df['Chapter_id'].tolist()
sdg_val = sdg_new['Target_id'].tolist()

# Convert cosine similarities to DataFrame
similarity_df_sdg = pd.DataFrame(similarity_matrix_sdg, columns=sdg_val, index=vec_val)


(132, 1128)
(18, 1128)


In [27]:
similarity_df_sdg

Unnamed: 0,1.5,1.4,1.1,1.2,1.6,1.3,2.8,2.5,2.3,2.2,...,15.3,15.1,15.5,16.2,16.3,16.5,16.4,16.1,17.1,17.2
1,0.007047,0.008288,0.171263,0.0,0.0,0.0,0.108382,0.018695,0.05049,0.0,...,0.042424,0.005662,0.0,0.0,0.0,0.0,0.043606,0.086743,0.0,0.061911
2,0.0,0.03518,0.233627,0.099438,0.0,0.077764,0.0,0.0,0.043617,0.133145,...,0.033811,0.092193,0.053105,0.0,0.0,0.0,0.046617,0.064046,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0053,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.005664,0.007782,0.0,0.0
4,0.000729,0.042797,0.002601,0.0,0.0,0.000311,0.016828,0.237117,0.066198,0.000208,...,0.0,0.097088,0.0,0.0,0.000384,0.0,0.019424,0.054881,0.002023,0.011585
5,0.002956,0.0,0.009781,0.0,0.002,0.0,0.011211,0.044208,0.043642,0.0,...,0.0,0.00665,0.0,0.003443,0.0,0.008331,0.021478,0.039289,0.011595,0.015423
6,0.0,0.017918,0.011183,0.0,0.0,0.0,0.038823,0.0,0.09239,0.00937,...,0.0,0.0,0.0,0.0,0.0,0.0,0.046934,0.064481,0.009126,0.0
7,0.0,0.0,0.064639,0.0,0.0,0.0,0.0,0.125022,0.003242,0.0,...,0.0,0.0,0.036454,0.0,0.0,0.150753,0.003465,0.00476,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.010866,0.034224,0.054018,0.0,...,0.0,0.0,0.0,0.0,0.0,0.062765,0.010509,0.014438,0.0,0.0
9,0.047083,0.025805,0.032569,0.007572,0.041958,0.0,0.006469,0.007026,0.019388,0.0,...,0.007597,0.0,0.0,0.054853,0.016063,0.049894,0.017457,0.023983,0.008429,0.0
10,0.005827,0.002462,0.034384,0.026839,0.003943,0.034936,0.0,0.0,0.030724,0.123292,...,0.008509,0.018044,0.014419,0.006788,0.0,0.0,0.032838,0.084797,0.035948,0.0


In [28]:
def find_top_n_similarities(similarity_matrix, n, sdg, chapter_df):
    top_n_similarities = []
    top_n_similarity_taget_id = []
    top_n_similarity_goal_id = []
    
    for index, row in similarity_matrix.iterrows():
        top_n_indices = sorted(range(len(row)), key=lambda i: row.values[i], reverse=True)[:n]
        top_n_values = [row.values[i] for i in top_n_indices]
        top_n_target = [similarity_matrix.columns[i] for i in top_n_indices]
        # Fetch Goal_id from sdg DataFrame using Target_id values
        top_n_goals = [sdg['Goal No.'].loc[i] for i in top_n_indices]   
        
        top_n_similarities.append(top_n_values)
        top_n_similarity_taget_id.append(top_n_target)
        top_n_similarity_goal_id.append(top_n_goals)
    
    chapter_df['top_n_similarity_goal_id'] = top_n_similarity_goal_id
    chapter_df['top_n_similarity_target_id'] = top_n_similarity_taget_id
    chapter_df['top_n_similarities'] = top_n_similarities
    
    return chapter_df



In [29]:
# Assuming similarity_matrix is your 18x132 numpy array
n = 3  # Number of top similarities to find
result_lookup = find_top_n_similarities(similarity_df_sdg, n, sdg_new, new_df)

In [30]:
result_lookup

Unnamed: 0,Chapter_id,Chapter_name,Description,new_description,top_n_similarity_goal_id,top_n_similarity_target_id,top_n_similarities
0,1,General Information,"Urban,Akshaya with cylinder,Bpl Card Holders, ...",urban akshaya cylinder bpl card holder rural a...,"[11, 1, 4]","[11.8, 1.1, 4.5]","[0.34235279020445014, 0.17126262859449792, 0.1..."
1,2,Area and Population,"Total,Child Population 0-6 Years 2011, Rural,C...",total child population year rural child popula...,"[11, 1, 4]","[11.8, 1.1, 4.5]","[0.3124410438802492, 0.23362721617713858, 0.18..."
2,3,Rainfall,"2006,Actual Annual Rainfallfrom 2006 to 2016 m...",actual annual rainfallfrom mm rainfall mm actu...,"[8, 8, 11]","[8.2, 8.1, 11.6]","[0.2189005894615654, 0.17804427773071346, 0.14..."
3,4,"Agriculture, Horticulture and Sericulture","Paddy,Area under Cereals Hect.,Area under prin...",paddy area cereal hect area principal crop hec...,"[14, 14, 2]","[14.1, 14.3, 2.5]","[0.31032237266397633, 0.2423746877417431, 0.23..."
4,5,Animal Husbandry,"Artificial Insemination Done Numbers, No.of an...",artificial insemination done number animal tre...,"[2, 8, 12]","[2.6, 8.1, 12.8]","[0.21490884308439262, 0.09165504957739073, 0.0..."
5,6,Industries,"Factories,Readymade Garments,Factories, Male,E...",factory readymade garment factory male employe...,"[8, 8, 4]","[8.5, 8.8, 4.5]","[0.2001482644968775, 0.16061749004769638, 0.10..."
6,7,Banks,"offices,Public Sector banks, Deposits,Public S...",office public sector bank deposit public secto...,"[8, 2, 11]","[8.1, 2.6, 11.8]","[0.2291163440431373, 0.19736557128748836, 0.19..."
7,8,Co-operation and Agricultural Marketing,"Society,Agricultural,Credit Co-operative Socie...",society agricultural credit co operative socie...,"[9, 11, 9]","[9.3, 11.3, 9.4]","[0.18756017389925123, 0.13624164687696164, 0.0..."
8,9,TRANSPORT AND COMMUNICATION,"Multiaxled Ariculated Vehicles,Goods vehicles,...",multiaxled ariculated vehicle good vehicle tru...,"[3, 9, 11]","[3.5, 9.1, 11.2]","[0.34549526725676694, 0.2796769699188529, 0.11..."
9,10,Education,"Male,Rural,Literacy rate, Female,Rural,Literac...",male rural literacy rate female rural literacy...,"[4, 4, 5]","[4.2, 4.8, 5.2]","[0.23501129462933765, 0.15659288063111207, 0.1..."


In [31]:
result_lookup.to_csv('Lookup_2016-17_TFIDF.csv', sep=";", index=False)

Unwrap lists

In [31]:
# Create a list of dictionaries
lookup_rows = []
for chapter_id, data in result_lookup.iterrows():
    for i in range(len(data['top_n_similarity_target_id'])):
        row = {
            'Chapter_id': data['Chapter_id'],
            'Chapter_name': data['Chapter_name'],
            'Description': data['Description'],
            'new_description': data['new_description'],
            'top_n_similarity_goal_id': data['top_n_similarity_goal_id'][i],
            'top_n_similarity_target_id': data['top_n_similarity_target_id'][i],
            'top_n_similarities': data['top_n_similarities'][i]
        }
        lookup_rows.append(row)


In [32]:
# Convert list of dictionaries to DataFrame
final_lookup = pd.DataFrame(lookup_rows)
final_lookup

Unnamed: 0,Chapter_id,Chapter_name,Description,new_description,top_n_similarity_goal_id,top_n_similarity_target_id,top_n_similarities
0,1,General Information,"Urban,Akshaya with cylinder,Bpl Card Holders, ...",urban akshaya cylinder bpl card holder rural a...,11,11.8,0.342353
1,1,General Information,"Urban,Akshaya with cylinder,Bpl Card Holders, ...",urban akshaya cylinder bpl card holder rural a...,1,1.1,0.171263
2,1,General Information,"Urban,Akshaya with cylinder,Bpl Card Holders, ...",urban akshaya cylinder bpl card holder rural a...,4,4.5,0.140161
3,2,Area and Population,"Total,Child Population 0-6 Years 2011, Rural,C...",total child population year rural child popula...,11,11.8,0.312441
4,2,Area and Population,"Total,Child Population 0-6 Years 2011, Rural,C...",total child population year rural child popula...,1,1.1,0.233627
5,2,Area and Population,"Total,Child Population 0-6 Years 2011, Rural,C...",total child population year rural child popula...,4,4.5,0.187173
6,3,Rainfall,"2006,Actual Annual Rainfallfrom 2006 to 2016 m...",actual annual rainfallfrom mm rainfall mm actu...,8,8.2,0.218901
7,3,Rainfall,"2006,Actual Annual Rainfallfrom 2006 to 2016 m...",actual annual rainfallfrom mm rainfall mm actu...,8,8.1,0.178044
8,3,Rainfall,"2006,Actual Annual Rainfallfrom 2006 to 2016 m...",actual annual rainfallfrom mm rainfall mm actu...,11,11.6,0.142337
9,4,"Agriculture, Horticulture and Sericulture","Paddy,Area under Cereals Hect.,Area under prin...",paddy area cereal hect area principal crop hec...,14,14.1,0.310322


In [33]:
final_lookup.shape

(54, 7)

In [81]:
final_lookup.to_csv('Lookup_2016-17_TFIDF_expanded.csv', sep=";", index=False)

In [34]:
similarity_df_sdg_trans = similarity_df_sdg.T

In [35]:
def find_similarity_sdg(similarity_matrix, threshold, chapter_df, sdg):
    top_n_similarities = []
    top_n_similarity_chapter_id = []
    top_n_similarity_chapter_name = []
    n = threshold
    for index, row in similarity_matrix.iterrows():
        top_n_indices = sorted(range(len(row)), key=lambda i: row.values[i], reverse=True)[:n]
        top_n_values = [row.values[i] for i in top_n_indices]
        top_n_chapter = [similarity_matrix.columns[i] for i in top_n_indices]
        top_n_chapter_name = [new_df['Chapter_name'].loc[i] for i in top_n_indices]   
        
        top_n_similarities.append(top_n_values)
        top_n_similarity_chapter_id.append(top_n_chapter)
        top_n_similarity_chapter_name.append(top_n_chapter_name)
    
    sdg['top_n_similarity_chapter_name'] = top_n_similarity_chapter_name
    sdg['top_n_similarity_chapter_id'] = top_n_similarity_chapter_id
    sdg['top_n_similarities'] = top_n_similarities

    return sdg

In [36]:
# Example usage:
threshold_value = 3  # Define the threshold value

# Call the function to find chapter_ids with similarity greater than the threshold
result_sdg = find_similarity_sdg(similarity_df_sdg_trans, threshold_value, new_df, sdg_new)


In [37]:
result_sdg.head(10)

Unnamed: 0,Goal No.,Goal,Targets,Target_id,Tentative Indicators,new_targets,top_n_similarity_chapter_name,top_n_similarity_chapter_id,top_n_similarities
0,1,End poverty in all its forms everywhere,"By 2030, build the resilience of the poor and ...",1.5,"By 2030, build the resilience of the poor and ...",build resilience poor vulnerable situation red...,"[Miscellaneous, TRANSPORT AND COMMUNICATION, H...","[17, 9, 11]","[0.1932098395605113, 0.04708267803029504, 0.03..."
1,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",1.4,"By 2030, ensure that all men and women, in par...",ensure men woman particular poor vulnerable eq...,"[Additional information, Health & Family welfa...","[18, 11, 14]","[0.10662218571018729, 0.10449350942751875, 0.0..."
2,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",1.1,"By 2030, eradicate extreme poverty for all peo...",eradicate extreme poverty people everywhere cu...,"[Area and Population, General Information, Rur...","[2, 1, 14]","[0.23362721617713858, 0.17126262859449792, 0.1..."
3,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",1.2,"By 2030, reduce at least by half the proportio...",reduce least half proportion men woman child a...,"[Area and Population, Women & Child Developmen...","[2, 13, 14]","[0.09943845049189987, 0.0920107542329232, 0.07..."
4,1,End poverty in all its forms everywhere,Create sound policy frameworks at the state le...,1.6,Create sound policy frameworks at the state le...,create sound policy framework state level base...,"[TRANSPORT AND COMMUNICATION, Additional infor...","[9, 18, 14]","[0.04195787030295731, 0.019643096222707056, 0...."
5,1,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,1.3,Implement nationally appropriate social protec...,implement nationally appropriate social protec...,"[Area and Population, Women & Child Developmen...","[2, 13, 10]","[0.07776411653324466, 0.07480805662283764, 0.0..."
6,2,"End hunger, achieve food security and improved...",Adopt measures to ensure the proper functionin...,2.8,Adopt measures to ensure the proper functionin...,adopt measure ensure proper functioning food c...,"[General Information, Women & Child Developmen...","[1, 13, 6]","[0.10838182910863638, 0.08903598947286037, 0.0..."
7,2,"End hunger, achieve food security and improved...","By 2020, maintain the genetic diversity of see...",2.5,"By 2020, maintain the genetic diversity of see...",maintain genetic diversity seed cultivated pla...,"[Agriculture, Horticulture and Sericulture, Ba...","[4, 7, 5]","[0.23711734322186412, 0.12502202094793366, 0.0..."
8,2,"End hunger, achieve food security and improved...","By 2030, double the agricultural productivity ...",2.3,"By 2030, double the agricultural productivity ...",double agricultural productivity income small ...,"[Energy, Industries, Agriculture, Horticulture...","[16, 6, 4]","[0.11659539582598054, 0.09238966324735186, 0.0..."
9,2,"End hunger, achieve food security and improved...","By 2030, end all forms of malnutrition, includ...",2.2,"By 2030, end all forms of malnutrition, includ...",end form malnutrition including achieving inte...,"[Women & Child Development, Area and Populatio...","[13, 2, 10]","[0.1482221231943525, 0.13314508366733135, 0.12..."


In [38]:
# Create a dictionary to store chapter details
chapter_details = {}

# Iterate over the rows of the SDG DataFrame
for index, row in result_sdg.iterrows():
    chapter_id = row['top_n_similarity_chapter_id']
    values = row['top_n_similarities']
    goal_no = row['Goal No.']
    target_id = row['Target_id']
    #print(chapter_id, values, goal_no, target_id)
    for ids in range(len(chapter_id)):
        #print(ids)
        
        #print(chapter_id[ids])
        #if index<9:
            #print(chapter_details)
        if chapter_id[ids] not in chapter_details:
            chapter_details[chapter_id[ids]] = { 'Chapter_id' : chapter_id[ids], 'top_n_similarity_goal_id' : [goal_no], 'top_n_similarity_target_id': [target_id], 'top_n_similarities' : [values[ids]] }
            #print(chapter_details[chapter_id[ids]])
        else:
            #print(chapter_details[chapter_id[ids]])
            g_id = chapter_details[chapter_id[ids]].get('top_n_similarity_goal_id')
            t_id = chapter_details[chapter_id[ids]].get('top_n_similarity_target_id')
            val = chapter_details[chapter_id[ids]].get('top_n_similarities')
            g_id.append(goal_no)
            #print(g_id)
            t_id.append(target_id)
            val.append(values[ids])
            #new_g_id = list(set(g_id))
            #print(new_g_id)
            #new_t_id = list(set(t_id))
            chapter_details[chapter_id[ids]].update({'top_n_similarity_goal_id' : g_id})
            chapter_details[chapter_id[ids]].update({'top_n_similarity_target_id' : t_id})
            chapter_details[chapter_id[ids]].update({'top_n_similarities' : val})
            

In [39]:
chapter_details

{17: {'Chapter_id': 17,
  'top_n_similarity_goal_id': [1,
   3,
   3,
   3,
   3,
   3,
   3,
   3,
   7,
   9,
   11,
   13,
   15,
   15,
   17],
  'top_n_similarity_target_id': [1.5,
   3.7,
   3.5,
   3.2,
   3.3,
   3.6,
   3.1,
   3.4,
   7.4,
   9.6,
   11.5,
   13.1,
   15.8,
   15.1,
   17.1],
  'top_n_similarities': [0.1932098395605113,
   0.02995079214019905,
   0.07544139734771449,
   0.19451345548705593,
   0.11008800154609631,
   0.04343671862711139,
   0.15810290484777756,
   0.0778593401366756,
   0.04075547766256775,
   0.008165635064218054,
   0.053515267527032825,
   0.040216199902185355,
   0.3185654559772787,
   0.06772856802928744,
   0.013834127496590064]},
 9: {'Chapter_id': 9,
  'top_n_similarity_goal_id': [1,
   1,
   3,
   4,
   4,
   5,
   5,
   6,
   6,
   9,
   9,
   11,
   11,
   11,
   11,
   11,
   12,
   12,
   12,
   12,
   12,
   13,
   13,
   13,
   14,
   14,
   16,
   16],
  'top_n_similarity_target_id': [1.5,
   1.6,
   3.5,
   4.9,
   4.4,
   5.

In [40]:
# Convert dictionary to list of dictionaries
rows = []
for chapter_id, values in chapter_details.items():
    row = {'Chapter_id': chapter_id,
           'top_n_similarity_goal_id': values['top_n_similarity_goal_id'],
           'top_n_similarity_target_id': values['top_n_similarity_target_id'],
           'top_n_similarities': values['top_n_similarities']}
    rows.append(row)

# Create DataFrame
goals = pd.DataFrame(rows)


In [41]:
goals

Unnamed: 0,Chapter_id,top_n_similarity_goal_id,top_n_similarity_target_id,top_n_similarities
0,17,"[1, 3, 3, 3, 3, 3, 3, 3, 7, 9, 11, 13, 15, 15,...","[1.5, 3.7, 3.5, 3.2, 3.3, 3.6, 3.1, 3.4, 7.4, ...","[0.1932098395605113, 0.02995079214019905, 0.07..."
1,9,"[1, 1, 3, 4, 4, 5, 5, 6, 6, 9, 9, 11, 11, 11, ...","[1.5, 1.6, 3.5, 4.9, 4.4, 5.8, 5.6, 6.3, 6.4, ...","[0.04708267803029504, 0.04195787030295731, 0.3..."
2,11,"[1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 4, 5, 5, 6, 7, ...","[1.5, 1.4, 3.7, 3.5, 3.2, 3.6, 3.1, 3.9, 3.11,...","[0.03234928373956714, 0.10449350942751875, 0.0..."
3,18,"[1, 1, 2, 5, 8, 8, 9, 9, 11, 11, 12, 12, 12, 1...","[1.4, 1.6, 2.4, 5.7, 8.9, 8.3, 9.3, 9.7, 11.3,...","[0.10662218571018729, 0.019643096222707056, 0...."
4,14,"[1, 1, 1, 1, 3, 3, 3, 4, 4, 5, 5, 6, 6, 6, 6, ...","[1.4, 1.1, 1.2, 1.6, 3.8, 3.12, 3.11, 4.3, 4.4...","[0.05603106081183347, 0.12796637511542358, 0.0..."
5,2,"[1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 6, 6, ...","[1.1, 1.2, 1.3, 2.2, 2.1, 2.7, 3.7, 3.3, 3.1, ...","[0.23362721617713858, 0.09943845049189987, 0.0..."
6,1,"[1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 8, 8, 9, 9, 9, ...","[1.1, 2.8, 2.7, 3.12, 4.5, 5.4, 6.3, 6.8, 7.3,...","[0.17126262859449792, 0.10838182910863638, 0.0..."
7,13,"[1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, ...","[1.2, 1.3, 2.8, 2.2, 3.6, 3.9, 4.9, 4.3, 4.2, ...","[0.0920107542329232, 0.07480805662283764, 0.08..."
8,10,"[1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, ...","[1.3, 2.2, 2.1, 3.2, 3.4, 3.1, 4.8, 4.1, 4.2, ...","[0.03493581321400276, 0.12329198218525748, 0.0..."
9,6,"[2, 2, 3, 3, 4, 5, 8, 8, 8, 8, 9, 9, 12, 14, 16]","[2.8, 2.3, 3.8, 3.11, 4.5, 5.3, 8.6, 8.5, 8.3,...","[0.03882291582834913, 0.09238966324735186, 0.0..."


In [42]:
# Convert dictionary to list of dictionaries
all_rows = []
for chapter_id, values in chapter_details.items():
    for i in range(len(values['top_n_similarity_goal_id'])):
        row = {'Chapter_id': values['Chapter_id'],
               'top_n_similarity_goal_id': values['top_n_similarity_goal_id'][i],
               'top_n_similarity_target_id': values['top_n_similarity_target_id'][i],
               'top_n_similarities': values['top_n_similarities'][i]}
        all_rows.append(row)


In [43]:
# Create DataFrame
all_goals = pd.DataFrame(all_rows)

In [44]:
all_goals

Unnamed: 0,Chapter_id,top_n_similarity_goal_id,top_n_similarity_target_id,top_n_similarities
0,17,1,1.5,0.193210
1,17,3,3.7,0.029951
2,17,3,3.5,0.075441
3,17,3,3.2,0.194513
4,17,3,3.3,0.110088
...,...,...,...,...
391,12,16,16.1,0.109726
392,3,8,8.2,0.218901
393,3,8,8.1,0.178044
394,3,11,11.6,0.142337


In [45]:
#creating dictionary for movie id
chapter_names_dict = dict(zip(new_df['Chapter_id'], new_df['Chapter_name']))

In [46]:
all_goals['Chapter_name'] = all_goals['Chapter_id'].map(chapter_names_dict.get)

In [47]:
goal_name = dict(zip(sdg_new['Goal No.'], sdg_new['Goal']))

In [48]:
target_name = dict(zip(sdg_new['Target_id'], sdg_new['Targets']))

In [51]:
all_goals['Goal'] = all_goals['top_n_similarity_goal_id'].map(goal_name.get)

In [52]:
all_goals['Targets'] = all_goals['top_n_similarity_target_id'].map(target_name.get)

In [53]:
all_goals.columns

Index(['Chapter_id', 'top_n_similarity_goal_id', 'top_n_similarity_target_id',
       'top_n_similarities', 'Chapter_name', 'Goal', 'Targets'],
      dtype='object')

In [54]:
all_goals = all_goals.loc[:,['Chapter_id', 'Chapter_name', 'Goal', 'Targets', 'top_n_similarity_goal_id', 'top_n_similarity_target_id','top_n_similarities']]

In [55]:
all_goals

Unnamed: 0,Chapter_id,Chapter_name,Goal,Targets,top_n_similarity_goal_id,top_n_similarity_target_id,top_n_similarities
0,17,Miscellaneous,End poverty in all its forms everywhere,"By 2030, build the resilience of the poor and ...",1,1.5,0.193210
1,17,Miscellaneous,Ensure healthy lives and promote well-being fo...,"Achieve universal health coverage, including f...",3,3.7,0.029951
2,17,Miscellaneous,Ensure healthy lives and promote well-being fo...,"By 2020, halve the number of deaths and injuri...",3,3.5,0.075441
3,17,Miscellaneous,Ensure healthy lives and promote well-being fo...,"By 2030, end preventable deaths of newborns an...",3,3.2,0.194513
4,17,Miscellaneous,Ensure healthy lives and promote well-being fo...,"By 2030, end the epidemics of AIDS, tuberculos...",3,3.3,0.110088
...,...,...,...,...,...,...,...
391,12,Social Welfare,Strengthen the means of implementation and rev...,"Strengthen domestic resource mobilization, inc...",16,16.1,0.109726
392,3,Rainfall,"Promote sustained, inclusive and sustainable e...",Achieve higher levels of economic productivity...,8,8.2,0.218901
393,3,Rainfall,"Promote sustained, inclusive and sustainable e...",Sustain per capita economic growth in accordan...,8,8.1,0.178044
394,3,Rainfall,"Make cities and human settlements inclusive, s...","By 2030, reduce the adverse per capita environ...",11,11.6,0.142337


In [56]:
all_goals.to_csv('Lookup_2016-17_all_goals.csv', sep=";", index=False)

In [62]:
#let's create the vectorizer and fit the corpus and transform them accordingly
v = TfidfVectorizer()
v.fit(corpus)
transform_output = v.transform(corpus)

In [63]:
#let's print the vocabulary

print(v.vocabulary_)



In [64]:
#let's print the idf of each word:

all_feature_names = v.get_feature_names_out()

for word in all_feature_names:
    
    #let's get the index in the vocabulary
    indx = v.vocabulary_.get(word)
    
    #get the score
    idf_score = v.idf_[indx]
    
    print(f"{word} : {idf_score}")

abuse : 4.799227511282801
accelerated : 5.204692619390966
accepted : 5.204692619390966
access : 2.463852595465765
accessible : 4.799227511282801
accident : 5.204692619390966
accordance : 3.2587824703356527
according : 4.799227511282801
account : 4.511545438831021
accountable : 5.204692619390966
achieve : 3.332890442489375
achieving : 4.511545438831021
acquire : 5.204692619390966
across : 4.799227511282801
action : 3.700615222614692
activity : 4.799227511282801
adaptation : 4.288401887516811
adapted : 5.204692619390966
adaptive : 5.204692619390966
added : 4.106080330722856
addition : 4.799227511282801
address : 4.511545438831021
adequate : 4.288401887516811
adolescent : 4.799227511282801
adopt : 4.106080330722856
adopted : 5.204692619390966
adopting : 5.204692619390966
adoption : 4.799227511282801
adult : 4.288401887516811
advance : 5.204692619390966
advanced : 5.204692619390966
adverse : 4.799227511282801
affect : 5.204692619390966
affected : 4.511545438831021
affirms : 5.2046926193909

revenue : 5.204692619390966
review : 5.204692619390966
right : 3.8183982582710754
risk : 4.106080330722856
river : 5.204692619390966
road : 4.511545438831021
round : 4.799227511282801
rti : 5.204692619390966
rule : 5.204692619390966
run : 5.204692619390966
rural : 4.106080330722856
safe : 3.499944527152541
safeguard : 4.799227511282801
safely : 4.511545438831021
safety : 5.204692619390966
sanitation : 4.288401887516811
satisfied : 4.799227511282801
scale : 4.511545438831021
scarcity : 5.204692619390966
scheme : 5.204692619390966
scholarship : 5.204692619390966
school : 4.799227511282801
science : 4.511545438831021
scientific : 4.511545438831021
season : 5.204692619390966
seat : 5.204692619390966
secondary : 4.288401887516811
sector : 3.951929650895598
secure : 4.511545438831021
seed : 5.204692619390966
seized : 5.204692619390966
selection : 5.204692619390966
sendai : 5.204692619390966
sensitive : 4.799227511282801
service : 2.7623455840217614
settlement : 4.511545438831021
severe : 5.2

In [65]:
#let's print the transformed output from tf-idf
print(transform_output.shape)

(133, 1128)


In [66]:
type(transform_output)

scipy.sparse._csr.csr_matrix

In [29]:
sdg_new_copy = sdg_new.copy()

In [30]:
y = sdg_new_copy[['Goal No.']]

In [32]:
from sklearn.model_selection import GridSearchCV #importing GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB


In [48]:
# Create transformers for text features
text_transformer = TfidfVectorizer()

# Create a column transformer to apply transformers to text and numerical columns only
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, 'new_targets')],
    remainder='drop'  # Drop any other columns not specified above
)

# Create a pipeline with the column transformer and Multinomial Naive Bayes
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', MultinomialNB())
])

# Define the parameter grid with different values for max_features
param_grid = {
    'preprocessor__text__max_features': [500, 1000, 5000],  
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(sdg_new_copy, y)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [49]:
print("Best Max Features:", grid_search.best_params_['preprocessor__text__max_features'])
print("Best Accuracy:", grid_search.best_score_)

Best Max Features: 500
Best Accuracy: 0.41396011396011395


In [51]:
# Create transformers for text features with specified max features
text_transformer1 = TfidfVectorizer(max_features=500)

# Create a column transformer to apply transformers to text and numerical columns only
preprocessor1 = ColumnTransformer(
    transformers=[
        ('text', text_transformer, 'new_targets')],
    remainder='drop' # Drop any other columns not specified above
)

# Create a pipeline with the column transformer and Logistic Regression
pipeline1 = Pipeline([
    ('preprocessor', preprocessor1),
    ('clf', LogisticRegression(solver='saga', max_iter=1000))
])

# Define the parameter grid with different values for C (regularization parameter)
param_grid1 = {
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100]  
}

# Perform grid search with cross-validation
grid_search1 = GridSearchCV(pipeline1, param_grid1, cv=5, scoring='accuracy')
grid_search1.fit(sdg_new_copy, y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


In [52]:
print("Best Parameters:", grid_search1.best_params_)        
print("Best Accuracy:", grid_search1.best_score_)           

Best Parameters: {'clf__C': 10}
Best Accuracy: 0.5037037037037038


In [30]:
sdg_new['vec'] = sdg_new['tokens'].apply(sent_vec)

In [31]:
sdg_new

Unnamed: 0,Goal No.,Goal,Targets,Target_id,Tentative Indicators,new_targets,tokens,vec
0,1,End poverty in all its forms everywhere,"By 2030, build the resilience of the poor and ...",1.5,"By 2030, build the resilience of the poor and ...","[2030, build, resilience, poor, vulnerable, si...","[situation, vulnerable, miss, extreme, evacuat...","[0.05813743954613095, 0.062311808268229164, -0..."
1,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",1.4,"By 2030, ensure that all men and women, in par...","[2030, ensure, man, woman, particular, poor, v...","[woman, economic, appropriate, microfinance, p...","[0.07795173891129033, 0.0019354051159274194, -..."
2,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",1.1,"By 2030, eradicate extreme poverty for all peo...","[2030, eradicate, extreme, poverty, people, cu...","[population, international, day, sex, eradicat...","[0.021107549252717392, 0.0068206787109375, -0...."
3,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",1.2,"By 2030, reduce at least by half the proportio...","[2030, reduce, half, proportion, man, woman, c...","[woman, population, sex, dimension, disaggrega...","[0.05279862253289474, 0.02122096011513158, 0.0..."
4,1,End poverty in all its forms everywhere,Create sound policy frameworks at the state le...,1.6,Create sound policy frameworks at the state le...,"[create, sound, policy, framework, state, leve...","[multilateral, sustainably, policy, eradicate,...","[-0.0144439697265625, 0.039288838704427086, 0...."
...,...,...,...,...,...,...,...,...
128,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,16.5,Fully operationalize the technology bank and s...,"[fully, operationalize, technology, bank, scie...","[enhance, innovation, enable, science, capacit...","[-0.01899157072368421, -0.030575400904605265, ..."
129,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",16.4,"Promote the development, transfer, disseminati...","[promote, development, transfer, dissemination...","[funding, promote, environmentally, disseminat...","[-0.04981231689453125, 0.037671407063802086, 0..."
130,16,Strengthen the means of implementation and rev...,"Strengthen domestic resource mobilization, inc...",16.1,"Strengthen domestic resource mobilization, inc...","[strengthen, domestic, resource, mobilization,...","[international, capacity, percentage, domestic...","[-0.04270765516493055, 0.016059027777777776, 0..."
131,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",7.1,"By 2020, enhance capacity-building support to ...","[2020, enhance, capacity, build, support, incr...","[enhance, disaggregation, reliable, target, re...","[-0.012115478515625, -0.04031553722563244, -0...."


In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Sample DataFrames with text columns
data_1 = {
    'id': [1, 2, 3],
    'new_target': ['This is the first document.', 'This document is the second document.', 'And this is the third one.']
}

data_2 = {
    'id': [4, 5, 6],
    'description': ['This is the fourth document.', 'This document is the fifth document.', 'And this is the sixth one.']
}

df_1 = pd.DataFrame(data_1)
df_2 = pd.DataFrame(data_2)

# Create a TfidfVectorizer instance
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data from df_1 and df_2
tfidf_matrix_1 = tfidf_vectorizer.fit_transform(df_1['new_target'])
tfidf_matrix_2 = tfidf_vectorizer.transform(df_2['description'])

# Compute cosine similarity between vectors of df_1 and df_2
cosine_similarities = cosine_similarity(tfidf_matrix_1, tfidf_matrix_2)

# Convert cosine similarities to DataFrame
cosine_similarities_df = pd.DataFrame(cosine_similarities, columns=df_2['id'], index=df_1['id'])

print(cosine_similarities_df)


id         4         5         6
id                              
1   0.786785  0.741891  0.370065
2   0.827774  0.877865  0.287134
3   0.408115  0.283804  0.867682


In [52]:
import pandas as pd

# Sample dictionary
data = {
    17: {'Chapter_id': 17, 'top_n_similarity_goal_id': [1, 3, 3], 'top_n_similarity_target_id': [1.5, 3.7, 3.5]},
    9: {'Chapter_id': 9, 'top_n_similarity_goal_id': [1, 1, 3], 'top_n_similarity_target_id': [1.5, 1.6, 3.5]}
}

# Convert dictionary to list of dictionaries
rows = []
for chapter_id, values in data.items():
    for i in range(len(values['top_n_similarity_goal_id'])):
        row = {'Chapter_id': values['Chapter_id'],
               'top_n_similarity_goal_id': values['top_n_similarity_goal_id'][i],
               'top_n_similarity_target_id': values['top_n_similarity_target_id'][i]}
        rows.append(row)

# Create DataFrame
df = pd.DataFrame(rows)
print(df)


   Chapter_id  top_n_similarity_goal_id  top_n_similarity_target_id
0          17                         1                         1.5
1          17                         3                         3.7
2          17                         3                         3.5
3           9                         1                         1.5
4           9                         1                         1.6
5           9                         3                         3.5
