In [15]:
import json
import numpy as np
import pandas as pd
import gensim
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

# Load SDG keywords and test dataset
with open('Data/sdg_keywords.json', 'r') as file:
    sdg_keywords = json.load(file)

df = pd.read_csv('Data/train_large.csv')

texts = df['Text'].tolist()
labels = df.iloc[:, 1:].values

In [2]:
# Load pre-trained Word2Vec model (from google)
model = KeyedVectors.load_word2vec_format('Data/Word2VecPretrained/GoogleNews-vectors-negative300.bin', binary=True)

In [16]:
#Text to Vector Function
def text_to_avg_vector(text, model):
    words = text.split()
    word_vectors = [model[word] for word in words if word in model]
    
    if not word_vectors:
        return np.zeros(model.vector_size)
    
    avg_vector = np.mean(word_vectors, axis=0)
    return avg_vector

In [17]:
# Convert each text in the dataset to its corresponding averaged word vector
X = np.array([text_to_avg_vector(text, model) for text in texts])

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, labels, test_size=0.3, random_state=42)


In [19]:

from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
classifier = MultiOutputClassifier(RandomForestClassifier()).fit(X_train, Y_train)


## Random Forest

In [118]:
classifier = MultiOutputClassifier(RandomForestClassifier(max_depth=3, criterion='entropy', n_estimators=500, class_weight='balanced_subsample', random_state=5))
classifier.fit(X_train, Y_train)


In [119]:
from sklearn.metrics import classification_report

Y_pred = classifier.predict(X_test)
print(classification_report(Y_test, Y_pred, target_names=sdg_keywords.keys()))

              precision    recall  f1-score   support

       SDG 1       0.50      0.17      0.25        12
      SDG 10       0.60      0.43      0.50        14
       SDG 3       0.68      0.60      0.64        65
      SDG 11       0.61      0.74      0.67        77
      SDG 12       0.67      0.45      0.54        22
      SDG 13       0.50      0.54      0.52        26
      SDG 14       0.65      0.45      0.53        29
      SDG 15       0.33      0.57      0.42        42
      SDG 16       0.46      0.57      0.51        42
      SDG 17       0.42      0.70      0.53        50
       SDG 2       0.46      0.67      0.55        45
       SDG 4       0.42      0.55      0.48        49
       SDG 5       0.52      0.61      0.56        23
       SDG 6       0.50      0.17      0.25        18
       SDG 7       0.46      0.55      0.50        22
       SDG 8       0.47      0.67      0.55        48
       SDG 9       0.34      0.50      0.40        38

   micro avg       0.48   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Gradient Boosting

In [64]:
from sklearn.ensemble import GradientBoostingClassifier
classifier = MultiOutputClassifier(GradientBoostingClassifier(loss="exponential", learning_rate=0.1, n_estimators=200, criterion="squared_error" ))
classifier.fit(X_train, Y_train)


In [65]:
from sklearn.metrics import classification_report

Y_pred = classifier.predict(X_test)
print(classification_report(Y_test, Y_pred, target_names=sdg_keywords.keys()))

              precision    recall  f1-score   support

       SDG 1       0.00      0.00      0.00        12
      SDG 10       0.25      0.07      0.11        14
       SDG 3       0.85      0.54      0.66        65
      SDG 11       0.85      0.45      0.59        77
      SDG 12       0.30      0.14      0.19        22
      SDG 13       0.75      0.46      0.57        26
      SDG 14       0.75      0.31      0.44        29
      SDG 15       0.75      0.14      0.24        42
      SDG 16       0.82      0.21      0.34        42
      SDG 17       0.64      0.18      0.28        50
       SDG 2       0.60      0.40      0.48        45
       SDG 4       0.66      0.39      0.49        49
       SDG 5       0.89      0.35      0.50        23
       SDG 6       0.25      0.06      0.09        18
       SDG 7       0.40      0.18      0.25        22
       SDG 8       0.60      0.25      0.35        48
       SDG 9       0.71      0.13      0.22        38

   micro avg       0.68   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## ADABoost

In [39]:
from sklearn.ensemble import AdaBoostClassifier
classifier = MultiOutputClassifier(AdaBoostClassifier(learning_rate=0.1, n_estimators=500 ))
classifier.fit(X_train, Y_train)
from sklearn.metrics import classification_report

Y_pred = classifier.predict(X_test)
print(classification_report(Y_test, Y_pred, target_names=sdg_keywords.keys()))

              precision    recall  f1-score   support

       SDG 1       0.84      0.26      0.40        99
      SDG 10       0.75      0.25      0.38       153
       SDG 3       0.77      0.63      0.69       499
      SDG 11       0.80      0.53      0.64       184
      SDG 12       0.87      0.59      0.70       130
      SDG 13       0.84      0.56      0.67       135
      SDG 14       0.86      0.54      0.67       129
      SDG 15       0.70      0.13      0.21       221
      SDG 16       0.74      0.15      0.26       207
      SDG 17       0.72      0.18      0.29       195
       SDG 2       0.80      0.24      0.37       272
       SDG 4       0.72      0.33      0.45       161
       SDG 5       0.77      0.48      0.59       133
       SDG 6       0.71      0.34      0.46       103
       SDG 7       0.79      0.34      0.48       143
       SDG 8       0.73      0.33      0.46       299
       SDG 9       0.82      0.23      0.35       306

   micro avg       0.78   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## KNN 

In [37]:
from sklearn.neighbors import KNeighborsClassifier
classifier = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=4, weights='distance'))
classifier.fit(X_train, Y_train)

from sklearn.metrics import classification_report

Y_pred = classifier.predict(X_test)
print(classification_report(Y_test, Y_pred, target_names=sdg_keywords.keys()))

              precision    recall  f1-score   support

       SDG 1       0.64      0.55      0.59        99
      SDG 10       0.61      0.52      0.56       153
       SDG 3       0.74      0.70      0.72       499
      SDG 11       0.66      0.76      0.70       184
      SDG 12       0.78      0.65      0.71       130
      SDG 13       0.71      0.75      0.73       135
      SDG 14       0.64      0.64      0.64       129
      SDG 15       0.55      0.53      0.54       221
      SDG 16       0.51      0.46      0.49       207
      SDG 17       0.55      0.52      0.54       195
       SDG 2       0.65      0.54      0.59       272
       SDG 4       0.52      0.61      0.56       161
       SDG 5       0.56      0.68      0.61       133
       SDG 6       0.73      0.57      0.64       103
       SDG 7       0.73      0.57      0.64       143
       SDG 8       0.65      0.54      0.59       299
       SDG 9       0.60      0.56      0.58       306

   micro avg       0.64   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM

In [35]:
from sklearn.svm import SVC
classifier = MultiOutputClassifier(SVC(C=50, kernel="rbf" ))
classifier.fit(X_train, Y_train)

from sklearn.metrics import classification_report

Y_pred = classifier.predict(X_test)
print(classification_report(Y_test, Y_pred, target_names=sdg_keywords.keys()))

              precision    recall  f1-score   support

       SDG 1       0.63      0.33      0.44        99
      SDG 10       0.69      0.49      0.57       153
       SDG 3       0.75      0.65      0.70       499
      SDG 11       0.74      0.65      0.69       184
      SDG 12       0.80      0.72      0.76       130
      SDG 13       0.73      0.67      0.70       135
      SDG 14       0.76      0.61      0.68       129
      SDG 15       0.64      0.36      0.46       221
      SDG 16       0.63      0.29      0.40       207
      SDG 17       0.67      0.45      0.54       195
       SDG 2       0.76      0.50      0.60       272
       SDG 4       0.59      0.45      0.51       161
       SDG 5       0.76      0.63      0.69       133
       SDG 6       0.68      0.50      0.57       103
       SDG 7       0.67      0.50      0.57       143
       SDG 8       0.66      0.52      0.58       299
       SDG 9       0.64      0.43      0.51       306

   micro avg       0.70   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Naive Bayes

In [36]:
from sklearn.naive_bayes import GaussianNB
classifier = MultiOutputClassifier(GaussianNB())
classifier.fit(X_train, Y_train)

from sklearn.metrics import classification_report

Y_pred = classifier.predict(X_test)
print(classification_report(Y_test, Y_pred, target_names=sdg_keywords.keys()))

              precision    recall  f1-score   support

       SDG 1       0.08      0.87      0.15        99
      SDG 10       0.27      0.80      0.40       153
       SDG 3       0.39      0.56      0.46       499
      SDG 11       0.13      0.77      0.22       184
      SDG 12       0.09      0.81      0.17       130
      SDG 13       0.10      0.84      0.18       135
      SDG 14       0.09      0.76      0.16       129
      SDG 15       0.15      0.76      0.25       221
      SDG 16       0.14      0.77      0.24       207
      SDG 17       0.14      0.82      0.24       195
       SDG 2       0.16      0.73      0.27       272
       SDG 4       0.13      0.89      0.23       161
       SDG 5       0.10      0.82      0.18       133
       SDG 6       0.08      0.84      0.14       103
       SDG 7       0.09      0.73      0.16       143
       SDG 8       0.17      0.74      0.28       299
       SDG 9       0.22      0.83      0.35       306

   micro avg       0.14   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## MLP

In [25]:
from sklearn.neural_network import MLPClassifier
classifier = MultiOutputClassifier(MLPClassifier(hidden_layer_sizes=300, max_iter=250, random_state=30))
classifier.fit(X_train, Y_train)

from sklearn.metrics import classification_report

Y_pred = classifier.predict(X_test)
print(classification_report(Y_test, Y_pred, target_names=sdg_keywords.keys()))



              precision    recall  f1-score   support

       SDG 1       0.72      0.51      0.60        99
      SDG 10       0.74      0.60      0.66       153
       SDG 3       0.75      0.74      0.75       499
      SDG 11       0.74      0.79      0.76       184
      SDG 12       0.78      0.80      0.79       130
      SDG 13       0.76      0.71      0.73       135
      SDG 14       0.76      0.71      0.73       129
      SDG 15       0.70      0.56      0.62       221
      SDG 16       0.63      0.53      0.58       207
      SDG 17       0.64      0.50      0.56       195
       SDG 2       0.72      0.61      0.66       272
       SDG 4       0.61      0.57      0.59       161
       SDG 5       0.75      0.80      0.77       133
       SDG 6       0.66      0.61      0.63       103
       SDG 7       0.66      0.62      0.64       143
       SDG 8       0.63      0.64      0.64       299
       SDG 9       0.63      0.57      0.60       306

   micro avg       0.70   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
