In [30]:
#Mount Google Drive in the Colab runtime, then build the dataset path beneath the mount point.
import os
from google.colab import drive

MOUNTPOINT = '/content/gdrive'
#force_remount=True re-mounts even when the drive is already mounted in this session.
drive.mount(MOUNTPOINT, force_remount=True)

#Location of the project folder and of the CSV file inside the mounted drive.
DATADIR = os.path.join(MOUNTPOINT, 'My Drive', 'Project')
path = os.path.join(DATADIR, 'undata.csv')

Mounted at /content/gdrive


In [31]:
#importing common python libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re

#importing required libraries from nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

#importing required libraries from sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

#Hiding all the unnecessary warnings
#NOTE: the "ignore" action with no category or module filter silences every warning
#raised after this point, including deprecation warnings from the libraries used below.
import warnings
warnings.filterwarnings("ignore")

In [32]:
#Load the UN Sustainable Development Goals dataset: the first CSV row supplies the
#column names and the first column (doi) becomes the row index.
df = pd.read_csv(path, index_col=0, header=0)
df.head()

Unnamed: 0_level_0,text_id,text,sdg,labels_negative,labels_positive,agreement
doi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10.6027/9789289342698-7-en,00021941702cd84171ff33962197ca1f,"From a gender perspective, Paulgaard points ou...",5,1,7,0.75
10.18356/eca72908-en,00028349a7f9b2485ff344ae44ccfd6b,Labour legislation regulates maximum working h...,11,2,1,0.333333
10.1787/9789264289062-4-en,0004eb64f96e1620cd852603d9cbe4d4,The average figure also masks large difference...,3,1,6,0.714286
10.1787/5k9b7bn5qzvd-en,0006a887475ccfa5a7f5f51d4ac83d02,The extent to which they are akin to corruptio...,3,1,2,0.333333
10.1787/9789264258211-6-en,0006d6e7593776abbdf4a6f985ea6d95,A region reporting a higher rate will not earn...,3,2,2,0.0


The dataset contains many entries for which the number of negative labels is higher than the number of positive labels, or for which the agreement value is very low. Such entries are not good for training the model, as they actually decrease its accuracy. So we are going to keep only those entries where the agreement value is at least 0.6 and the positive label count is greater than the negative label count.

In [34]:
#Keep only the texts whose suggested SDG label was accepted (labels_positive exceeds
#labels_negative) and whose agreement score is at least 0.6.
#NOTE: this cell rebinds df, so running it again on the already-filtered frame reports
#identical before/after shapes.
print('Shape before:', df.shape)
is_reliable = (df['agreement'] >= .6) & (df['labels_positive'] > df['labels_negative'])
df = df.loc[is_reliable].copy()
print('Shape after :', df.shape)
display(df.head())

Shape before: (17233, 6)
Shape after : (17233, 6)


Unnamed: 0_level_0,text_id,text,sdg,labels_negative,labels_positive,agreement
doi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10.6027/9789289342698-7-en,00021941702cd84171ff33962197ca1f,"From a gender perspective, Paulgaard points ou...",5,1,7,0.75
10.1787/9789264289062-4-en,0004eb64f96e1620cd852603d9cbe4d4,The average figure also masks large difference...,3,1,6,0.714286
10.1787/9789264117563-8-en,000bfb17e9f3a00d4515ab59c5c487e7,The Israel Oceanographic and Limnological Rese...,6,0,3,1.0
10.18356/805b1ae4-en,001180f5dd9a821e651ed51e30d0cf8c,Previous chapters have discussed ways to make ...,2,0,3,1.0
10.1787/9789264310278-en,001f1aee4013cb098da17a979c38bc57,Prescription rates appear to be higher where l...,8,0,3,1.0


In [35]:
#Checking general information about the dataset: index range, non-null count and dtype
#of every column, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17233 entries, 10.6027/9789289342698-7-en to 10.18356/39dd1e2e-en
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   text_id          17233 non-null  object 
 1   text             17233 non-null  object 
 2   sdg              17233 non-null  int64  
 3   labels_negative  17233 non-null  int64  
 4   labels_positive  17233 non-null  int64  
 5   agreement        17233 non-null  float64
dtypes: float64(1), int64(3), object(2)
memory usage: 942.4+ KB


Since all the columns have an equal number of non-null values, there are no missing values to handle and we are good to go ahead. However, we do not need all the columns for modeling, so in the next step we will keep only the columns that are required.

In [36]:
#Taking only the two columns needed for modeling - sdg (the label) and text (the input).
#.copy() makes text_df an independent frame, so the column assignments performed on it
#in later cells do not write into a slice of df (pandas SettingWithCopyWarning).
text_df = df[['sdg', 'text']].copy()
text_df.head()

Unnamed: 0_level_0,sdg,text
doi,Unnamed: 1_level_1,Unnamed: 2_level_1
10.6027/9789289342698-7-en,5,"From a gender perspective, Paulgaard points ou..."
10.1787/9789264289062-4-en,3,The average figure also masks large difference...
10.1787/9789264117563-8-en,6,The Israel Oceanographic and Limnological Rese...
10.18356/805b1ae4-en,2,Previous chapters have discussed ways to make ...
10.1787/9789264310278-en,8,Prescription rates appear to be higher where l...


In [37]:
#Printing one sample entry (the 4th row) from the text column.
#The index holds DOI strings, so the row is selected by position with .iloc; a bare
#integer key on a non-integer index relies on a deprecated positional fallback.
text_df['text'].iloc[3]

'Previous chapters have discussed ways to make food systems more supportive of food security and better nutrition. Nutrition-sensitive food systems can give consumers better options, but ultimately it is consumers who choose what they eat. What consumers choose to eat influences their own nutritional outcomes and sends signals back through the food system - to retailers, processors and producers - that shape both what is produced and how sustainably it is produced.'

In [38]:
#Above we saw that text is an object-dtype column. astype(str) guarantees that every value
#in it is a Python str (the pandas dtype itself stays object).
#assign() returns a new frame rather than writing into text_df in place, which avoids
#pandas' SettingWithCopyWarning when text_df is a slice of df.
text_df = text_df.assign(text=text_df['text'].astype(str))

In [40]:
import nltk

#Stemmer shared by the preprocessing helpers.
porter = PorterStemmer()

#Fetch the stopword corpus (skipped by NLTK when already up to date) and load the
#English stopword list.
nltk.download('stopwords')
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
#Function for removing punctuations from the texts.
def remove_punctuation(description):
    """Return ``description`` with every character of ``string.punctuation`` deleted.

    Characters are removed, not replaced by spaces, so hyphenated or apostrophised
    words are fused together (e.g. "food-system" becomes "foodsystem").
    """
    return "".join(char for char in description if char not in string.punctuation)

#Function for removing the stopwords from the texts.
def remove_stopwords(text):
    """Lowercase the whitespace-separated words of ``text`` and drop the stopwords.

    A word is dropped when its lowercase form is in the module-level ``stop``
    collection. The surviving lowercase words are joined with single spaces.
    """
    kept_words = []
    for word in text.split():
        lowered = word.lower()
        if lowered in stop:
            continue
        kept_words.append(lowered)
    return " ".join(kept_words)

#Function for stemming the words to their root word.
def stemmer(stem_text):
    """Stem each whitespace-separated word of ``stem_text`` with the shared ``porter``
    stemmer and return the stems joined by single spaces."""
    return " ".join(map(porter.stem, stem_text.split()))

In [42]:
#Applying the cleaning functions to the text column, in order:
#punctuation removal -> stopword removal (which also lowercases) -> stemming.
#The chain is computed once and attached with assign(), which returns a new frame instead
#of writing into text_df three times in place (pandas SettingWithCopyWarning when text_df
#is a slice of df).
#NOTE: re-running this cell applies the same steps again to the already-cleaned text.
cleaned_text = (
    text_df['text']
    .apply(remove_punctuation)
    .apply(remove_stopwords)
    .apply(stemmer)
)
text_df = text_df.assign(text=cleaned_text)
text_df.head()

Unnamed: 0_level_0,sdg,text
doi,Unnamed: 1_level_1,Unnamed: 2_level_1
10.6027/9789289342698-7-en,5,gender perspect paulgaard point labour market ...
10.1787/9789264289062-4-en,3,averag figur also mask larg differ across regi...
10.1787/9789264117563-8-en,6,israel oceanograph limnolog research station m...
10.18356/805b1ae4-en,2,previou chapter discuss way make food system s...
10.1787/9789264310278-en,8,prescript rate appear higher labour forc parti...


In [43]:
#Let's see one sample entry (the 2nd row) from the text column after the transformation.
#The index holds DOI strings, so the row is selected by position with .iloc; a bare
#integer key on a non-integer index relies on a deprecated positional fallback.
text_df['text'].iloc[1]

'averag figur also mask larg differ across region kazakhstan number annual contact rang 20 astana 97 mangystau part popul like limit access primari care addit poor coverag outpati prescript medicin limit effect appeal care phc level'

In [44]:
#Split into train and test sets: 25% of the rows are held out for testing and the fixed
#random_state keeps the split identical between runs.
X, y = text_df['text'], text_df['sdg']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.25
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((12924,), (4309,), (12924,), (4309,))

In [45]:
#Logistic Regression model: unigram + bigram counts (terms must appear in at least
#5 documents) -> TF-IDF weighting -> LogisticRegression with default settings.
model_log = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2), min_df=5)),
    ('tfidf', TfidfTransformer()),
    ('model', LogisticRegression()),
])
model_log.fit(X_train, y_train)

#Evaluate on the held-out test split.
pred = model_log.predict(X_test)
ytest = np.array(y_test)
print('accuracy %s' % accuracy_score(y_test, pred))
print(classification_report(ytest, pred))

accuracy 0.872360176375029
              precision    recall  f1-score   support

           1       0.83      0.85      0.84       296
           2       0.82      0.82      0.82       199
           3       0.89      0.94      0.92       434
           4       0.93      0.96      0.94       603
           5       0.92      0.97      0.95       558
           6       0.91      0.93      0.92       346
           7       0.87      0.94      0.90       395
           8       0.72      0.70      0.71       214
           9       0.71      0.75      0.73       165
          10       0.77      0.37      0.50       117
          11       0.84      0.87      0.86       300
          12       0.77      0.34      0.47        71
          13       0.87      0.86      0.87       296
          14       0.94      0.90      0.92       179
          15       0.90      0.76      0.83       136

    accuracy                           0.87      4309
   macro avg       0.85      0.80      0.81      4309

In [46]:
#SGD Classifier model: default word counts -> TF-IDF weighting -> linear classifier
#trained with stochastic gradient descent (modified Huber loss, L2 penalty).
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    #tol=None turns off the tolerance-based stopping check, so training runs for max_iter epochs.
    ('clf', SGDClassifier(loss='modified_huber', penalty='l2', alpha=1e-3,
                          max_iter=100, tol=None, random_state=42)),
])
#fit() returns the fitted pipeline itself, so classifier and pipeline are the same object.
classifier = pipeline.fit(X_train, y_train)

#Evaluate on the held-out test split.
y_pred = classifier.predict(X_test)
ytest = np.array(y_test)
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(ytest, y_pred))

accuracy 0.875609190067301
              precision    recall  f1-score   support

           1       0.83      0.85      0.84       296
           2       0.81      0.84      0.82       199
           3       0.90      0.95      0.93       434
           4       0.92      0.97      0.94       603
           5       0.91      0.97      0.94       558
           6       0.91      0.94      0.92       346
           7       0.87      0.93      0.90       395
           8       0.72      0.67      0.69       214
           9       0.77      0.73      0.75       165
          10       0.76      0.38      0.51       117
          11       0.88      0.87      0.87       300
          12       0.77      0.42      0.55        71
          13       0.89      0.85      0.87       296
          14       0.94      0.94      0.94       179
          15       0.87      0.78      0.82       136

    accuracy                           0.88      4309
   macro avg       0.85      0.81      0.82      4309

In [47]:
#Multinomial Naive Bayes model: unigram + bigram counts (terms must appear in at least
#5 documents) -> TF-IDF weighting -> MultinomialNB with default settings.
nbc = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2), min_df=5)),
    ('tfidf', TfidfTransformer()),
    ('model', MultinomialNB()),
])
nbc.fit(X_train, y_train)

#Evaluate on the held-out test split.
pred_y = nbc.predict(X_test)
ytest = np.array(y_test)
print('accuracy %s' % accuracy_score(y_test, pred_y))
print(classification_report(ytest, pred_y))

accuracy 0.7203527500580181
              precision    recall  f1-score   support

           1       0.84      0.66      0.74       296
           2       0.93      0.47      0.62       199
           3       0.85      0.93      0.89       434
           4       0.71      0.97      0.82       603
           5       0.55      0.99      0.71       558
           6       0.77      0.87      0.82       346
           7       0.61      0.95      0.74       395
           8       0.83      0.02      0.05       214
           9       1.00      0.06      0.11       165
          10       0.00      0.00      0.00       117
          11       0.86      0.71      0.78       300
          12       0.00      0.00      0.00        71
          13       0.88      0.77      0.82       296
          14       0.96      0.60      0.74       179
          15       1.00      0.24      0.38       136

    accuracy                           0.72      4309
   macro avg       0.72      0.55      0.55      4309

In [48]:
#Random Forest Classifier: unigram + bigram counts (terms must appear in at least
#5 documents) -> TF-IDF weighting -> forest of 50 trees.
#random_state is fixed because the forest is built with randomness; without a seed the
#reported accuracy changes on every run.
rf = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
               ('tfidf', TfidfTransformer()),
               ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
               ])

rf.fit(X_train, y_train)

#Evaluate on the held-out test split (accuracy_score takes y_true first, then y_pred).
ytest = np.array(y_test)
preds = rf.predict(X_test)
print('accuracy %s' % accuracy_score(y_test, preds))
print(classification_report(ytest, preds))

accuracy 0.8275702019029937
              precision    recall  f1-score   support

           1       0.79      0.82      0.81       296
           2       0.75      0.80      0.78       199
           3       0.89      0.94      0.91       434
           4       0.86      0.96      0.90       603
           5       0.87      0.96      0.91       558
           6       0.85      0.92      0.88       346
           7       0.79      0.93      0.85       395
           8       0.63      0.50      0.55       214
           9       0.71      0.53      0.61       165
          10       0.76      0.27      0.40       117
          11       0.80      0.76      0.78       300
          12       0.92      0.15      0.27        71
          13       0.83      0.82      0.83       296
          14       0.91      0.89      0.90       179
          15       0.89      0.66      0.76       136

    accuracy                           0.83      4309
   macro avg       0.82      0.73      0.74      4309

In [49]:
#Gradient Boosting Classifier: unigram + bigram counts (terms must appear in at least
#5 documents) -> TF-IDF weighting -> 50 boosting stages.
#random_state is fixed so that the fitted model and the reported scores are reproducible
#between runs.
model_gb = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
                    ('tfidf', TfidfTransformer()),
                    ('gb', GradientBoostingClassifier(n_estimators=50, random_state=42)),
                    ])

model_gb.fit(X_train, y_train)

#Evaluate on the held-out test split (accuracy_score takes y_true first, then y_pred).
ytest = np.array(y_test)
predicted = model_gb.predict(X_test)
print('accuracy %s' % accuracy_score(y_test, predicted))
print(classification_report(ytest, predicted))

accuracy 0.8410304014852634
              precision    recall  f1-score   support

           1       0.81      0.79      0.80       296
           2       0.77      0.81      0.79       199
           3       0.77      0.91      0.84       434
           4       0.93      0.94      0.93       603
           5       0.92      0.95      0.94       558
           6       0.87      0.92      0.90       346
           7       0.88      0.90      0.89       395
           8       0.65      0.65      0.65       214
           9       0.72      0.65      0.68       165
          10       0.56      0.38      0.46       117
          11       0.84      0.77      0.81       300
          12       0.58      0.44      0.50        71
          13       0.87      0.84      0.86       296
          14       0.92      0.88      0.90       179
          15       0.86      0.75      0.80       136

    accuracy                           0.84      4309
   macro avg       0.80      0.77      0.78      4309

In [50]:
#Test-set accuracy of every model, computed from the predictions made in the cells above.
#Order: logistic regression, SGD classifier, naive Bayes, random forest, gradient boosting.
#NOTE: svm_acc holds the SGD classifier's accuracy (no SVC model is trained above).
log_acc, svm_acc, nb_acc, rf_acc, gb_acc = (
    accuracy_score(y_test, model_predictions)
    for model_predictions in (pred, y_pred, pred_y, preds, predicted)
)

In [51]:
#Build a model-name / accuracy table and show it with the best score first.
models = pd.DataFrame(
    [
        ('Logistic Regression', log_acc),
        ('SGD Classifier', svm_acc),
        ('Naive Bayes', nb_acc),
        ('Random Forest', rf_acc),
        ('Gradient Boosting', gb_acc),
    ],
    columns=['Model', 'Score'],
)
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
1,SGD Classifier,0.875609
0,Logistic Regression,0.87236
4,Gradient Boosting,0.84103
3,Random Forest,0.82757
2,Naive Bayes,0.720353


From the above table we can see that the SGD Classifier gives the highest accuracy on the UN data, so we will use the SGD Classifier for the final modeling.