In [1]:
import tensorflow as tf

In [2]:
tf.config.list_physical_devices('GPU')

[]

In [3]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [4]:
# %ls /gdrive/Shareddrives/

In [5]:
# %cd /gdrive/MyDrive/datasets/

In [6]:
#%cd /gdrive/MyDrive/2023_DSKUS/EDA 
%cd /gdrive/MyDrive/DSKU/datasets/

/gdrive/MyDrive/DSKU/datasets


In [7]:
%ls 

 Cambodia-Thailand-Malaysia.csv
 Cambodia-Thailand-Malaysia.xlsx
 gender_submission.csv
 Inflation-Consumer-Prices-dependent-var.csv
 Inflation-Consumer-Prices-dependent-var.txt
 Inflation-independent-variables.csv
 Inflation_independent_variables_LatinAmerica.csv
 Inflation_independent_variables.xlsx
 results.csv
 test.csv
 train.csv
'us_foreign_aid_complete (1).csv'
 us_foreign_aid_complete.csv
 WorldDevelopmentIndicators.csv


In [8]:
%pwd

'/gdrive/MyDrive/DSKU/datasets'

In [9]:
# load all the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings

warnings.filterwarnings("ignore")

In [10]:
def readCsvFile(filepath):
    """
        Function to read csv file
        args:
            filepath : path to csv file
        return:
            df : dataframe
    """
    
    df = pd.read_csv(filepath)
    df.columns = df.columns.str.lower().str.replace(" ", "_")
    
    df.drop(df.columns[-1], inplace=True, axis = 1)
    
    return df

In [11]:
raw_us_aid = readCsvFile('us_foreign_aid_complete.csv')

In [12]:
raw_us_aid.head(2)

Unnamed: 0,country_id,country_code,country_name,region_id,region_name,income_group_id,income_group_name,income_group_acronym,managing_agency_id,managing_agency_acronym,...,activity_end_date,transaction_type_id,transaction_type_name,fiscal_year,transaction_date,current_dollar_amount,constant_dollar_amount,submission_activity_id,finance_type,dac_flow_id
0,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,1,USAID,...,,2,Obligations,2006,01MAR2006,37760,47793,30831,110,1
1,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,1,USAID,...,,3,Disbursements,2006,01MAR2006,983,1244,30831,110,1


In [13]:
raw_us_aid['fiscal_year'] = np.where(raw_us_aid.fiscal_year=="1976tq", "1976", raw_us_aid.fiscal_year)

In [14]:
raw_us_aid['fiscal_year'] = raw_us_aid['fiscal_year'].astype(int)

In [15]:
# consider latest government
latest_foreign_aid = raw_us_aid[(raw_us_aid.fiscal_year>=2020) & (raw_us_aid.fiscal_year<2023)]

In [16]:
latest_foreign_aid.groupby(['country_code']).agg({'activity_description':'nunique'}).reset_index()

Unnamed: 0,country_code,activity_description
0,ABW,2
1,AFG,2
2,AGO,108
3,AIA,1
4,ALB,426
...,...,...
217,WSM,16
218,YEM,212
219,ZAF,706
220,ZMB,501


In [17]:
latest_foreign_aid[latest_foreign_aid.duplicated(['country_code', 'activity_description'])]

Unnamed: 0,country_id,country_code,country_name,region_id,region_name,income_group_id,income_group_name,income_group_acronym,managing_agency_id,managing_agency_acronym,...,activity_end_date,transaction_type_id,transaction_type_name,fiscal_year,transaction_date,current_dollar_amount,constant_dollar_amount,submission_activity_id,finance_type,dac_flow_id
178,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,1,USAID,...,,2,Obligations,2022,01MAY2022,-430,-415,50529,110,1
179,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,1,USAID,...,,3,Disbursements,2022,01JAN2022,370,357,50529,110,1
185,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,1,USAID,...,,2,Obligations,2020,01FEB2020,-3522,-3522,50939,110,1
186,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,1,USAID,...,,2,Obligations,2022,01JAN2022,3100,2993,50939,110,1
192,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,1,USAID,...,,3,Disbursements,2020,01OCT2019,373,373,50939,110,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3325473,1071,CS-KM,Kosovo,2,Europe and Eurasia,3.0,Upper Middle Income Country,UMIC,16,PC,...,,3,Disbursements,2021,31DEC2020,428,421,157,110,1
3325474,1071,CS-KM,Kosovo,2,Europe and Eurasia,3.0,Upper Middle Income Country,UMIC,16,PC,...,,3,Disbursements,2021,31MAR2021,268,264,157,110,1
3325475,1071,CS-KM,Kosovo,2,Europe and Eurasia,3.0,Upper Middle Income Country,UMIC,16,PC,...,,2,Obligations,2021,01OCT2020,-2679,-2634,474,110,1
3325476,1071,CS-KM,Kosovo,2,Europe and Eurasia,3.0,Upper Middle Income Country,UMIC,16,PC,...,,2,Obligations,2021,01JAN2021,-1719,-1690,474,110,1


# Text analytics
## Clustering activity description using topic modelling

In [18]:
 !pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
# remove stopwords
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

In [21]:
latest_foreign_aid['activity_description'] = latest_foreign_aid['activity_description'].astype(str)

In [22]:
# get text
activities = latest_foreign_aid['activity_description'].tolist()

In [23]:
uniq_activities = set(activities)

In [24]:
len(uniq_activities)

48198

In [25]:
model = BERTopic(
    vectorizer_model=vectorizer_model,
    language='english',
    calculate_probabilities=True,
    verbose=True
)

In [None]:

topics, probs = model.fit_transform(activities)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/20895 [00:00<?, ?it/s]

2023-05-24 15:03:35,745 - BERTopic - Transformed documents to Embeddings
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-26-8786a0fa781f>", line 1, in <cell line: 1>
    topics, probs = model.fit_transform(activities)
  File "/usr/local/lib/python3.10/dist-packages/bertopic/_bertopic.py", line 356, in fit_transform
    umap_embeddings = self._reduce_dimensionality(embeddings, y)
  File "/usr/local/lib/python3.10/dist-packages/bertopic/_bertopic.py", line 2868, in _reduce_dimensionality
    self.umap_model.fit(embeddings, y=y)
  File "/usr/local/lib/python3.10/dist-packages/umap/umap_.py", line 2684, in fit
    self.embedding_, aux_data = self._fit_embed_data(
  File "/usr/local/lib/python3.10/dist-packages/umap/umap_.py", line 2717, in _fit_embed_data
    return simplicial_set_embedding(
  File "/usr/local/lib/python3.10/dist-packages/umap/umap_.py", line 1156, in simplicial_set_embedding
 

In [None]:
%pwd

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-26-d3dc51b95a25>", line 1, in <cell line: 1>
    get_ipython().run_line_magic('pwd', '')
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2418, in run_line_magic
    result = fn(*args, **kwargs)
  File "<decorator-gen-84>", line 2, in pwd
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/magic.py", line 187, in <lambda>
    call = lambda f, *a, **k: f(*a, **k)
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/magics/osm.py", line 295, in pwd
    return os.getcwd()
OSError: [Errno 107] Transport endpoint is not connected

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 20

In [None]:
# Save model
model.save("my_model")	

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-26-8f2f2cf0e969>", line 2, in <cell line: 2>
    model.save("my_model")
  File "/usr/local/lib/python3.10/dist-packages/bertopic/_bertopic.py", line 2739, in save
    with open(path, 'wb') as file:
OSError: [Errno 107] Transport endpoint is not connected: 'my_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'OSError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/ultratb.py", line 1101, i

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-26-8f2f2cf0e969>", line 2, in <cell line: 2>
    model.save("my_model")
  File "/usr/local/lib/python3.10/dist-packages/bertopic/_bertopic.py", line 2739, in save
    with open(path, 'wb') as file:
OSError: [Errno 107] Transport endpoint is not connected: 'my_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'OSError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", lin

In [None]:
# # Load model
# my_model = BERTopic.load("my_model")	