In [3]:
import pandas as pd
import boto3
import os

In [4]:
# set up S3 client
s3 = boto3.client('s3')

# list all the CSV files in the S3 bucket
bucket_name = 'sector-classification'
prefix = 'Bert_Results_Africa_2015_2023/aggregated_results/'
objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)['Contents']
files = [obj['Key'] for obj in objects if obj['Key'].endswith('.csv')]


In [None]:
files[

In [19]:
combined_df

Unnamed: 0,country,Month,Year,consumer goods average_tone,extractives & minerals processing average_tone,financials average_tone,food & beverage average_tone,health care average_tone,infrastructure average_tone,renewable resources & alternative energy average_tone,...,food & beverage group_size,health care group_size,infrastructure group_size,renewable resources & alternative energy group_size,resource transformation group_size,services group_size,technology & communications group_size,transportation group_size,Total Articles,Average Tones
57,ZI,10,2022,-0.501,0.202,-0.355,-0.200,-1.010,-0.064,-0.805,...,206.0,960.0,144.0,17.0,5333.0,2746.0,687.0,532.0,10880.0,-0.728437
57,ZI,9,2022,2.571,0.445,-0.616,0.249,-1.752,-0.177,-0.869,...,169.0,1042.0,167.0,37.0,5036.0,2214.0,634.0,537.0,10081.0,-0.779057
57,ZI,8,2022,0.098,0.073,-0.011,0.300,-1.920,0.397,-1.334,...,178.0,1076.0,165.0,22.0,4754.0,2136.0,695.0,487.0,9881.0,-0.589076
57,ZI,7,2022,1.705,1.394,0.436,0.592,-1.667,-0.557,0.103,...,155.0,874.0,188.0,19.0,5841.0,2731.0,673.0,525.0,11334.0,-0.881991
57,ZI,6,2022,3.218,0.969,0.326,0.145,-2.150,-0.225,-1.024,...,183.0,826.0,175.0,19.0,5258.0,2101.0,694.0,539.0,10042.0,-1.107878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,AG,7,2015,-0.067,-0.563,0.987,1.106,-2.343,-0.049,-0.825,...,66.0,725.0,158.0,7.0,6750.0,1710.0,1268.0,361.0,11262.0,-2.635059
0,AG,6,2015,0.086,-0.909,1.479,0.322,-2.153,-0.103,-0.603,...,71.0,680.0,98.0,3.0,10803.0,2456.0,1373.0,511.0,16527.0,-3.305134
0,AG,5,2015,1.410,-0.595,0.912,0.819,-1.368,-2.071,0.786,...,70.0,671.0,105.0,8.0,8304.0,1964.0,1186.0,421.0,13477.0,-1.480875
0,AG,4,2015,1.881,-0.309,1.223,1.242,-2.338,0.086,-0.983,...,60.0,637.0,103.0,3.0,8076.0,2372.0,1335.0,341.0,13129.0,-2.971011


In [33]:
# combine all the CSV files into one DataFrame
dfs = []
for file in files:
    obj = s3.get_object(Bucket=bucket_name, Key=file)
    df = pd.read_csv(obj['Body'])
    dfs.append(df)
combined_df = pd.concat(dfs)

# rename 'total' column to 'Total Articles'
combined_df = combined_df.rename(columns={'total': 'Total Articles'})

# rename 'total_average' column to 'Average Tones'
combined_df = combined_df.rename(columns={'total_average': 'Average Tones'})

# sort the DataFrame by country, year, month in descending order
combined_df = combined_df.sort_values(by=['country', 'Year', 'Month'], ascending=True)


In [34]:
# create a mapping of old column names to new column names
col_mapping = {col: col.replace('group_size', '- article count').replace('average_tone', '- tone')
               for col in combined_df.columns if col.endswith(('group_size', 'average_tone'))}

# rename the columns using the mapping
combined_df = combined_df.rename(columns=col_mapping)

# reassign the renamed columns to the original dataframe
combined_df.columns = [col_mapping.get(col, col) for col in combined_df.columns]

In [35]:
combined_df

Unnamed: 0,country,Month,Year,consumer goods - tone,extractives & minerals processing - tone,financials - tone,food & beverage - tone,health care - tone,infrastructure - tone,renewable resources & alternative energy - tone,...,food & beverage - article count,health care - article count,infrastructure - article count,renewable resources & alternative energy - article count,resource transformation - article count,services - article count,technology & communications - article count,transportation - article count,Total Articles,Average Tones
0,AG,3,2015,0.823,-1.748,1.375,0.842,-2.807,-0.334,0.410,...,76.0,729.0,120.0,9.0,10390.0,2251.0,1397.0,795.0,16055.0,-2.734868
0,AG,4,2015,1.881,-0.309,1.223,1.242,-2.338,0.086,-0.983,...,60.0,637.0,103.0,3.0,8076.0,2372.0,1335.0,341.0,13129.0,-2.971011
0,AG,5,2015,1.410,-0.595,0.912,0.819,-1.368,-2.071,0.786,...,70.0,671.0,105.0,8.0,8304.0,1964.0,1186.0,421.0,13477.0,-1.480875
0,AG,6,2015,0.086,-0.909,1.479,0.322,-2.153,-0.103,-0.603,...,71.0,680.0,98.0,3.0,10803.0,2456.0,1373.0,511.0,16527.0,-3.305134
0,AG,7,2015,-0.067,-0.563,0.987,1.106,-2.343,-0.049,-0.825,...,66.0,725.0,158.0,7.0,6750.0,1710.0,1268.0,361.0,11262.0,-2.635059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,ZI,6,2022,3.218,0.969,0.326,0.145,-2.150,-0.225,-1.024,...,183.0,826.0,175.0,19.0,5258.0,2101.0,694.0,539.0,10042.0,-1.107878
57,ZI,7,2022,1.705,1.394,0.436,0.592,-1.667,-0.557,0.103,...,155.0,874.0,188.0,19.0,5841.0,2731.0,673.0,525.0,11334.0,-0.881991
57,ZI,8,2022,0.098,0.073,-0.011,0.300,-1.920,0.397,-1.334,...,178.0,1076.0,165.0,22.0,4754.0,2136.0,695.0,487.0,9881.0,-0.589076
57,ZI,9,2022,2.571,0.445,-0.616,0.249,-1.752,-0.177,-0.869,...,169.0,1042.0,167.0,37.0,5036.0,2214.0,634.0,537.0,10081.0,-0.779057


In [36]:
# write the combined DataFrame to a CSV file
csv_buffer = combined_df.to_csv(index=False)
s3.put_object(Bucket=bucket_name, Key=prefix + 'combined_results.csv', Body=csv_buffer)

print('Combined CSV file saved successfully to S3.')

Combined CSV file saved successfully to S3.
