# Analyzing chart data completeness

Here, we check how complete the Top 200 charts data extracted in the previous step is.

## Load data

In [2]:
from helpers import create_data_path
import pandas as pd

In [3]:
kaggle_top200 = pd.read_parquet(create_data_path("kaggle_top200.parquet"))

## Get number of missing chart entries

In [4]:
kaggle_top200.dtypes

title       object
pos          int64
date        object
artist      object
url         object
region      object
trend       object
streams    float64
dtype: object

In [5]:
kaggle_top200.date = pd.to_datetime(kaggle_top200.date)

If the data was complete, we would have 200 tracks for every combination of region and date.

Let's get an overview of what we actually have:

In [6]:
track_counts_per_region_and_date = kaggle_top200.groupby(['region', 'date']).size().reset_index(name='track_count').sort_values(['region', 'date'])


In [7]:
track_counts_per_region_and_date

Unnamed: 0,region,date,track_count
0,Argentina,2017-01-01,200
1,Argentina,2017-01-02,200
2,Argentina,2017-01-03,200
3,Argentina,2017-01-04,200
4,Argentina,2017-01-05,200
...,...,...,...
109986,Vietnam,2021-11-26,200
109987,Vietnam,2021-11-27,200
109988,Vietnam,2021-11-28,200
109989,Vietnam,2021-11-29,200


In [8]:
region_count = track_counts_per_region_and_date.region.nunique()

In [9]:
start = track_counts_per_region_and_date.date.min()
end = track_counts_per_region_and_date.date.max()
start, end

(Timestamp('2017-01-01 00:00:00'), Timestamp('2021-12-31 00:00:00'))

In [10]:
delta = end.to_pydatetime() - start.to_pydatetime()
days_from_start_to_end = delta.days

days_from_start_to_end 

1825

In [11]:
expected_track_count_rowcount = region_count * (days_from_start_to_end + 1)
expected_track_count_rowcount

125994

In [12]:
track_counts_per_region_and_date.shape[0]

109991

Looks like quite a few values are non-existent...

In [13]:
track_counts_per_region_and_date.query('track_count < 200').size

58674

In [14]:
track_counts_per_region_and_date.query('track_count < 50').size

2796

## Missing values by region

In [15]:
missing_perc_per_region = (
    (
    (days_from_start_to_end + 1) - 
    track_counts_per_region_and_date.groupby('region').size()
    ) / (days_from_start_to_end + 1)
).sort_values(ascending=False)

In [16]:
missing_perc_per_region

region
Luxembourg       0.837349
South Korea      0.834611
Russia           0.723987
Ukraine          0.723987
Egypt            0.569003
                   ...   
United States    0.002738
Argentina        0.002191
Austria          0.002191
Bolivia          0.001643
Brazil           0.001643
Length: 69, dtype: float64

In [17]:
missing_perc_per_region[missing_perc_per_region > 0.1]

region
Luxembourg              0.837349
South Korea             0.834611
Russia                  0.723987
Ukraine                 0.723987
Egypt                   0.569003
Morocco                 0.525739
Saudi Arabia            0.500548
United Arab Emirates    0.489595
India                   0.447974
Nicaragua               0.397590
Bulgaria                0.366375
South Africa            0.299014
Israel                  0.256298
Vietnam                 0.256298
Romania                 0.256298
Estonia                 0.153888
Thailand                0.148412
Lithuania               0.114458
dtype: float64

In [18]:
missing_perc_per_region[missing_perc_per_region < 0.025].shape[0]

50

## Identify regions with good data coverage

In [19]:
regions_with_good_coverage = missing_perc_per_region[missing_perc_per_region < 0.025].index.tolist()
regions_with_good_coverage

['Slovakia',
 'Netherlands',
 'Portugal',
 'Panama',
 'Finland',
 'Germany',
 'Peru',
 'Poland',
 'Chile',
 'Norway',
 'Taiwan',
 'Turkey',
 'Guatemala',
 'Spain',
 'Czech Republic',
 'Sweden',
 'Colombia',
 'Costa Rica',
 'Philippines',
 'Paraguay',
 'Belgium',
 'Uruguay',
 'Singapore',
 'France',
 'New Zealand',
 'El Salvador',
 'Greece',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'Denmark',
 'Ireland',
 'Indonesia',
 'Italy',
 'Japan',
 'Ecuador',
 'Dominican Republic',
 'Malaysia',
 'Mexico',
 'Switzerland',
 'United Kingdom',
 'Canada',
 'Australia',
 'Global',
 'United States',
 'Argentina',
 'Austria',
 'Bolivia',
 'Brazil']

In [20]:
def write_list_to_file(file_path, string_list):
    with open(file_path, 'w') as file:
        for item in string_list:
            file.write(item + '\n')

In [21]:
write_list_to_file(create_data_path('regions.txt'), regions_with_good_coverage)