### Import Libraries

In [1]:
import os
import sys

import pandas as pd

# Make the parent of the current working directory importable so that the
# project packages imported in the next cell (`analysis`, `utils`) resolve.
# The membership check keeps the cell idempotent when it is re-run.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from analysis.StatisticalAnalysis.generalStatistics import GeneralStatistics
from analysis.StatisticalAnalysis.tweetsStatistics import TweetsStatistics
from utils.DataCleaning.CountryDataExtractor import CountryDataExtractor

### Import Data

In [3]:
# Load the global COVID dataset (all countries) from the shared data folder.
covidDataGlobal = pd.read_csv(
    filepath_or_buffer="../../data/owid-covid-data.csv",
)

In [4]:
# Load the US tweets; column names are supplied explicitly via `names`.
tweetsUS = pd.read_csv(
    "../../data/tweetsUS.csv",
    names=["timestamp", "date", "lang", "text"],
)

In [5]:
# Load the GB tweets; column names are supplied explicitly via `names`.
tweetsGB = pd.read_csv(
    "../../data/tweetsGB.csv",
    names=["timestamp", "date", "lang", "text"],
)

In [6]:
# Load the AU tweets; column names are supplied explicitly via `names`.
tweetsAU = pd.read_csv(
    "../../data/tweetsAU.csv",
    names=["timestamp", "date", "lang", "text"],
)

In [7]:
# Load the precomputed sentiment table for the US tweets.
sentimentsUS = pd.read_csv(
    filepath_or_buffer="../../data/sentimentsUS.csv",
)

In [8]:
# Load the precomputed sentiment table for the GB tweets.
sentimentsGB = pd.read_csv(
    filepath_or_buffer="../../data/sentimentsGB.csv",
)

In [9]:
# Load the precomputed sentiment table for the AU tweets.
sentimentsAU = pd.read_csv(
    filepath_or_buffer="../../data/sentimentsAU.csv",
)

In [10]:
# Build one extractor over the global dataset, then pull out the three
# countries under study. The codes look like ISO alpha-3 values matching the
# dataset's `iso_code` column - TODO confirm against CountryDataExtractor.
countryDataExtractor = CountryDataExtractor(covidDataGlobal)

covidDataUS, covidDataGB, covidDataAU = (
    countryDataExtractor.extractCountry(isoCode)
    for isoCode in ("USA", "GBR", "AUS")
)

### Initialize Statistics Class

In [11]:
# One GeneralStatistics wrapper per country, built in US, GB, AU order.
usStatistics, gbStatistics, auStatistics = map(
    GeneralStatistics,
    (covidDataUS, covidDataGB, covidDataAU),
)

In [12]:
# Show the columns available in the US COVID dataset.
# Columns noted by the author (presumably the ones of interest for the cells
# below): new_cases, new_cases_smoothed, new_deaths, new_deaths_smoothed, new_tests
usStatistics.dataset.columns

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       't

In [13]:
# Row(s) where `new_cases` peaks, per country, returned as a (US, GB, AU) tuple.
tuple(
    countryStats.computeMaxDayForCol("new_cases", indexCol="date")
    for countryStats in (usStatistics, gbStatistics, auStatistics)
)

(              date  new_cases
 176994  2022-01-10  1383908.0,
               date  new_cases
 176218  2022-03-21   225992.0,
              date  new_cases
 10489  2022-01-12   175271.0)

In [14]:
# Row(s) where `new_cases_smoothed` peaks, per country, as a (US, GB, AU) tuple.
tuple(
    countryStats.computeMaxDayForCol("new_cases_smoothed", indexCol="date")
    for countryStats in (usStatistics, gbStatistics, auStatistics)
)

(              date  new_cases_smoothed
 176999  2022-01-15          807814.143,
               date  new_cases_smoothed
 176143  2022-01-05          182908.143,
              date  new_cases_smoothed
 10490  2022-01-13          109214.714)

In [15]:
# Row(s) where `new_deaths` peaks, per country, as a (US, GB, AU) tuple.
tuple(
    countryStats.computeMaxDayForCol("new_deaths", indexCol="date")
    for countryStats in (usStatistics, gbStatistics, auStatistics)
)

(              date  new_deaths
 176639  2021-01-20      4411.0,
               date  new_deaths
 175793  2021-01-20      1820.0,
              date  new_deaths
 10568  2022-04-01       357.0)

In [16]:
# Row(s) where `new_deaths_smoothed` peaks, per country, as a (US, GB, AU) tuple.
tuple(
    countryStats.computeMaxDayForCol("new_deaths_smoothed", indexCol="date")
    for countryStats in (usStatistics, gbStatistics, auStatistics)
)

(              date  new_deaths_smoothed
 176632  2021-01-13             3393.429,
               date  new_deaths_smoothed
 175796  2021-01-23             1248.714,
              date  new_deaths_smoothed
 10507  2022-01-30               87.429)

## Statistics on Tweets

In [17]:
# Wrap the US tweets and print the overview: the day(s) with the most tweets
# plus the mean and median tweet counts, reported separately for 2020 and 2021
# (see the printed output of this cell).
tweetsStatistics = TweetsStatistics(tweetsUS)
tweetsStatistics.overview()

day with max tweets 2020: 
           day  count  year
67  2020-04-19    498  2020

day with max tweets 2021: 
            day  count  year
578  2021-09-12    184  2021

mean tweets count 2020: 167.6327160493827

mean tweets count 2021: 62.8021978021978

medan tweets count 2020: 142.5

median tweets count 2021: 56.5


In [18]:
# Wrap the GB tweets and print the overview: the day(s) with the most tweets
# plus the mean and median tweet counts, reported separately for 2020 and 2021
# (see the printed output of this cell).
tweetsStatisticsGB = TweetsStatistics(tweetsGB)
tweetsStatisticsGB.overview()

day with max tweets 2020: 
           day  count  year
57  2020-04-18    207  2020

day with max tweets 2021: 
            day  count  year
318  2021-01-04    142  2021

mean tweets count 2020: 50.78095238095238

mean tweets count 2021: 30.86813186813187

medan tweets count 2020: 46.0

median tweets count 2021: 26.0


In [19]:
# Wrap the AU tweets and print the overview: the day(s) with the most tweets
# plus the mean and median tweet counts, reported separately for 2020 and 2021
# (see the printed output of this cell; ties produce more than one row).
tweetsStatisticsAU = TweetsStatistics(tweetsAU)
tweetsStatisticsAU.overview()

day with max tweets 2020: 
           day  count  year
33  2020-04-01     37  2020
50  2020-04-18     37  2020

day with max tweets 2021: 
            day  count  year
510  2021-07-24     30  2021

mean tweets count 2020: 9.49025974025974

mean tweets count 2021: 7.886111111111111

medan tweets count 2020: 8.0

median tweets count 2021: 6.0


## Statistics on Sentiments

In [20]:
# Reuse GeneralStatistics on the sentiment tables, one wrapper per country
# in US, GB, AU order.
usSentimentsStatistics, gbSentimentsStatistics, auSentimentsStatistics = map(
    GeneralStatistics,
    (sentimentsUS, sentimentsGB, sentimentsAU),
)

In [21]:
# Day(s) with the highest `negPercentage`, per country, as a (US, GB, AU) tuple.
# NOTE(review): the exact effect of `perc=True` is defined in GeneralStatistics
# and is not visible from this notebook.
tuple(
    sentimentStats.computeMaxDayForCol("negPercentage", perc=True)
    for sentimentStats in (
        usSentimentsStatistics,
        gbSentimentsStatistics,
        auSentimentsStatistics,
    )
)

(           day  negPercentage
 15  2020-02-27       0.538462,
             day  negPercentage
 622  2021-11-04       0.741935,
             day  negPercentage
 402  2021-04-06            0.8
 598  2021-10-20            0.8)

In [22]:
# Day(s) with the highest `posPercentage`, per country, as a (US, GB, AU) tuple.
# NOTE(review): the exact effect of `perc=True` is defined in GeneralStatistics
# and is not visible from this notebook.
tuple(
    sentimentStats.computeMaxDayForCol("posPercentage", perc=True)
    for sentimentStats in (
        usSentimentsStatistics,
        gbSentimentsStatistics,
        auSentimentsStatistics,
    )
)

(            day  posPercentage
 578  2021-09-12       0.788043,
             day  posPercentage
 144  2020-07-14       0.666667,
             day  posPercentage
 198  2020-09-13            0.8
 216  2020-10-01            0.8
 224  2020-10-09            0.8
 375  2021-03-10            0.8)

In [23]:
# Month with the highest `posPercentage`, per country, as a (US, GB, AU) tuple.
# The SettingWithCopyWarning shown in this cell's output is raised inside
# computeMaxMonthForCol (the `datasetNoOutliers["month"] = ...` assignment),
# not by this notebook code.
tuple(
    sentimentStats.computeMaxMonthForCol("posPercentage", perc=True)
    for sentimentStats in (
        usSentimentsStatistics,
        gbSentimentsStatistics,
        auSentimentsStatistics,
    )
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datasetNoOutliers["month"] = datasetNoOutliers[indexCol].apply(lambda x:x[:7])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datasetNoOutliers["month"] = datasetNoOutliers[indexCol].apply(lambda x:x[:7])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datasetNoOutliers["month"] = datasetNoOutliers[

(month
 2020-05    0.32502
 Name: posPercentage, dtype: float64,
 month
 2020-02    0.444444
 Name: posPercentage, dtype: float64,
 month
 2020-06    0.431785
 Name: posPercentage, dtype: float64)

In [24]:
# Month with the highest `negPercentage`, per country, as a (US, GB, AU) tuple.
# The SettingWithCopyWarning shown in this cell's output is raised inside
# computeMaxMonthForCol (the `datasetNoOutliers["month"] = ...` assignment),
# not by this notebook code.
tuple(
    sentimentStats.computeMaxMonthForCol("negPercentage", perc=True)
    for sentimentStats in (
        usSentimentsStatistics,
        gbSentimentsStatistics,
        auSentimentsStatistics,
    )
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datasetNoOutliers["month"] = datasetNoOutliers[indexCol].apply(lambda x:x[:7])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datasetNoOutliers["month"] = datasetNoOutliers[indexCol].apply(lambda x:x[:7])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datasetNoOutliers["month"] = datasetNoOutliers[

(month
 2020-02    0.317724
 Name: negPercentage, dtype: float64,
 month
 2020-02    0.416667
 Name: negPercentage, dtype: float64,
 month
 2021-11    0.363528
 Name: negPercentage, dtype: float64)