# Exploratory Data Analysis

## Setting up

In [3]:
import os
import pandas as pd
from zipfile import ZipFile
import plotly.express as px

from src.config.general import PATH_DATA_RAW

In [2]:
path_zip = os.path.join(PATH_DATA_RAW, 'Berkeley Earth Surface Temperature Data.zip')

# Read every CSV member of the archive into a DataFrame, keyed by the member's file name.
# Both the archive and each member handle are opened in `with` blocks so they are
# closed deterministically, even if parsing one of the files raises.
dfs = {}
with ZipFile(path_zip) as zip_file:
    for member in zip_file.infolist():
        # Skip anything in the archive that is not a CSV file.
        if not member.filename.endswith('.csv'):
            continue
        with zip_file.open(member) as csv_file:
            dfs[member.filename] = pd.read_csv(csv_file)

In [4]:
# Sanity check: confirm the container holding the loaded tables is a plain dict.
type(dfs)

dict

In [5]:
# List the keys of `dfs`, i.e. the CSV file names that were read from the archive.
dfs.keys()

dict_keys(['GlobalLandTemperaturesByCity.csv', 'GlobalLandTemperaturesByCountry.csv', 'GlobalLandTemperaturesByMajorCity.csv', 'GlobalLandTemperaturesByState.csv', 'GlobalTemperatures.csv'])

In [6]:
# Print the (rows, columns) shape of each loaded table next to its file name.
for filename, frame in dfs.items():
    print(f'{filename}: {frame.shape}')

GlobalLandTemperaturesByCity.csv: (8599212, 7)
GlobalLandTemperaturesByCountry.csv: (577462, 4)
GlobalLandTemperaturesByMajorCity.csv: (239177, 7)
GlobalLandTemperaturesByState.csv: (645675, 5)
GlobalTemperatures.csv: (3192, 9)


## Global Temperature

In [7]:
# Preview the first two rows of the global temperature table to inspect its columns.
dfs['GlobalTemperatures.csv'].head(2)

Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1750-01-01,3.034,3.574,,,,,,
1,1750-02-01,3.083,3.702,,,,,,


### Monthly land temperature over the years

In [25]:
# Line chart of the land-average temperature, one point per row of the
# global temperature table, plotted against the 'dt' column.
global_temperatures = dfs['GlobalTemperatures.csv']
fig = px.line(
    global_temperatures,
    x='dt',
    y="LandAverageTemperature",
    title="Land Temperature by months from 1750 to 2016",
)
fig.show()

### Yearly average land temperature

In [12]:
# Derive a 'year' column from the first four characters of the 'dt' date strings.
# The vectorised .str accessor replaces the per-row lambda and yields NaN for a
# missing 'dt' value instead of raising a TypeError.
dfs['GlobalTemperatures.csv']['year'] = dfs['GlobalTemperatures.csv']['dt'].str[:4]

In [16]:
# Average the land temperature within each year; reset_index() turns the
# 'year' group key back into a regular column so it can be used as the x axis.
df_grouped_by_year = (
    dfs['GlobalTemperatures.csv']
    .groupby('year')["LandAverageTemperature"]
    .mean()
    .reset_index()
)

# Line chart of the yearly averages.
fig = px.line(
    df_grouped_by_year,
    x='year',
    y="LandAverageTemperature",
    title="Land Temperature over decades",
)
fig.show()

### Temperature evolution

In [24]:
# Mean of the yearly averages, used as the baseline for the deviation below.
overall_mean_temperature = df_grouped_by_year['LandAverageTemperature'].mean()

# Deviation of each year's average from that baseline. A plain column-minus-scalar
# subtraction gives the same values as a row-wise apply without the per-row Python call.
df_grouped_by_year['delta_temperature'] = (
    df_grouped_by_year['LandAverageTemperature'] - overall_mean_temperature
)

# Area chart of the yearly deviation; titled so the figure stands on its own.
fig = px.area(
    df_grouped_by_year,
    x='year',
    y='delta_temperature',
    title="Yearly land temperature deviation from the overall mean",
)
fig.show()

In [21]:
# NOTE(review): this cell does not touch the temperature data. It displays Plotly's
# bundled sample stocks table (indexed by date) with 1 subtracted from every value.
# It looks like leftover scratch work — confirm and remove if it is not needed.
px.data.stocks(indexed=True)-1

company,GOOG,AAPL,AMZN,FB,NFLX,MSFT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-01,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2018-01-08,0.018172,0.011943,0.061881,-0.040032,0.053526,0.015988
2018-01-15,0.032008,0.019771,0.053240,-0.029757,0.049860,0.020524
2018-01-22,0.066783,-0.019943,0.140676,0.016858,0.307681,0.066561
2018-01-29,0.008773,-0.082857,0.163374,0.018357,0.273537,0.040708
...,...,...,...,...,...,...
2019-12-02,0.216280,0.546914,0.425061,0.075997,0.463641,0.720717
2019-12-09,0.222821,0.572286,0.432660,0.038855,0.421496,0.752239
2019-12-16,0.224418,0.596800,0.453455,0.104094,0.604362,0.784896
2019-12-23,0.226504,0.656000,0.521226,0.113728,0.567170,0.802472
