# 1.2 Stages of data analysis

1. Investigate information requirements
2. Data collection
3. Data organisation 
4. Data storage 
5. Data cleansing
6. Data manipulation
7. Presentation of findings 

## Setup

In [1]:
import micropip
await micropip.install(["pyoliteutils", "xlrd"])

In [2]:
# NOTE(review): the star import is presumably what brings helpers such as
# mm(), get_file_from_url() and load_file_into_in_mem_filesystem() into scope
# for the cells below — confirm against the pyoliteutils package.
from pyoliteutils import *
import pyoliteutils
import pandas as pd
import matplotlib.pyplot as plt

# Show the installed pyoliteutils version as this cell's output.
pyoliteutils.__version__

'0.0.10'

## 1.2 Stages of data analysis

In [3]:
# Draw the seven stages of data analysis as a mindmap via the mm() helper.
# The diagram source is a raw string so that the backslash in "\?" is passed
# through unchanged instead of being parsed as an (invalid) Python escape
# sequence, which triggers a warning on recent Python versions.
mm(r"""
mindmap
  root{{Stages of data analysis}}
    1(Investigate information requirements)
        What do we need to know to make a decision\?
            e.g. market share, particulates in the air, testing of new drugs
    2(Data collection)
        How can we gather that data
            e.g. observations, interviews, review of existing data
    3(Data organisation) 
        How to organise the data so we can work with it
            e.g. digitalisation, transcription, sorting, data mining
    4(Data storage)
        How and where to store the data
            e.g. in-house, external
    5(Data cleansing) 
        Tidying up the data
            e.g. errors, missing elements, duplicates
    6(Data manipulation) 
        Working with the information to find the knowledge
            e.g. arranging, collating, aggregating, interpreting, correlation
    7(Presentation of findings) 
        Seeing the knowledge so you can make Wise decision
            e.g. tables, charts, graphs, dashboard, reports
""")

## 1: Investigate Information Requirements

### What do we need to know to make a decision?

Is a new drug effective?

(e.g. market share, particulates in the air, testing of new drugs)

## 2: Data Collection

### How can we gather that data?

e.g. 
* observations
* interviews
* review of existing data
* making sure IT systems record relevant data

## 3: Data Organisation 

### How to organise the data so we can work with it

e.g. 
* digitalisation
* transcription
* sorting
* data mining

## 4: Data Storage

### How and where to store the data

e.g. 
* in-house
* external

In [None]:

import json
IBMfile = await get_file_from_url("https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=IBM&apikey=demo")

data = None
with open(IBMfile) as f:
    data = json.load(f)

#display(data)

df = pd.DataFrame(data["Time Series (Daily)"])
df

In [None]:
# URL of the Vostok temperature core data file (hosted on data.ess-dive.lbl.gov).
vostok_temp_core_url = "https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-1e57f3f83864c10-20180717T104354142744"
# Fetch the file with the helper; the result is handed to pd.read_table in the next cell.
# NOTE(review): presumably returns a path inside the in-memory filesystem — confirm against pyoliteutils.
vostok_temp_core_file = await load_file_into_in_mem_filesystem(vostok_temp_core_url)

In [None]:
# Parse the Vostok ice-core table.
# - skiprows=58 skips the leading lines of the file before the table starts.
# - Columns are separated by runs of two or more whitespace characters; the
#   pattern is a raw string so "\s" is a regex token, not an invalid Python
#   string escape.
# - A multi-character regex separator is only supported by the python parser
#   engine, so request it explicitly instead of relying on a warned fallback.
# - header=0 together with names= replaces the file's own header row with
#   the descriptive column names below.
vostok_temp_core_df = pd.read_table(
    vostok_temp_core_file,
    skiprows=58,
    sep=r"\s\s+",
    engine="python",
    header=0,
    names=[
        "Depth (m)",
        "Age of the ice (yr BP)",
        "Deuterium content of the ice (delta D)",
        "Temperature Variation (deg C)",
    ],
)
vostok_temp_core_df.head()


## 5: Data Cleansing

e.g.
* errors
* missing elements
* duplicates

## 6: Data Manipulation

e.g. 
* arranging
* collating
* aggregating
* interpreting
* correlation

## 7: Presentation of Findings

e.g. 
* tables
* charts
* graphs
* dashboard
* reports

In [None]:
#datasetfile = await get_file_from_url("https://qualifications.pearson.com/content/dam/pdf/A%20Level/Mathematics/2017/specification-and-sample-assesment/Pearson%20Edexcel%20GCE%20AS%20and%20AL%20Mathematics%20data%20set%20-%20Issue%201%20(1).xls")
#datasetfile

In [None]:
#sheets = pd.read_excel(datasetfile, sheet_name=None, header=5, parse_dates=True, skipfooter=4) #d,  date_format=None ate_parser=_NoDefault.no_default,
#keys(sheets)
#sheets

In [None]:
co2file = await get_file_from_url("https://raw.githubusercontent.com/UTCSheffield/OCR-Unit-7-Data-analysis-and-design/main/content/data/annual-co2-emissions-per-country.csv")
tempfile = await get_file_from_url("https://raw.githubusercontent.com/UTCSheffield/OCR-Unit-7-Data-analysis-and-design/main/content/data/temperature-anomaly.csv")

co2df = pd.read_csv(co2file)
tempdf = pd.read_csv(tempfile)


In [None]:
# Keep only the rows whose Entity is "World", and only the two columns
# needed for plotting.
is_world = co2df["Entity"] == "World"
worldco2 = co2df.loc[is_world, ["Year", "Annual CO₂ emissions"]]

# Display the selection indexed by year. The result is not assigned, so
# worldco2 itself keeps "Year" as an ordinary column for the plots below.
worldco2.set_index("Year")



In [None]:
plt.style.use('_mpl-gallery-nogrid')

# Static line chart of world CO₂ emissions over time.
# An explicit figure size leaves room for the title and axis labels.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(worldco2["Year"], worldco2["Annual CO₂ emissions"])

# Label the chart so it can be read without the surrounding code.
ax.set_title("World annual CO₂ emissions")
ax.set_xlabel("Year")
ax.set_ylabel("Annual CO₂ emissions")

plt.show()



In [None]:
from bokeh.plotting import figure, show

co2_column = "Annual CO₂ emissions"

# Keep the original 0–3.5e10 window as a minimum, but grow the upper bound
# when the data exceeds it so the most recent years are not clipped.
co2_y_max = max(3.5e10, worldco2[co2_column].max() * 1.05)

# Interactive chart of world CO₂ emissions with a title and axis labels.
p1 = figure(
    title="Annual CO₂ emissions",
    y_range=(0, co2_y_max),
    x_axis_label="Year",
    y_axis_label=co2_column,
    height=400,
    width=1000,
)

# The legend names the plotted series (it previously read "Temp.", copied
# from the temperature chart).
p1.line(worldco2["Year"], worldco2[co2_column], legend_label="CO₂ emissions", line_width=2)

# show the results
show(p1)

In [None]:
# Keep only the rows whose Entity is "Global", with the year and the
# anomaly column used by the temperature charts.
is_global = tempdf["Entity"] == "Global"
worldtemps = tempdf.loc[
    is_global,
    ["Year", "Global average temperature anomaly relative to 1961-1990"],
]
worldtemps

In [None]:
from bokeh.plotting import figure, show

# The anomaly column name doubles as the y-axis label.
anomaly_column = "Global average temperature anomaly relative to 1961-1990"

# Interactive chart of the global temperature anomaly.
p2 = figure(
    title="Global average temperature anomaly",
    x_axis_label="Year",
    y_axis_label=anomaly_column,
    y_range=(-1, 1),
    height=400,
    width=1000,
)

# One line for the anomaly series, labelled in the legend.
p2.line(
    worldtemps["Year"],
    worldtemps[anomaly_column],
    legend_label="Temp.",
    line_width=2,
)

# Render the chart.
show(p2)

In [None]:
from bokeh.layouts import row, column

# Stack the two existing charts vertically (p1 above p2) and display them together.
stacked_charts = column(p1, p2)
show(stacked_charts)

In [None]:
#3.5e+10

In [None]:
# Imports are kept as in the original cell so that any later cell relying on
# these names keeps working.
from numpy import arange, linspace, pi, sin

from bokeh.io import output_notebook
from bokeh.layouts import column
from bokeh.models import (CustomJS, LinearAxis, Range1d, Select,
                          WheelZoomTool, ZoomInTool, ZoomOutTool)
from bokeh.palettes import Bokeh6
from bokeh.plotting import figure, show

output_notebook()

# Twin-axis chart: temperature anomaly on the left y-axis and world CO₂
# emissions on a second y-axis on the right, sharing the year x-axis.
temp_column = "Global average temperature anomaly relative to 1961-1990"
co2_column = "Annual CO₂ emissions"

blue, red = Bokeh6[5], Bokeh6[0]

p = figure(
    title="Global temperature anomaly and annual CO₂ emissions",
    x_axis_label="Year",
    y_axis_label=temp_column,
    y_range=(-1, 1),
)
p.background_fill_color = "#fafafa"

# Temperature series on the default (left) y-axis.
p.line(worldtemps["Year"], worldtemps[temp_column], legend_label="Temp.", color=blue, line_width=2)
# Colour the left axis label before the second axis is added, so only the
# temperature axis is affected.
p.yaxis.axis_label_text_color = blue

# Second y-range for CO₂. Both series share the year x-axis, so only a y-range
# is registered; the original referenced an x-range "foo" that was never defined.
# At least the original 0–3.5e10 window, widened if the data exceeds it.
co2_y_max = max(3.5e10, worldco2[co2_column].max() * 1.05)
p.extra_y_ranges['foo'] = Range1d(0, co2_y_max)

# CO₂ series drawn against the extra y-range.
co2_line = p.line(
    worldco2["Year"],
    worldco2[co2_column],
    legend_label="CO₂ emissions",
    color=red,
    line_width=2,
    y_range_name="foo",
)

# Matching axis for the CO₂ range, on the right and coloured like its line.
ax2 = LinearAxis(axis_label=co2_column, y_range_name="foo")
ax2.axis_label_text_color = red
p.add_layout(ax2, 'right')

# Both series rise towards the right, so keep the legend out of their way.
p.legend.location = "top_left"

try:
    show(p)
except ImportError as err:
    # Displaying is best-effort, as in the original cell, but the reason is
    # now reported instead of being silently discarded.
    print(f"Could not display the plot: {err}")
