 # Simplificando pandas, lectura y escritura de archivos I

 Importamos las librerias a utilizar

In [2]:
import os
from pathlib import Path
import glob
import numpy as np
import pandas as pd

 # ¿Qué es pandas?

 [**pandas**](https://pandas.pydata.org/)

 **pandas** es una librería de código abierto con licencia BSD que
 proporciona estructuras de datos de alto rendimiento y fáciles de usar
 y herramientas de análisis de datos para el lenguaje de programación Python.

 ## Estructuras de pandas

 ### Series

 Arreglo unidimensional (vector), homogéneo y etiquetado.

 Creación de una Serie con una lista de valores

In [3]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=['a', 'b', 'c', 'd', 'e', 'f'])
s

a    1.0
b    3.0
c    5.0
d    NaN
e    6.0
f    8.0
dtype: float64

  ### Dataframes

 Estructura tabular bidimensional, de tamaño modificable, etiquetada

 con columnas potencialmente heterogéneas.

 Creación de un DataFrame usando un array de NumPy, fecha como índice (index)
 y  columnas etiquetadas:

In [4]:
dates = pd.date_range('20180501', periods=6)
dates

DatetimeIndex(['2018-05-01', '2018-05-02', '2018-05-03', '2018-05-04',
               '2018-05-05', '2018-05-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2018-05-01,1.344101,0.325384,0.215167,1.714706
2018-05-02,-0.243342,-0.646859,0.512364,0.364707
2018-05-03,1.389956,0.65684,0.893667,-0.830456
2018-05-04,-1.039423,0.043354,0.611307,-0.110691
2018-05-05,0.239502,0.485253,-0.321218,2.468113
2018-05-06,0.409115,-0.619463,0.775242,-1.029935


 Creación de un DataFrame usando diccionarios (dic)

In [6]:
df2 = pd.DataFrame({'A': 1.,
                   'B': pd.Timestamp('20130102'),
                   'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                   'D': np.array([3] * 4, dtype='int32'),
                   'E': pd.Categorical(["test", "train", "test", "train"]),
                   'F': 'foo'}, index = ['f', 'g', 'h', 'i'])
df2

Unnamed: 0,A,B,C,D,E,F
f,1.0,2013-01-02,,3,test,foo
g,1.0,2013-01-02,,3,train,foo
h,1.0,2013-01-02,,3,test,foo
i,1.0,2013-01-02,,3,train,foo


 # Módulo pathlib

In [7]:
# directorio actual
dir_prin = Path.cwd()
dir_prin

WindowsPath('c:/Users/Waltter/OneDrive/Canal/Code/07-pandas_csv')

In [8]:
# directorio con los datos
dir_entra = dir_prin/'entradas'
dir_entra

WindowsPath('c:/Users/Waltter/OneDrive/Canal/Code/07-pandas_csv/entradas')

In [9]:
# directorio con los datos intermedios
dir_inter = dir_prin /'intermedios'
dir_inter 

WindowsPath('c:/Users/Waltter/OneDrive/Canal/Code/07-pandas_csv/intermedios')

In [10]:
# directorio con los datos finales
dir_salid = dir_prin / 'salidas'
dir_salid 

WindowsPath('c:/Users/Waltter/OneDrive/Canal/Code/07-pandas_csv/salidas')

 # Lectura desde archivos csv

 ## Un archivo

In [11]:
datos = pd.read_csv(dir_entra / 'ejemplo.csv')

In [12]:
# Observar las primeras filas
datos.head()

Unnamed: 0,2,time,precipIntensity,temperature,apparentTemperature,dewPoint,humidity,pressure,windSpeed,cloudCover,uvIndex,visibility
0,5.0,time,precipIntensity,temperature,apparentTemperature,dewPoint,humidity,pressure,windSpeed,cloudCover,uvIndex,visibility
1,3.0,time,precipIntensity,temperature,apparentTemperature,dewPoint,humidity,pressure,windSpeed,cloudCover,uvIndex,visibility
2,1.0,time,precipIntensity,temperature,apparentTemperature,dewPoint,humidity,pressure,windSpeed,cloudCover,uvIndex,visibility
3,1.0,time,precipIntensity,temperature,apparentTemperature,dewPoint,humidity,pressure,windSpeed,cloudCover,uvIndex,visibility
4,,time,precipIntensity,temperature,apparentTemperature,dewPoint,humidity,pressure,windSpeed,cloudCover,uvIndex,visibility


In [13]:
datos = pd.read_csv(dir_entra / 'ejemplo.csv', skiprows=5)
datos.head(10)

Unnamed: 0.1,Unnamed: 0,time,precipIntensity,temperature,apparentTemperature,dewPoint,humidity,pressure,windSpeed,cloudCover,uvIndex,visibility
0,1,2018-01-01 00:00,0.0,22.44,22.73,18.1,0.77,,0.0,0.19,0.0,10.01
1,2,2018-01-01 01:00,0.0,21.88,22.15,17.75,0.77,,0.0,,0.0,10.01
2,3,2018-01-01 02:00,0.0,21.33,21.63,17.94,0.81,,0.0,,0.0,10.01
3,4,2018-01-01 03:00,0.0,20.77,21.07,17.75,0.83,,0.0,,0.0,10.01
4,5,2018-01-01 04:00,0.0,19.11,19.33,16.79,0.86,,0.0,,0.0,10.01
5,6,2018-01-01 05:00,0.0,21.63,21.8,16.91,0.75,,0.0,,0.0,10.01
6,7,2018-01-01 06:00,0.0,20.23,20.44,17.01,0.82,1014.9,3.85,0.38,0.0,10.01
7,8,2018-01-01 07:00,0.0,21.19,21.37,16.91,0.77,,0.0,0.19,0.0,10.01
8,9,2018-01-01 08:00,0.0,24.0,24.24,17.83,0.68,,3.93,0.19,1.0,10.01
9,10,2018-01-01 09:00,0.0,25.14,25.36,17.63,0.63,1016.29,9.4,0.38,3.0,10.01


In [14]:
# Observar las últimas filas
datos.tail(6)

Unnamed: 0.1,Unnamed: 0,time,precipIntensity,temperature,apparentTemperature,dewPoint,humidity,pressure,windSpeed,cloudCover,uvIndex,visibility
738,739,2018-01-31 18:00,0.0,25.66,25.81,16.86,0.58,,16.0,,0.0,10.01
739,740,2018-01-31 19:00,0.0,23.34,23.42,16.31,0.65,,8.1,,0.0,10.01
740,741,2018-01-31 20:00,0.0,23.0,23.09,16.48,0.67,,6.05,,0.0,10.01
741,742,2018-01-31 21:00,0.0,23.0,23.04,15.99,0.65,,9.66,,0.0,10.01
742,743,2018-01-31 22:00,0.0,22.25,22.29,15.99,0.68,,1.61,,0.0,10.01
743,744,2018-01-31 23:00,0.0,21.69,21.73,15.86,0.69,,1.13,,0.0,10.01


In [15]:
# Dimensiones del Dataframe
datos.shape

(744, 12)

In [16]:
# Nombre de las columnas
datos.columns

Index(['Unnamed: 0', 'time', 'precipIntensity', 'temperature',
       'apparentTemperature', 'dewPoint', 'humidity', 'pressure', 'windSpeed',
       'cloudCover', 'uvIndex', 'visibility'],
      dtype='object')

In [17]:
# Index
datos.index

RangeIndex(start=0, stop=744, step=1)

In [18]:
# Información de las columnas
datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 12 columns):
Unnamed: 0             744 non-null int64
time                   744 non-null object
precipIntensity        744 non-null float64
temperature            744 non-null float64
apparentTemperature    744 non-null float64
dewPoint               744 non-null float64
humidity               744 non-null float64
pressure               92 non-null float64
windSpeed              744 non-null float64
cloudCover             226 non-null float64
uvIndex                550 non-null float64
visibility             739 non-null float64
dtypes: float64(10), int64(1), object(1)
memory usage: 66.9+ KB


In [19]:
# Tipos de datos
datos.dtypes

Unnamed: 0               int64
time                    object
precipIntensity        float64
temperature            float64
apparentTemperature    float64
dewPoint               float64
humidity               float64
pressure               float64
windSpeed              float64
cloudCover             float64
uvIndex                float64
visibility             float64
dtype: object

 Más sobre [tipos de datos en python](https://data-flair.training/blogs/python-variables-and-data-types/)

 ### Eliminar una columna y cambiar el index

In [20]:
datosv2 = datos.drop(['Unnamed: 0'], axis=1)
datosv2 = datosv2.set_index('time')
datosv2.head()

Unnamed: 0_level_0,precipIntensity,temperature,apparentTemperature,dewPoint,humidity,pressure,windSpeed,cloudCover,uvIndex,visibility
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-01 00:00,0.0,22.44,22.73,18.1,0.77,,0.0,0.19,0.0,10.01
2018-01-01 01:00,0.0,21.88,22.15,17.75,0.77,,0.0,,0.0,10.01
2018-01-01 02:00,0.0,21.33,21.63,17.94,0.81,,0.0,,0.0,10.01
2018-01-01 03:00,0.0,20.77,21.07,17.75,0.83,,0.0,,0.0,10.01
2018-01-01 04:00,0.0,19.11,19.33,16.79,0.86,,0.0,,0.0,10.01


In [21]:
# Resumen estadístico
datosv2.describe()

Unnamed: 0,precipIntensity,temperature,apparentTemperature,dewPoint,humidity,pressure,windSpeed,cloudCover,uvIndex,visibility
count,744.0,744.0,744.0,744.0,744.0,92.0,744.0,226.0,550.0,739.0
mean,0.003417,23.676828,23.968911,15.94328,0.631868,1013.697609,6.613763,0.319336,0.949091,10.006238
std,0.08789,3.62472,3.918883,2.557113,0.119676,2.834807,6.118495,0.18931,2.063904,0.071745
min,0.0,15.64,15.64,8.73,0.34,1000.81,0.0,0.0,0.0,8.63
25%,0.0,20.77,20.9675,14.305,0.54,1012.27,1.655,0.19,0.0,10.01
50%,0.0,23.055,23.205,16.245,0.64,1014.045,5.32,0.19,0.0,10.01
75%,0.0,26.465,26.6425,17.8375,0.71,1015.57,9.7625,0.44,0.0,10.01
max,2.3952,32.11,34.26,21.94,1.0,1018.91,31.21,1.0,8.0,10.01


In [22]:
# Valor de correlación lineal entre columnas
datosv2.corr()

Unnamed: 0,precipIntensity,temperature,apparentTemperature,dewPoint,humidity,pressure,windSpeed,cloudCover,uvIndex,visibility
precipIntensity,1.0,-0.01145,-0.007576,0.067761,0.078216,-0.1557024,-0.022784,0.018847,-0.020835,0.002048446
temperature,-0.01145,1.0,0.995988,0.51534,-0.672548,-0.3504611,0.267242,-0.202636,0.66928,-0.004451623
apparentTemperature,-0.007576,0.995988,1.0,0.555388,-0.635117,-0.4108814,0.241933,-0.204376,0.66689,-0.003108271
dewPoint,0.067761,0.51534,0.555388,1.0,0.278692,-0.7794653,-0.230059,-0.137634,0.245027,-0.03346838
humidity,0.078216,-0.672548,-0.635117,0.278692,1.0,-0.3851452,-0.508523,0.111171,-0.563993,-0.01692519
pressure,-0.155702,-0.350461,-0.410881,-0.779465,-0.385145,1.0,0.436197,0.184996,0.091017,1.915374e-13
windSpeed,-0.022784,0.267242,0.241933,-0.230059,-0.508523,0.4361965,1.0,-0.067205,0.308071,0.05356197
cloudCover,0.018847,-0.202636,-0.204376,-0.137634,0.111171,0.184996,-0.067205,1.0,-0.183211,0.1597456
uvIndex,-0.020835,0.66928,0.66689,0.245027,-0.563993,0.09101665,0.308071,-0.183211,1.0,-0.08901127
visibility,0.002048,-0.004452,-0.003108,-0.033468,-0.016925,1.915374e-13,0.053562,0.159746,-0.089011,1.0
