# AGGREGATION & COMBINATION

In [77]:
import numpy as np
import pandas as pd

## Agregación de datos

`groupby, tabla dinámica y tabla cruzada`

In [78]:
# Load the avocado dataset and discard the leftover "Unnamed: 0" column in the same expression.
df = pd.read_csv("../datasets/avocado_kaggle.csv").drop(columns="Unnamed: 0")

In [79]:
# Peek at one random row. The seed is fixed so the row shown is the same
# after every Restart & Run All.
df.sample(random_state=42)

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
12932,2016-10-09,1.48,3479.15,133.02,1800.96,0.0,1545.17,677.65,867.52,0.0,organic,2016,Indianapolis


In [80]:
# Distinct levels of the `type` column.
df.loc[:, "type"].unique()

array(['conventional', 'organic'], dtype=object)

In [81]:
# Distinct levels of the `region` column.
df.region.unique()

array(['Albany', 'Atlanta', 'BaltimoreWashington', 'Boise', 'Boston',
       'BuffaloRochester', 'California', 'Charlotte', 'Chicago',
       'CincinnatiDayton', 'Columbus', 'DallasFtWorth', 'Denver',
       'Detroit', 'GrandRapids', 'GreatLakes', 'HarrisburgScranton',
       'HartfordSpringfield', 'Houston', 'Indianapolis', 'Jacksonville',
       'LasVegas', 'LosAngeles', 'Louisville', 'MiamiFtLauderdale',
       'Midsouth', 'Nashville', 'NewOrleansMobile', 'NewYork',
       'Northeast', 'NorthernNewEngland', 'Orlando', 'Philadelphia',
       'PhoenixTucson', 'Pittsburgh', 'Plains', 'Portland',
       'RaleighGreensboro', 'RichmondNorfolk', 'Roanoke', 'Sacramento',
       'SanDiego', 'SanFrancisco', 'Seattle', 'SouthCarolina',
       'SouthCentral', 'Southeast', 'Spokane', 'StLouis', 'Syracuse',
       'Tampa', 'TotalUS', 'West', 'WestTexNewMexico'], dtype=object)

In [82]:
# Distinct years present in the data.
df.year.unique()

array([2015, 2016, 2017, 2018])

### groupby
La función GroupBy de Pandas es una función potente y versátil en Python. Nos permite dividir los datos en grupos separados para realizar cálculos que permitan un mejor análisis.
Un DataFrame se puede agrupar por sus filas (`axis=0`, el caso habitual) o por sus columnas (`axis=1`, obsoleto desde pandas 2.1). Una vez hecho esto, se aplica una función a cada grupo, lo que produce un nuevo valor. Finalmente, los resultados de todas esas aplicaciones de funciones se combinan en un objeto de resultado. La forma del objeto resultante generalmente dependerá de lo que se haga con los datos.

`df[subset].groupby(category).aggregation()`

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html

In [83]:
# Column labels of the loaded frame.
df.columns

Index(['Date', 'AveragePrice', 'Total Volume', '4046', '4225', '4770',
       'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'type', 'year',
       'region'],
      dtype='object')

In [84]:
# precio medio por año
# ciudades con más filas
# precio por type

In [85]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,year
count,18249.0,18249.0,18249.0,18249.0,18249.0,18249.0,18249.0,18249.0,18249.0,18249.0
mean,1.405978,850644.0,293008.4,295154.6,22839.74,239639.2,182194.7,54338.09,3106.426507,2016.147899
std,0.402677,3453545.0,1264989.0,1204120.0,107464.1,986242.4,746178.5,243966.0,17692.894652,0.939938
min,0.44,84.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015.0
25%,1.1,10838.58,854.07,3008.78,0.0,5088.64,2849.42,127.47,0.0,2015.0
50%,1.37,107376.8,8645.3,29061.02,184.99,39743.83,26362.82,2647.71,0.0,2016.0
75%,1.66,432962.3,111020.2,150206.9,6243.42,110783.4,83337.67,22029.25,132.5,2017.0
max,3.25,62505650.0,22743620.0,20470570.0,2546439.0,19373130.0,13384590.0,5719097.0,551693.65,2018.0


In [86]:
# averageprice en el centil 75 es 1.66
    # un 25% está por encima
    # el 75 % es igual o está por debajo

# centil: qué proporción de datos quedan por debajo de ese valor
    # el valor NO es 75; el valor es 1.66
    # El 75% de los aguacates: valen igual o menos que 1.66
    # y el 25%: vale más de 1.66


# 4 kg y está en el centil 80%
    # que pesa más que el 80% de su grupo
    # hay un 20% de bebés de su grupo que pesan más de 4kg

# centil 75% -> el 75% de los datos es igual o queda por debajo de ese valor

# centil 50: por debajo del valor que corresponda con el centil 50, estará el 50% de los datos
    # y el 50% por encima

# media y mediana:
    # media: el valor medio
    # mediana: el valor que me separa el 50% de los datos -> centil 50%
    # media: sensible a outliers
    # mediana: menos sensible a los datos atípicos 

In [87]:
# Look at one random row again before grouping.
df.sample(n=1)

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
4883,2016-01-31,0.99,121477.84,32756.64,52095.37,135.09,36490.74,34167.03,2323.71,0.0,conventional,2016,Roanoke


In [88]:
# GROUP BY categorical columns (dimensions with a handful of levels);
# AGGREGATE quantitative columns (measures).
# NOTE: the final column selection after reset_index() drops the `type`
# labels, so the rows are identified only by their position in the sort order.
(
    df.groupby("type")
    .agg({"AveragePrice": "mean"})
    .sort_values(by="AveragePrice", ascending=False)
    .reset_index()["AveragePrice"]
)

0    1.653999
1    1.158040
Name: AveragePrice, dtype: float64

In [89]:
# Number of rows for each avocado type.
(
    df.groupby(by="type")
    .aggregate({"type": "count"})
)

Unnamed: 0_level_0,type
type,Unnamed: 1_level_1
conventional,9126
organic,9123


In [90]:
# Highest average price per type, most expensive type first.
(
    df.groupby("type")
    .agg({"AveragePrice": "max"})
    .sort_values(by="AveragePrice", ascending=False)
)

Unnamed: 0_level_0,AveragePrice
type,Unnamed: 1_level_1
organic,3.25
conventional,2.22


In [91]:
# Lowest average price per type, sorted from the larger minimum to the smaller.
(
    df.groupby("type")
    .agg({"AveragePrice": "min"})
    .sort_values(by="AveragePrice", ascending=False)
)

Unnamed: 0_level_0,AveragePrice
type,Unnamed: 1_level_1
conventional,0.46
organic,0.44


In [92]:
# Several aggregations of one column at once; the result has two-level column labels.
df.groupby("type").agg(
    {
        "AveragePrice": ["mean", "max", "min"],
    }
)

Unnamed: 0_level_0,AveragePrice,AveragePrice,AveragePrice
Unnamed: 0_level_1,mean,max,min
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
conventional,1.15804,2.22,0.46
organic,1.653999,3.25,0.44


In [93]:
# Same aggregation, with the group key moved back from the index into a regular column.
(
    df.groupby("type")
    .agg({"AveragePrice": ["mean", "max", "min"]})
    .reset_index()
)

Unnamed: 0_level_0,type,AveragePrice,AveragePrice,AveragePrice
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max,min
0,conventional,1.15804,2.22,0.46
1,organic,1.653999,3.25,0.44


In [94]:
# Selecting with a list of columns keeps the result a DataFrame.
df.groupby("type")[["AveragePrice"]].agg("max")

Unnamed: 0_level_0,AveragePrice
type,Unnamed: 1_level_1
conventional,2.22
organic,3.25


In [95]:
# Selecting a single column yields a Series indexed by the group key.
df["AveragePrice"].groupby(df["type"]).max()

type
conventional    2.22
organic         3.25
Name: AveragePrice, dtype: float64

In [96]:
# df.groupby("type")["type"].count().reset_index().rename(columns={"type": "tipo"})

In [97]:
# .reset_index()
# .reset_index(drop=False) -> lo guardas como columns

In [98]:
# Use the year values as row labels (the `year` column itself is kept).
df.index = df["year"]

In [129]:
# Back to a plain 0..n-1 index; drop=True discards the old labels instead of saving them as a column.
df = df.reset_index(drop=True)

In [132]:
# Preview the frame without the row labelled 2. Returning a new frame instead
# of using inplace=True keeps this cell re-runnable (a second in-place drop of
# the same label raises KeyError) and leaves `df` intact for the cells below.
df.drop(index=2).head()

In [137]:
#df

In [100]:
# .loc  -> selects by LABEL (the value stored in the index)
# .iloc -> selects by integer POSITION
# The year index is built locally so this lookup does not depend on whether an
# earlier cell happened to leave `df.index` set to the year column.
df.set_index("year", drop=False).loc[2015]

Unnamed: 0_level_0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2015,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
2015,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2015,2015-12-13,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
2015,2015-12-06,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,conventional,2015,Albany
2015,2015-11-29,1.28,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015,2015-02-01,1.77,7210.19,1634.42,3012.44,0.00,2563.33,2563.33,0.00,0.0,organic,2015,WestTexNewMexico
2015,2015-01-25,1.63,7324.06,1934.46,3032.72,0.00,2356.88,2320.00,36.88,0.0,organic,2015,WestTexNewMexico
2015,2015-01-18,1.71,5508.20,1793.64,2078.72,0.00,1635.84,1620.00,15.84,0.0,organic,2015,WestTexNewMexico
2015,2015-01-11,1.69,6861.73,1822.28,2377.54,0.00,2661.91,2656.66,5.25,0.0,organic,2015,WestTexNewMexico


In [101]:
# Number of rows in the frame.
len(df)

18249

`agrupar por departamentos y su edad media`

`agrupar por departamentos y su edad máxima`

`agrupar por departamentos y su edad mínima`

`groupby: 2+`: Diferentes campos educativos y sus salarios medios en cada uno de los departamentos

In [102]:
# multiindex: https://pandas.pydata.org/docs/user_guide/advanced.html

`más campos`

#### agrupar y agregar

Otra forma de agrupar con sintaxis agg
https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate.html

- media
- suma
- recuento

Supongamos que queremos calcular la media de la columna data1 utilizando las etiquetas de key1. Hay varias formas de hacerlo. Una es acceder a data1 y llamar a groupby con la columna (una Serie) en key1.
Los datos (una Serie) se han agregado en función de la clave de grupo, lo que produce una nueva Serie que ahora está indexada por los valores únicos de la columna key1. El índice resultante tiene el nombre 'key1' porque la columna DataFrame `df['key1']` lo tenía.

### Tablas dinámicas

[Comparación entre tabla dinámica y tabla agrupada](https://towardsdatascience.com/una-comparacion-entre-tabla-agrupada-y-tabla-dinámica-en-pythons-pandas-module-527909e78d6b?source=userActivityShare-4c6d9a33c6-1674605913&_branch_match_id=1146490620103499300&_branch_referrer=H4sIAAAAAAAAA8soKSkottLXz8nMy9bLTU3JLM3VS87P1TfKCvCtKvOvCCpPAgAN4ht%2FIwAAAA%3D%3D)

```pd.pivot_table(df, values=None, index=None, columns=None, aggfunc='mean', ...)```

* Podemos crear una tabla dinámica de estilo hoja de cálculo como un DataFrame. Los niveles de la tabla dinámica se almacenarán en objetos MultiIndex (índices jerárquicos) en el índice y las columnas del DataFrame resultante.

Tabla dinámica: el departamento y sus edades medias

Tabla dinámica: el departamento y sus edades máximas

`tabla dinámica`: el departamento y sus edades mínimas

Tabla dinámica: el departamento y su edad media

Tabla dinámica de múltiples índices: Departamento y campo de educación y verificación de los valores salariales máximos de los empleados

### Diferencias entre la tabla dinámica y la función groupby

`groupby`: edad media del equipo por abreviatura del equipo y temporada.

`pivot_table`: edad media del equipo por abreviatura del equipo y temporada.

### Crosstab

* Calcular una tabulación cruzada simple de dos (o más) factores. De manera predeterminada, calcula una tabla de frecuencias de los factores a menos que se pase una matriz de valores y una función de agregación. Generalmente se utiliza para ver la frecuencia de dos variables cualitativas: cuántas veces el valor de una columna aparece en la otra.

```pd.crosstab(df[columna1], df[columna2])```

-------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- ----

## Hands-on

#### Import `business.csv` and respond to the following questions:
##### Revenue & Sales Analysis
	1.	Which country has generated the highest total sales?
	2.	Which city has the highest average purchase amount?
	3.	What is the total revenue for each product category?
	4.	Which country has the most transactions with negative amounts?
	5.	How many purchases were made without a discount, and how much revenue did they generate?
##### Customer Insights
	6.	Who is the top-spending customer overall?
	7.	Which customer has spent the most in the “Electronics” category?
	8.	How many customers have made repeat purchases?
	9.	Which customer has received the most discounts?
##### Review Sentiment & Correlations
	10.	Are negative reviews correlated with low purchase amounts?
	11.	Which country has the most positive vs. negative reviews?
	12.	Do customers who received discounts leave better reviews?
	13.	Are refunds (negative amounts) more common among customers who left bad reviews?

## Combinación de datos: mezcla de Dataframes

- https://realpython.com/pandas-merge-join-and-concat/
- https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

### Concatenación: dos cosas juntas
Unimos los dataframes a lo largo del eje 0, uno debajo del otro. Alineamos las columnas por etiqueta.

#### Concatenación en el eje 0 (filas)

#### Concatenación en el eje 1 (columnas)

![Uniones SQL](https://upload.wikimedia.org/wikipedia/commons/9/9d/SQL_Joins.svg)

### MERGE: columnas relacionadas

[Merge()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html) es más útil cuando desea fusionar filas que comparten datos.

In [103]:
from IPython.display import display_html 

In [104]:
# Toy frames for the merge examples, written row by row.
# df1 and df2 share the key columns key1/key2; df3 has the same keys but different payload columns.
df1 = pd.DataFrame(
    [
        ["K0", "K0", "A0", "B0"],
        ["K0", "K1", "A1", "B1"],
        ["K1", "K0", "A2", "B2"],
        ["K2", "K1", "A3", "B3"],
    ],
    columns=["key1", "key2", "A", "B"],
)

df2 = pd.DataFrame(
    [
        ["K0", "K0", "C0", "D0"],
        ["K1", "K0", "C1", "D1"],
        ["K1", "K0", "C2", "D2"],
        ["K2", "K0", "C3", "D3"],
    ],
    columns=["key1", "key2", "C", "D"],
)

df3 = pd.DataFrame(
    [
        ["K90", "K0", "C0", "D0"],
        ["K1", "K0", "C1", "D70"],
        ["K1", "K5", "C2", "D2"],
        ["K2", "K0", "C3", "D5"],
    ],
    columns=["key1", "key2", "C", "Y"],
)

In [105]:
# Show the two inputs next to each other: inline-styled tables with captions.
df1_styler = (
    df1.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Left table")
)
df2_styler = (
    df2.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Right table")
)
display_html(" ".join([df1_styler._repr_html_(), df2_styler._repr_html_()]), raw=True)

Unnamed: 0,key1,key2,A,B
0,K0,K0,A0,B0
1,K0,K1,A1,B1
2,K1,K0,A2,B2
3,K2,K1,A3,B3

Unnamed: 0,key1,key2,C,D
0,K0,K0,C0,D0
1,K1,K0,C1,D1
2,K1,K0,C2,D2
3,K2,K0,C3,D3


In [106]:
# With no arguments, merge uses how="inner": only rows whose key values
# appear in both frames survive.
result = df1.merge(df2)
result

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2


#### LEFT MERGE

In [107]:
# Left merge: every df1 row is kept; C and D stay empty where df2 has no matching keys.
left_merge = df1.merge(df2, how="left")

df1_styler = (
    df1.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Left table")
)
df2_styler = (
    df2.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Right table")
)
# NOTE: from here on `left_merge` is the captioned Styler, not the merged DataFrame.
left_merge = (
    left_merge.style
    .set_table_attributes("style='display:inline'")
    .set_caption("left_merge")
)

display_html(" ".join([df1_styler._repr_html_(), df2_styler._repr_html_()]), raw=True)

Unnamed: 0,key1,key2,A,B
0,K0,K0,A0,B0
1,K0,K1,A1,B1
2,K1,K0,A2,B2
3,K2,K1,A3,B3

Unnamed: 0,key1,key2,C,D
0,K0,K0,C0,D0
1,K1,K0,C1,D1
2,K1,K0,C2,D2
3,K2,K0,C3,D3


In [108]:
# `left_merge` was rebound to a captioned Styler in the previous cell; render its HTML.
display_html(left_merge._repr_html_(), raw=True)

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K0,K1,A1,B1,,
2,K1,K0,A2,B2,C1,D1
3,K1,K0,A2,B2,C2,D2
4,K2,K1,A3,B3,,


#### RIGHT MERGE

In [109]:
# Right merge: every df2 row is kept; A and B stay empty where df1 has no matching keys.
right_merge = df1.merge(df2, how="right")

df1_styler = (
    df1.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Left table")
)
df2_styler = (
    df2.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Right table")
)
# NOTE: from here on `right_merge` is the captioned Styler, not the merged DataFrame.
right_merge = (
    right_merge.style
    .set_table_attributes("style='display:inline'")
    .set_caption("right_merge")
)

display_html(" ".join([df1_styler._repr_html_(), df2_styler._repr_html_()]), raw=True)

Unnamed: 0,key1,key2,A,B
0,K0,K0,A0,B0
1,K0,K1,A1,B1
2,K1,K0,A2,B2
3,K2,K1,A3,B3

Unnamed: 0,key1,key2,C,D
0,K0,K0,C0,D0
1,K1,K0,C1,D1
2,K1,K0,C2,D2
3,K2,K0,C3,D3


In [110]:
# `right_merge` was rebound to a captioned Styler in the previous cell; render its HTML.
display_html(right_merge._repr_html_(), raw=True)

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2
3,K2,K0,,,C3,D3


#### Fusión INTERNA

In [111]:
# Inner merge: only the key combinations present in both frames are kept.
inner_merge = df1.merge(df2, how="inner")

df1_styler = (
    df1.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Left table")
)
df2_styler = (
    df2.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Right table")
)
# NOTE: from here on `inner_merge` is the captioned Styler, not the merged DataFrame.
inner_merge = (
    inner_merge.style
    .set_table_attributes("style='display:inline'")
    .set_caption("inner_merge")
)

display_html(" ".join([df1_styler._repr_html_(), df2_styler._repr_html_()]), raw=True)

Unnamed: 0,key1,key2,A,B
0,K0,K0,A0,B0
1,K0,K1,A1,B1
2,K1,K0,A2,B2
3,K2,K1,A3,B3

Unnamed: 0,key1,key2,C,D
0,K0,K0,C0,D0
1,K1,K0,C1,D1
2,K1,K0,C2,D2
3,K2,K0,C3,D3


In [112]:
# `inner_merge` was rebound to a captioned Styler in the previous cell; render its HTML.
display_html(inner_merge._repr_html_(), raw=True)

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2


#### OUTER MERGE

In [113]:
# Outer merge: rows from both frames are kept; the side with no match is left empty.
outer_merge = df1.merge(df2, how="outer")

df1_styler = (
    df1.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Left table")
)
df2_styler = (
    df2.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Right table")
)
# NOTE: from here on `outer_merge` is the captioned Styler, not the merged DataFrame.
outer_merge = (
    outer_merge.style
    .set_table_attributes("style='display:inline'")
    .set_caption("outer_merge")
)

display_html(" ".join([df1_styler._repr_html_(), df2_styler._repr_html_()]), raw=True)

Unnamed: 0,key1,key2,A,B
0,K0,K0,A0,B0
1,K0,K1,A1,B1
2,K1,K0,A2,B2
3,K2,K1,A3,B3

Unnamed: 0,key1,key2,C,D
0,K0,K0,C0,D0
1,K1,K0,C1,D1
2,K1,K0,C2,D2
3,K2,K0,C3,D3


In [114]:
# `outer_merge` was rebound to a captioned Styler in the previous cell; render its HTML.
display_html(outer_merge._repr_html_(), raw=True)

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K0,K1,A1,B1,,
2,K1,K0,A2,B2,C1,D1
3,K1,K0,A2,B2,C2,D2
4,K2,K0,,,C3,D3
5,K2,K1,A3,B3,,


### JOIN & CONCAT on different columns

In [115]:
# Two frames whose key columns have different names (lkey / rkey) but which
# both carry a `value` column; "foo" appears twice on each side.
df1_docs = pd.DataFrame(
    [["foo", 1], ["bar", 2], ["baz", 3], ["foo", 5]],
    columns=["lkey", "value"],
)
df2_docs = pd.DataFrame(
    [["foo", 5], ["bar", 6], ["baz", 7], ["foo", 8]],
    columns=["rkey", "value"],
)

In [116]:
# Show the two inputs next to each other.
df1_styler = (
    df1_docs.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Left table")
)
df2_styler = (
    df2_docs.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Right table")
)

display_html(" ".join([df1_styler._repr_html_(), df2_styler._repr_html_()]), raw=True)

Unnamed: 0,lkey,value
0,foo,1
1,bar,2
2,baz,3
3,foo,5

Unnamed: 0,rkey,value
0,foo,5
1,bar,6
2,baz,7
3,foo,8


#### Concatenación en dos columnas diferentes

In [117]:
# Place the frames next to each other; `keys` adds an outer column level
# naming the frame each column came from.
concat_docs = pd.concat(
    [df1_docs, df2_docs],
    axis="columns",
    keys=["1st table", "2nd table"],
)

df1_styler = (
    df1_docs.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Left table")
)
df2_styler = (
    df2_docs.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Right table")
)
# Despite the name, `merge_styler` wraps the concatenated frame here.
merge_styler = (
    concat_docs.style
    .set_table_attributes("style='display:inline'")
    .set_caption("concat_docs")
)

display_html(" ".join([df1_styler._repr_html_(), df2_styler._repr_html_()]), raw=True)

Unnamed: 0,lkey,value
0,foo,1
1,bar,2
2,baz,3
3,foo,5

Unnamed: 0,rkey,value
0,foo,5
1,bar,6
2,baz,7
3,foo,8


In [118]:
# `merge_styler` holds the Styler built from `concat_docs` in the previous cell.
display_html(merge_styler._repr_html_(), raw=True)

Unnamed: 0_level_0,1st table,1st table,2nd table,2nd table
Unnamed: 0_level_1,lkey,value,rkey,value
0,foo,1,foo,5
1,bar,2,bar,6
2,baz,3,baz,7
3,foo,5,foo,8


#### Merge en dos columnas diferentes

In [119]:
# Show the inputs once more before merging them on differently named key columns.
df1_styler = (
    df1_docs.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Left table")
)
df2_styler = (
    df2_docs.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Right table")
)

display_html(" ".join([df1_styler._repr_html_(), df2_styler._repr_html_()]), raw=True)

Unnamed: 0,lkey,value
0,foo,1
1,bar,2
2,baz,3
3,foo,5

Unnamed: 0,rkey,value
0,foo,5
1,bar,6
2,baz,7
3,foo,8


In [120]:
# Merge on differently named key columns; the suffixes disambiguate the two `value` columns.
# (The next cell recomputes `merge_docs` with differently spelled suffixes.)
merge_docs = pd.merge(
    df1_docs,
    df2_docs,
    left_on="lkey",
    right_on="rkey",
    suffixes=["_fromleft", "_fromright"],
)

In [121]:
# Merge lkey against rkey; each "foo" on the left pairs with each "foo" on the right.
merge_docs = pd.merge(
    df1_docs,
    df2_docs,
    left_on="lkey",
    right_on="rkey",
    suffixes=["_from_left", "_from_right"],
)

df1_styler = (
    df1_docs.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Left table")
)
df2_styler = (
    df2_docs.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Right table")
)
merge_styler = (
    merge_docs.style
    .set_table_attributes("style='display:inline'")
    .set_caption("merge_docs")
)

display_html(" ".join([df1_styler._repr_html_(), df2_styler._repr_html_()]), raw=True)

Unnamed: 0,lkey,value
0,foo,1
1,bar,2
2,baz,3
3,foo,5

Unnamed: 0,rkey,value
0,foo,5
1,bar,6
2,baz,7
3,foo,8


In [122]:
# `merge_styler` holds the Styler built from `merge_docs` in the previous cell.
display_html(merge_styler._repr_html_(), raw=True)

Unnamed: 0,lkey,value_from_left,rkey,value_from_right
0,foo,1,foo,5
1,foo,1,foo,8
2,bar,2,bar,6
3,baz,3,baz,7
4,foo,5,foo,5
5,foo,5,foo,8


In [123]:
# suffixes: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html

In [124]:
# concat: putting things together
# join/merge: putting things together THAT ARE RELATED
    # related info in the same place -> reduces redundancy

### Join: índice relacionado
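El join, a diferencia del merge, une los dataframes por su índice. Por defecto es un left join (`how="left"`): se conservan todas las filas del dataframe izquierdo y, donde el derecho no tenga esa etiqueta en el índice, las columnas añadidas quedan a NaN. En el ejemplo siguiente se usa `how="inner"`, que conserva solo las etiquetas presentes en ambos índices.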

In [125]:
import numpy as np
import pandas as pd   
from IPython.display import display_html 

In [126]:
# Frames for the join example: they are related through their index labels
# (K0 and K2 are shared; K1 exists only on the left, K3 only on the right).
left_df = pd.DataFrame(
    [["A0", "B0"], ["A1", "B1"], ["A2", "B2"]],
    index=["K0", "K1", "K2"],
    columns=["A", "B"],
)

right_df = pd.DataFrame(
    [["C0", "D0"], ["C2", "D2"], ["C3", "D3"]],
    index=["K0", "K2", "K3"],
    columns=["C", "D"],
)

In [127]:
# Show the two index-keyed inputs next to each other.
df1_styler = (
    left_df.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Left table")
)
df2_styler = (
    right_df.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Right table")
)

display_html("".join([df1_styler._repr_html_(), df2_styler._repr_html_()]), raw=True)

Unnamed: 0,A,B
K0,A0,B0
K1,A1,B1
K2,A2,B2

Unnamed: 0,C,D
K0,C0,D0
K2,C2,D2
K3,C3,D3


In [128]:
# Join on the index; how="inner" keeps only the labels present in both frames.
result = left_df.join(right_df, how="inner")

# Presentation only: render the inputs and the result on one line.
df1_styler = (
    left_df.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Left table")
)
df2_styler = (
    right_df.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Right table")
)
df1_df2_merged = (
    result.style
    .set_table_attributes("style='display:inline'")
    .set_caption("JOIN")
)

display_html(
    "".join(
        [
            df1_styler._repr_html_(),
            df2_styler._repr_html_(),
            df1_df2_merged._repr_html_(),
        ]
    ),
    raw=True,
)

Unnamed: 0,A,B
K0,A0,B0
K1,A1,B1
K2,A2,B2

Unnamed: 0,C,D
K0,C0,D0
K2,C2,D2
K3,C3,D3

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K2,A2,B2,C2,D2


<figure class="wp-block-table is-style-stripes"><table class="has-fixed-layout"><thead><tr><th><strong>Función de unión</strong></th><th class="has-text-align-center" data-align="center"><strong>join()</strong></th><th class="has-text-align-center" data-align="center"><strong>merge()</strong></th></tr></thead><tbody><tr><td>interior</td><td class="has-text-align-center" data-align="center">Sí</td><td class="has-text-align-center" data-align="center">Sí</td></tr><tr><td>izquierda</td><td class="has-text-align-center" data-align="center">Sí</td><td class="has-text-align-center" data-align="center">Sí</td></tr><tr><td>derecha</td><td class="has-text-align-center" data-align="center">Sí</td><td class="has-text-align-center" data-align="center">Sí</td></tr><tr><td>exterior</td><td class="has-text-align-center" data-align="center">Sí</td><td class="has-text-align-center" data-align="center">Sí</td></tr><tr><td>cruz</td><td class="has-text-align-center" data-align="center">Sí (pandas ≥ 1.2)</td><td class="has-text-align-center" data-align="center">Sí</td></tr><tr><td>Unión en índices</td><td class="has-text-align-center" data-align="center">Sí</td><td class="has-text-align-center" data-align="center">Sí</td></tr><tr><td>Unirse en columnas</td><td class="has-text-align-center" data-align="center">X</td><td class="has-text-align-center" data-align="center">Sí</td></tr><tr><td>A la izquierda en la columna, a la derecha en el índice</td><td class="has-text-align-center" data-align="center">Sí</td><td class="has-text-align-center" data-align="center">Sí</td></tr><tr><td>A la izquierda en el índice, a la derecha en la columna</td><td class="has-text-align-center" data-align="center">X</td><td class="has-text-align-center" data-align="center">Sí</td></tr></tbody></table></figure>

## Métodos habituales de Pandas
```python
df.head() # imprime la cabecera, por defecto 5 filas
df.tail() # muestra la cola, por defecto las últimas 5 filas
df.describe() # descripción estadística
df.info() # información del df
df.columns # muestra las columnas
df.index # muestra el índice
df.dtypes # muestra los tipos de datos de la columna
df.plot() # hace un gráfico
df.hist() # hace un histograma
df.col.value_counts() # cuenta los valores únicos de una columna
df.col.unique() # devuelve valores únicos de una columna
df.copy() # copia el df
df.drop() # elimina columnas o filas (axis=0,1)
df.dropna() # elimina nulos
df.fillna() # rellena nulos
df.shape # dimensiones del df
df.select_dtypes(include="number") # selecciona las columnas numéricas
df.rename() # renombrar columnas
df.col.str.replace() # reemplazar texto en una columna de cadenas
df.astype(dtype='float32') # cambiar el tipo de datos
df.iloc[] # localizar por posición
df.loc[] # localizar por etiqueta
df.transpose() # transpone el df
df.T
df.sample(n, frac) # muestra de df
df.col.sum() # suma de una columna
df.col.max() # máximo de una columna
df.col.min() # mínimo de una columna
df[col] # seleccionar columna
df.col
df.isnull() # valores nulos
df.isna()
df.notna() # valores no nulos
df.drop_duplicates() # eliminar duplicados
df.reset_index(inplace=True) # restablecer el índice y sobrescribir
```

## Materiales adicionales

* [¡Lea la documentación!](https://pandas.pydata.org/pandas-docs/stable/index.html)
* [Hoja de trucos](https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf)
* [Ejercicios para practicar](https://github.com/guipsamora/pandas_exercises)
* [Más información sobre fusión, concatenación y unión](https://realpython.com/pandas-merge-join-and-concat/#pandas-join-combining-data-on-a-column-or-index). Y [¡aún más!](https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html)