# Explorando informações do DataFrame

In [None]:
import pandas as pd
from sklearn.datasets import load_iris

# Load the bundled iris dataset and wrap its feature matrix in a DataFrame
# whose columns carry the dataset's own feature names.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [None]:
# Print a concise summary of the frame: index range, column names,
# non-null counts, dtypes and memory usage.
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB


In [None]:
# Peek at the first five rows (five is the pandas default, spelled out here).
iris_df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
# Peek at the last five rows (five is the pandas default, spelled out here).
iris_df.tail(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
149,5.9,3.0,5.1,1.8


# Lidando com dados ausentes (missing data)

In [None]:
import numpy as np

# A None among the integers makes NumPy fall back to the generic object dtype.
values_with_none = [2, None, 6, 8]
example1 = np.array(values_with_none)
example1

array([2, None, 6, 8], dtype=object)

In [None]:
# Summing an object array that contains None fails with a TypeError.
# The error is the point of this cell, so catch and display it: an uncaught
# exception would stop a "Restart & Run All" at this cell.
try:
    example1_total = example1.sum()
except TypeError as err:
    print(f"TypeError: {err}")
else:
    print(example1_total)

TypeError: ignored

In [None]:
# With np.nan the array stays numeric (float), but NaN propagates through
# every plain aggregation.
example2 = np.array([2, np.nan, 6, 8])
(np.sum(example2), np.min(example2), np.max(example2))

(nan, nan, nan)

In [None]:
# Only NaN and None count as missing; 0 and the empty string are real values.
example3 = pd.Series([0, np.nan, '', None])
example3.isna()

0    False
1     True
2    False
3     True
dtype: bool

In [None]:
# Show the Series as-is; the mixed values leave it with the generic object dtype.
example3

0       0
1     NaN
2        
3    None
dtype: object

In [None]:
# Keep only the entries that are not missing (NaN / None are dropped).
present_mask = example3.notna()
example3 = example3[present_mask]
example3

0    0
2     
dtype: object

In [None]:
# 3x3 frame with one missing value in each of the first two columns.
example4 = pd.DataFrame(
    data=[
        [1, np.nan, 7],
        [2, 5, 8],
        [np.nan, 6, 9],
    ]
)
example4

Unnamed: 0,0,1,2
0,1.0,,7
1,2.0,5.0,8
2,,6.0,9


In [None]:
# Small frame with repeated rows: (A, 1) and (B, 3) each appear twice.
example6 = pd.DataFrame(
    dict(
        letters=['A', 'B', 'A', 'B', 'B'],
        numbers=[1, 2, 1, 3, 3],
    )
)
example6

Unnamed: 0,letters,numbers
0,A,1
1,B,2
2,A,1
3,B,3
4,B,3


In [None]:
# Flag every row that repeats an earlier one; the first occurrence stays False.
example6.duplicated(keep='first')

0    False
1    False
2     True
3    False
4     True
dtype: bool

Merge

In [None]:
# Employees and the group each one belongs to, one record per row.
df1 = pd.DataFrame(
    [
        ('Gary', 'Accounting'),
        ('Stu', 'Marketing'),
        ('Mary', 'Marketing'),
        ('Sue', 'HR'),
    ],
    columns=['employee', 'group'],
)
df1

Unnamed: 0,employee,group
0,Gary,Accounting
1,Stu,Marketing
2,Mary,Marketing
3,Sue,HR


In [None]:
# Hire year per employee; the rows are in a different order than in df1.
df2 = pd.DataFrame(
    dict(
        employee=['Mary', 'Stu', 'Gary', 'Sue'],
        hire_date=[2008, 2012, 2017, 2018],
    )
)
df2

Unnamed: 0,employee,hire_date
0,Mary,2008
1,Stu,2012
2,Gary,2017
3,Sue,2018


In [None]:
# Join the two frames on their shared "employee" key column.
df3 = pd.merge(df1, df2, on="employee")
df3

Unnamed: 0,employee,group,hire_date
0,Gary,Accounting,2017
1,Stu,Marketing,2012
2,Mary,Marketing,2008
3,Sue,HR,2018


In [None]:
# With no key given, merge joins on the columns the two frames have in common.
df4 = df1.merge(df2)
df4

Unnamed: 0,employee,group,hire_date
0,Gary,Accounting,2017
1,Stu,Marketing,2012
2,Mary,Marketing,2008
3,Sue,HR,2018


Muitos para um

In [None]:
# One supervisor per group.
df5 = pd.DataFrame(
    [
        ('Accounting', 'Carlos'),
        ('Marketing', 'Giada'),
        ('HR', 'Stephanie'),
    ],
    columns=['group', 'supervisor'],
)
df5

Unnamed: 0,group,supervisor
0,Accounting,Carlos
1,Marketing,Giada
2,HR,Stephanie


In [None]:
# Attach each employee's supervisor by joining on the shared "group" column.
df6 = pd.merge(df3, df5, on="group")
df6

Unnamed: 0,employee,group,hire_date,supervisor
0,Gary,Accounting,2017,Carlos
1,Stu,Marketing,2012,Giada
2,Mary,Marketing,2008,Giada
3,Sue,HR,2018,Stephanie


Um para muitos

In [None]:
# Two core skills per group, so every group key appears twice.
df7 = pd.DataFrame(
    [
        ('Accounting', 'math'),
        ('Accounting', 'spreadsheets'),
        ('Marketing', 'writing'),
        ('Marketing', 'communication'),
        ('HR', 'spreadsheets'),
        ('HR', 'organization'),
    ],
    columns=['group', 'core_skills'],
)
df7

Unnamed: 0,group,core_skills
0,Accounting,math
1,Accounting,spreadsheets
2,Marketing,writing
3,Marketing,communication
4,HR,spreadsheets
5,HR,organization


Muitos para muitos

In [None]:
# "group" repeats on both sides, so each employee row is paired with every
# skill row of the same group.
df8 = pd.merge(df3, df7, on="group")
df8

Unnamed: 0,employee,group,hire_date,core_skills
0,Gary,Accounting,2017,math
1,Gary,Accounting,2017,spreadsheets
2,Stu,Marketing,2012,writing
3,Stu,Marketing,2012,communication
4,Mary,Marketing,2008,writing
5,Mary,Marketing,2008,communication
6,Sue,HR,2018,spreadsheets
7,Sue,HR,2018,organization


Concatenação em NumPy

In [None]:
# Three flat lists joined end to end into a single 1-D array.
x, y, z = [1, 2, 3], [4, 5, 6], [7, 8, 9]
np.concatenate((x, y, z))

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [None]:
# axis=1 places the second copy beside the first (more columns).
x = [[1, 2], [3, 4]]
np.concatenate((x, x), axis=1)

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

In [None]:
# The default axis (0) stacks the second copy below the first (more rows).
x = [[1, 2], [3, 4]]
np.concatenate((x, x), axis=0)

array([[1, 2],
       [3, 4],
       [1, 2],
       [3, 4]])

In [None]:
# Series with explicit integer labels 1..3 (built from a label -> value mapping).
ser1 = pd.Series({1: 'a', 2: 'b', 3: 'c'})
ser1

1    a
2    b
3    c
dtype: object

In [None]:
# Second Series whose labels 4..6 do not overlap with ser1's.
ser2 = pd.Series({4: 'd', 5: 'e', 6: 'f'})
ser2

4    d
5    e
6    f
dtype: object

In [None]:
# Stack the two Series one after the other, keeping their original labels.
pd.concat([ser1, ser2], axis=0)

1    a
2    b
3    c
4    d
5    e
6    f
dtype: object

In [None]:
# Tiny 2x2 frame used in the concatenation examples below.
df9 = pd.DataFrame(
    [
        ['a', 'b'],
        ['c', 'd'],
    ],
    columns=['A', 'B'],
)
df9

Unnamed: 0,A,B
0,a,b
1,c,d


In [None]:
# Row-wise concatenation keeps each frame's own index, so labels 0 and 1 repeat.
pd.concat([df9, df9], axis=0)

Unnamed: 0,A,B
0,a,b
1,c,d
0,a,b
1,c,d


Evitando índices duplicados

In [None]:
# ignore_index=True discards the original labels and renumbers the rows 0..n-1.
pd.concat([df9, df9], axis=0, ignore_index=True)

Unnamed: 0,A,B
0,a,b
1,c,d
2,a,b
3,c,d


Concatenar na horizontal

In [None]:
# Column-wise concatenation: the frames are placed side by side, aligned on the index.
pd.concat([df9, df9], axis="columns")

Unnamed: 0,A,B,A.1,B.1
0,a,b,a,b
1,c,d,c,d


In [None]:
# Frame with columns A, B, C; it shares only B and C with df11.
df10 = pd.DataFrame(
    [
        ['a', 'b', 'c'],
        ['d', 'e', 'f'],
    ],
    columns=['A', 'B', 'C'],
)
df10

Unnamed: 0,A,B,C
0,a,b,c
1,d,e,f


In [None]:
# Frame with columns B, C, D; it shares only B and C with df10.
df11 = pd.DataFrame(
    [
        ['u', 'v', 'w'],
        ['x', 'y', 'z'],
    ],
    columns=['B', 'C', 'D'],
)
df11

Unnamed: 0,B,C,D
0,u,v,w
1,x,y,z


In [None]:
# Outer join (the default): keep the union of columns; cells a frame lacks become NaN.
pd.concat([df10, df11], join='outer')

Unnamed: 0,A,B,C,D
0,a,b,c,
1,d,e,f,
0,,u,v,w
1,,x,y,z


In [None]:
# Inner join: keep only the columns present in both frames (B and C).
pd.concat([df10, df11], axis=0, join='inner')

Unnamed: 0,B,C
0,b,c
1,e,f
0,u,v
1,x,y


Estatísticas exploratórias e visualização

In [None]:
# `load_boston` was removed from scikit-learn in version 1.2, so importing it
# raises ImportError on current releases. Use the loader when it still exists;
# otherwise rebuild the same arrays from the original StatLib file, following
# the recipe given in scikit-learn's removal notice (needs network access).
try:
    from sklearn.datasets import load_boston
    boston_dataset = load_boston()
except ImportError:
    from sklearn.utils import Bunch

    BOSTON_URL = "http://lib.stat.cmu.edu/datasets/boston"
    # The file has a 22-line preamble and stores each record on two physical
    # lines: 11 values on the first, then 2 more features plus the target.
    boston_raw = pd.read_csv(BOSTON_URL, sep=r"\s+", skiprows=22, header=None)
    boston_dataset = Bunch(
        data=np.hstack([boston_raw.values[::2, :], boston_raw.values[1::2, :2]]),
        target=boston_raw.values[1::2, 2],
        feature_names=np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ]),
    )

# Feature matrix as a DataFrame, with the target (median home value) appended
# as the MEDV column.
df = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
df['MEDV'] = boston_dataset.target

ImportError: ignored