# Pandas
Panel - Data
- Load
- Prepare  
- Model
- Manipulate
- Analyze

### DataFrames
Are Two-dimensional tagged data structures

Has 3 components: 
- data
- index (Rows)
- columns

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = np.array([['', "col1", "col2"], ["row1",11,22],["row2",33,44]])

In [3]:
df = pd.DataFrame(data=data[1:,1:], index=data[1:,0], columns=data[0,1:])

In [4]:
print(df)
print(df.shape)

     col1 col2
row1   11   22
row2   33   44
(2, 2)


In [5]:
df2 = pd.DataFrame(np.array([[1,2,3],[4,5,6],[7,8,9]]))
print(df2)
print(df2.shape)

   0  1  2
0  1  2  3
1  4  5  6
2  7  8  9
(3, 3)


In [6]:
series = pd.Series({"Argentina": "Buenos Aires", "Chile": "Santiago de Chile", "Colombia": "Bogotá", "Peru":"Lima", "Venezuela":"Caracas", "Mexico":"Mexico D.F."})

In [7]:
print(series)
print(series.shape)

Argentina         Buenos Aires
Chile        Santiago de Chile
Colombia                Bogotá
Peru                      Lima
Venezuela              Caracas
Mexico             Mexico D.F.
dtype: object
(6,)


In [8]:
series2 = pd.Series({"Argentina": ["Buenos Aires", "Rosario", "La Plata"], "Chile": "Santiago de Chile", "Colombia": ["Bogotá", "Medellin", "Cali"], "Peru":"Lima", "Venezuela":"Caracas", "Mexico":"Mexico D.F."})

In [9]:
print(series2)
print(series2.shape)

Argentina    [Buenos Aires, Rosario, La Plata]
Chile                        Santiago de Chile
Colombia              [Bogotá, Medellin, Cali]
Peru                                      Lima
Venezuela                              Caracas
Mexico                             Mexico D.F.
dtype: object
(6,)


In [10]:
df3 = pd.DataFrame(np.array([[1,2,3,4,5], [6,7,8,9,10],[11,12,13,14,15],[16,17,18,19,20],[21,22,23,24,25]]))

In [11]:
print(len(df3.index))
print(len(df3.columns))

5
5


In [12]:
df3.describe()

Unnamed: 0,0,1,2,3,4
count,5.0,5.0,5.0,5.0,5.0
mean,11.0,12.0,13.0,14.0,15.0
std,7.905694,7.905694,7.905694,7.905694,7.905694
min,1.0,2.0,3.0,4.0,5.0
25%,6.0,7.0,8.0,9.0,10.0
50%,11.0,12.0,13.0,14.0,15.0
75%,16.0,17.0,18.0,19.0,20.0
max,21.0,22.0,23.0,24.0,25.0


In [13]:
df4 = pd.DataFrame(np.random.randint(0,50,size=(100,5)))

In [14]:
df4.describe()

Unnamed: 0,0,1,2,3,4
count,100.0,100.0,100.0,100.0,100.0
mean,24.88,22.62,24.07,25.72,24.27
std,14.637216,14.099774,13.687165,15.43581,14.967881
min,0.0,0.0,0.0,0.0,0.0
25%,11.75,11.0,13.0,10.75,10.0
50%,26.0,21.0,24.5,27.5,26.0
75%,38.25,33.0,34.0,39.0,37.25
max,48.0,49.0,49.0,49.0,49.0


In [15]:
df4.corr()

Unnamed: 0,0,1,2,3,4
0,1.0,-0.00267,0.00574,-0.129309,0.1262
1,-0.00267,1.0,-0.032155,0.111589,-0.012575
2,0.00574,-0.032155,1.0,0.063538,-0.091751
3,-0.129309,0.111589,0.063538,1.0,-0.157191
4,0.1262,-0.012575,-0.091751,-0.157191,1.0


In [16]:
df4.count() # Count of not null elements

0    100
1    100
2    100
3    100
4    100
dtype: int64

In [17]:
df4.max() # Max Value

0    48
1    49
2    49
3    49
4    49
dtype: int32

In [18]:
df4.min() # Min value

0    0
1    0
2    0
3    0
4    0
dtype: int32

In [19]:
df4.mean() # Mean of all data

0    24.88
1    22.62
2    24.07
3    25.72
4    24.27
dtype: float64

In [20]:
df4.std() # Standard Deviation

0    14.637216
1    14.099774
2    13.687165
3    15.435810
4    14.967881
dtype: float64

In [21]:
df4.median() # Middle Value

0    26.0
1    21.0
2    24.5
3    27.5
4    26.0
dtype: float64

In [22]:
df4[0] # Column

0     21
1      5
2     31
3      2
4     42
      ..
95    31
96    48
97    36
98    17
99    41
Name: 0, Length: 100, dtype: int32

In [23]:
df4[[0,2]] # Two Columns

Unnamed: 0,0,2
0,21,32
1,5,20
2,31,3
3,2,39
4,42,23
...,...,...
95,31,24
96,48,31
97,36,13
98,17,32


In [24]:
df4.iloc[0][2] # same as M[0,2] Row 0 column 2

32

In [25]:
df4.loc[0] # Row 0 all columns

0    21
1    49
2    32
3    38
4    29
Name: 0, dtype: int32

In [26]:
df4.iloc[0,:] # Row 0 all 

0    21
1    49
2    32
3    38
4    29
Name: 0, dtype: int32

In [27]:
df4.iloc[0,:] = [None,None,None,None,None]

In [28]:
df4.isnull()

Unnamed: 0,0,1,2,3,4
0,True,True,True,True,True
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
95,False,False,False,False,False
96,False,False,False,False,False
97,False,False,False,False,False
98,False,False,False,False,False


In [29]:
df4.isnull().sum()

0    1
1    1
2    1
3    1
4    1
dtype: int64

In [30]:
# pd.dropna() # Row

In [31]:
# df4.dropna(axis=1) # Columns

In [32]:
df4.fillna(0)

Unnamed: 0,0,1,2,3,4
0,0.0,0.0,0.0,0.0,0.0
1,5.0,25.0,20.0,5.0,40.0
2,31.0,3.0,3.0,19.0,45.0
3,2.0,36.0,39.0,21.0,10.0
4,42.0,26.0,23.0,1.0,37.0
...,...,...,...,...,...
95,31.0,5.0,24.0,24.0,31.0
96,48.0,13.0,31.0,26.0,24.0
97,36.0,33.0,13.0,47.0,14.0
98,17.0,43.0,32.0,21.0,41.0
