## Missing data in Pandas

In pandas, missing data points are automatically represented as <br>
NaN (null) values.

In [3]:
import numpy as np
import pandas as pd

In [5]:
# Toy columns with deliberately missing entries; np.nan is NumPy's missing-value marker.
d = {'A': [1, 2, np.nan], 'B': [5, np.nan, np.nan], 'C': [1, 2, 3]}

In [6]:
# Each dict key becomes a column; the default integer index (0, 1, 2) is used.
df = pd.DataFrame(data=d)

In [7]:
# Render the frame; the np.nan entries show up as missing values in columns A and B.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [8]:
# The defaults (axis=0, how='any') drop every row that contains at least one NaN.
# Check the docstring for the remaining options (thresh, subset, ...).
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [12]:
# thresh=2 keeps only the rows that have at least two non-NaN values.
df.dropna(axis=0, thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [15]:
# Dropping along the column axis: `subset` then names the ROW labels to inspect,
# so any column with a NaN in the row labelled 2 is removed (only C survives).
df.dropna(axis='columns', subset=[2])

Unnamed: 0,C
0,1
1,2
2,3


fillna() - replace missing values

In [16]:
# Replace every missing value in the frame with the placeholder string 'FILL'.
df.fillna(value='FILL')

Unnamed: 0,A,B,C
0,1,5,1
1,2,FILL,2
2,FILL,FILL,3


In [17]:
# Impute column A's missing entry with the column mean; mean() skips NaN,
# so the fill value is (1 + 2) / 2 = 1.5.
df['A'].fillna(df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

## GroupBy
GroupBy lets you group rows together based on a column and apply <br>
an aggregate function (one that takes many values and returns a single value).

In [20]:
# Toy sales records: two people per company.
# No backslash continuations are needed inside the braces.
data = {
    'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    'Person': ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    'Sales': [200, 120, 340, 124, 243, 350],
}

In [33]:
# Turn the dict of columns into a DataFrame and label its index "Sno".
# NOTE(review): this rebinds `data` from the dict to the DataFrame.
data = pd.DataFrame(data)
data.index.name = 'Sno'
data

Unnamed: 0_level_0,Company,Person,Sales
Sno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


In [34]:
# groupby() on its own only returns a DataFrameGroupBy object;
# call an aggregation method on it to get actual numbers.
data.groupby(by='Company')

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x00000207C1D9BAC8>

In [37]:
byComp = data.groupby('Company')
# numeric_only=True restricts the mean to numeric columns. Without it, recent
# pandas raises a TypeError on the string column 'Person' instead of skipping it.
byComp.mean(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,296.5
GOOG,160.0
MSFT,232.0


In [43]:
# The aggregated result is an ordinary DataFrame indexed by Company.
# numeric_only=True keeps the string column 'Person' out of the sum
# (recent pandas would otherwise concatenate the names).
byComp.sum(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,593
GOOG,320
MSFT,464


In [40]:
# Pick one company's totals out of the aggregated frame by its index label.
# numeric_only=True keeps the result to the numeric Sales column.
byComp.sum(numeric_only=True).loc['FB']

Sales    593
Name: FB, dtype: int64

In [41]:
# Or as a single chained expression, without the intermediate GroupBy variable.
# numeric_only=True keeps the result to the numeric Sales column.
data.groupby('Company').sum(numeric_only=True).loc['MSFT']

Sales    464
Name: MSFT, dtype: int64

In [44]:
# count() tallies the entries per group for every column, numeric or not.
data.groupby(by='Company').count()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,2,2
GOOG,2,2
MSFT,2,2


In [53]:
# max() is computed per column independently: Person is the alphabetically
# last name in each group, NOT the person who made the highest sale
# (e.g. MSFT shows Vanessa next to 340, which was Amy's figure).
data.groupby(by='Company').max()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,Sarah,350
GOOG,Sam,200
MSFT,Vanessa,340


In [51]:
# Summary statistics (count, mean, std, min, quartiles, max) of Sales for each company.
byComp.describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FB,2.0,296.5,75.660426,243.0,269.75,296.5,323.25,350.0
GOOG,2.0,160.0,56.568542,120.0,140.0,160.0,180.0,200.0
MSFT,2.0,232.0,152.735065,124.0,178.0,232.0,286.0,340.0


In [55]:
# Same summary, transposed: one column per company, one row per statistic.
byComp.describe().T

Unnamed: 0,Company,FB,GOOG,MSFT
Sales,count,2.0,2.0,2.0
Sales,mean,296.5,160.0,232.0
Sales,std,75.660426,56.568542,152.735065
Sales,min,243.0,120.0,124.0
Sales,25%,269.75,140.0,178.0
Sales,50%,296.5,160.0,232.0
Sales,75%,323.25,180.0,286.0
Sales,max,350.0,200.0,340.0


## Merging Joining and Concatenating

There are three main ways of combining DataFrames:
    <ul> <li>Merging</li><li>Joining</li><li>Concatenating</li>
    </ul>

In [60]:
# First of three frames with identical columns and non-overlapping row labels (0-2).
df1 = pd.DataFrame(
    {
        'A': ['A0', 'A1', 'A2'],
        'B': ['B0', 'B1', 'B2'],
        'C': ['C0', 'C1', 'C2'],
    },
    index=[0, 1, 2],
)

In [61]:
# Second frame: same columns, row labels 3-5.
df2 = pd.DataFrame(
    {
        'A': ['A3', 'A4', 'A5'],
        'B': ['B3', 'B4', 'B5'],
        'C': ['C3', 'C4', 'C5'],
    },
    index=[3, 4, 5],
)

In [62]:
# Third frame: same columns, row labels 6-8.
df3 = pd.DataFrame(
    {
        'A': ['A6', 'A7', 'A8'],
        'B': ['B6', 'B7', 'B8'],
        'C': ['C6', 'C7', 'C8'],
    },
    index=[6, 7, 8],
)

In [63]:
# Inspect the first frame (row labels 0-2).
df1

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [64]:
# Inspect the second frame (row labels 3-5).
df2

Unnamed: 0,A,B,C
3,A3,B3,C3
4,A4,B4,C4
5,A5,B5,C5


In [65]:
# Inspect the third frame (row labels 6-8).
df3

Unnamed: 0,A,B,C
6,A6,B6,C6
7,A7,B7,C7
8,A8,B8,C8


concat method

In [66]:
# concat takes a list of DataFrames; axis=0 (the default) stacks them row-wise.
pd.concat([df1, df2, df3], axis=0)

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
3,A3,B3,C3
4,A4,B4,C4
5,A5,B5,C5
6,A6,B6,C6
7,A7,B7,C7
8,A8,B8,C8


In [67]:
# concat stacks rows by default (axis=0); ask for the column axis to place the
# frames side by side instead. The three indexes do not overlap, so every cell
# without a matching row label is filled with NaN.
pd.concat([df1, df2, df3], axis='columns')

Unnamed: 0,A,B,C,A.1,B.1,C.1,A.2,B.2,C.2
0,A0,B0,C0,,,,,,
1,A1,B1,C1,,,,,,
2,A2,B2,C2,,,,,,
3,,,,A3,B3,C3,,,
4,,,,A4,B4,C4,,,
5,,,,A5,B5,C5,,,
6,,,,,,,A6,B6,C6
7,,,,,,,A7,B7,C7
8,,,,,,,A8,B8,C8


Merging

In [68]:
# Left-hand table for the merge demo: a shared 'key' column plus value columns A and B.
left = pd.DataFrame(
    {
        'key': ['K0', 'K1', 'K2', 'K3'],
        'A': [1, 2, 3, 4],
        'B': [5, 6, 7, 8],
    }
)

In [69]:
# Right-hand table for the merge demo: the same 'key' values plus value columns C and D.
right = pd.DataFrame(
    {
        'key': ['K0', 'K1', 'K2', 'K3'],
        'C': [11, 12, 13, 14],
        'D': [15, 16, 17, 18],
    }
)

In [70]:
# Inspect the left table (key, A, B).
left

Unnamed: 0,key,A,B
0,K0,1,5
1,K1,2,6
2,K2,3,7
3,K3,4,8


In [71]:
# Inspect the right table (key, C, D).
right

Unnamed: 0,key,C,D
0,K0,11,15
1,K1,12,16
2,K2,13,17
3,K3,14,18


In [73]:
# Inner join on the shared 'key' column: only keys present in both tables are kept
# (here all four match), and the value columns of both tables sit side by side.
left.merge(right, how='inner', on='key')

Unnamed: 0,key,A,B,C,D
0,K0,1,5,11,15
1,K1,2,6,12,16
2,K2,3,7,13,17
3,K3,4,8,14,18
