# Pandas Tutorial 2

In [1]:
import pandas as pd
import numpy as np

# to print the 'df' in a table format
from IPython.display import display

## Incomplete Data

In [2]:
# Build a small frame in which column 'B' is missing two of its three values
dummy_data = {
    'A': [1, 2, 3],
    'B': [4, np.nan, np.nan],
    'C': [7, 8, 9],
    'D': [10, 11, 12],
}
df2 = pd.DataFrame(dummy_data)
display(df2)

# dropna() works on rows by default: any row containing a null is removed
df_new = df2.dropna(axis='index')
display(df_new)

# With axis='columns', any column containing a null is removed instead
df_new = df2.dropna(axis='columns')
display(df_new)

# thresh is the minimum number of NON-null values needed to survive:
# here a column is kept only if it has at least 2 non-null entries
df_new = df2.dropna(axis='columns', thresh=2)
display(df_new)

# fillna() replaces every null with the given value
df_new = df2.fillna('NEW')
display(df_new)

# The fill value can also be computed, e.g. with mean(), sum(), max(), min();
# here every null is replaced by the mean of column 'C'
df_new = df2.fillna(df2['C'].mean())
display(df_new)

Unnamed: 0,A,B,C,D
0,1,4.0,7,10
1,2,,8,11
2,3,,9,12


Unnamed: 0,A,B,C,D
0,1,4.0,7,10


Unnamed: 0,A,C,D
0,1,7,10
1,2,8,11
2,3,9,12


Unnamed: 0,A,C,D
0,1,7,10
1,2,8,11
2,3,9,12


Unnamed: 0,A,B,C,D
0,1,4.0,7,10
1,2,NEW,8,11
2,3,NEW,9,12


Unnamed: 0,A,B,C,D
0,1,4.0,7,10
1,2,8.0,8,11
2,3,8.0,9,12


## groupby

In [3]:
# Column-oriented sample data: one list per column, one position per employee
data = dict(
    Company=['Apple', 'Google', 'Meta', 'Apple', 'Google', 'Meta'],
    Employees=['John', 'James', 'Dave', 'Hari', 'Ram', 'Sam'],
    Salary=[200, 220, 100, 120, 150, 180],
)

# Each dict key becomes a column of the DataFrame
df1 = pd.DataFrame(data=data)
display(df1)

Unnamed: 0,Company,Employees,Salary
0,Apple,John,200
1,Google,James,220
2,Meta,Dave,100
3,Apple,Hari,120
4,Google,Ram,150
5,Meta,Sam,180


In [4]:
# groupby() returns a DataFrameGroupBy object; printing it shows only the
# object's repr (its type and memory address), not any grouped data
company = df1.groupby('Company')
print(company, '\n')

# Aggregations such as sum, mean, max, min and count are computed per group.
# Each column is aggregated on its own: sum() concatenates the employee names,
# and the max() row pairs the alphabetically last name with the largest salary,
# which need not belong to the same employee.
display(
    company.count(),
    company.sum(),
    company.max(),
)

# describe() reports summary statistics of the numeric column for each group
display(company.describe())

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000020226CF9250> 



Unnamed: 0_level_0,Employees,Salary
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
Apple,2,2
Google,2,2
Meta,2,2


Unnamed: 0_level_0,Employees,Salary
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
Apple,JohnHari,320
Google,JamesRam,370
Meta,DaveSam,280


Unnamed: 0_level_0,Employees,Salary
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
Apple,John,200
Google,Ram,220
Meta,Sam,180


Unnamed: 0_level_0,Salary,Salary,Salary,Salary,Salary,Salary,Salary,Salary
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Apple,2.0,160.0,56.568542,120.0,140.0,160.0,180.0,200.0
Google,2.0,185.0,49.497475,150.0,167.5,185.0,202.5,220.0
Meta,2.0,140.0,56.568542,100.0,120.0,140.0,160.0,180.0


## apply

In [5]:
# Create Function
def times1000(x):
    """Return ``x`` multiplied by 1000 as a comma-separated string.

    The ``,d`` format spec is an integer format, so ``x`` is expected to be an
    integer; for example ``times1000(200)`` gives ``'200,000'``.
    """
    scaled = x * 1000
    return f"{scaled:,d}"

# Apply the named function to every salary. apply() returns a new Series and
# leaves df1 untouched; converting it to a DataFrame displays it as a table.
display(df1['Salary'].apply(times1000).to_frame())

# Same transformation with an inline lambda, this time written back to df1.
# NOTE: 'Salary' now holds formatted strings instead of numbers, so this cell
# cannot be re-run without first re-creating df1.
df1['Salary'] = df1['Salary'].apply(lambda salary: format(salary * 1000, ",d"))
df1

Unnamed: 0,Salary
0,200000
1,220000
2,100000
3,120000
4,150000
5,180000


Unnamed: 0,Company,Employees,Salary
0,Apple,John,200000
1,Google,James,220000
2,Meta,Dave,100000
3,Apple,Hari,120000
4,Google,Ram,150000
5,Meta,Sam,180000


## Sorting


In [6]:
def salary_as_number(column):
    """Sort key: turn comma-formatted salaries (e.g. '200,000') into numbers.

    The 'Salary' column holds strings once it has been formatted with
    format(..., ",d"). Strings are compared character by character, so
    '90,000' would sort after '220,000'. Parsing the values back to numbers
    makes the ordering numeric. A column that is already numeric keeps its
    values.
    """
    return pd.to_numeric(column.astype(str).str.replace(',', '', regex=False))


# Sort salary from lowest to highest (numerically, not alphabetically)
display(df1.sort_values('Salary', key=salary_as_number))

# Sort salary from highest to lowest
display(df1.sort_values('Salary', ascending=False, key=salary_as_number))

# To make the change permanent, use inplace=True
# (equivalent to: df1 = df1.sort_values('Employees'))
df1.sort_values('Employees', inplace=True)
display(df1)

Unnamed: 0,Company,Employees,Salary
2,Meta,Dave,100000
3,Apple,Hari,120000
4,Google,Ram,150000
5,Meta,Sam,180000
0,Apple,John,200000
1,Google,James,220000


Unnamed: 0,Company,Employees,Salary
1,Google,James,220000
0,Apple,John,200000
5,Meta,Sam,180000
4,Google,Ram,150000
3,Apple,Hari,120000
2,Meta,Dave,100000


Unnamed: 0,Company,Employees,Salary
2,Meta,Dave,100000
3,Apple,Hari,120000
1,Google,James,220000
0,Apple,John,200000
4,Google,Ram,150000
5,Meta,Sam,180000
