In [2]:
import pandas as pd
from collections import OrderedDict
from datetime import datetime, date, timedelta

In [61]:
# Build the demo DataFrame. The raw values are written column by column and
# then transposed into one tuple per row before being handed to pandas.
columns = ['name', 'year', 'reports']
values_by_columns = [
    ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    [2012, 2012, 2013, 2014, 2014],
    [4, 24, 31, 2, 3],
]
# zip(*...) groups the i-th entry of every column together -> list of row tuples
values_by_rows = [row for row in zip(*values_by_columns)]
print(values_by_rows)

# one row label per tuple in values_by_rows
index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma']
df = pd.DataFrame(values_by_rows, index=index, columns=columns)

[('Jason', 2012, 4), ('Molly', 2012, 24), ('Tina', 2013, 31), ('Jake', 2014, 2), ('Amy', 2014, 3)]


In [None]:
# Data-cleaning reminder: when ast.literal_eval is the right tool.
import ast

# Kept as explicit line fragments so the exact whitespace of the printed note
# (including the space after "user.") stays visible in the source.
literal_eval_note = (
    "\n"
    "When to use it.\n"
    "ast.literal_eval(input()) would be useful if you expected a list (or something similar) by the user. \n"
    "For example '[1,2]' would be converted to [1,2].\n"
)
print(literal_eval_note)

In [21]:
# index -> column: reset_index() moves the (unnamed) row labels into a regular
# column called 'index', which is then renamed to 'lastname'.
df1 = df.reset_index().rename(columns={'index': 'lastname'})

# column -> index: promote 'lastname' back to the row index.
# drop=True is the default, so 'lastname' no longer exists as a column afterwards
# (pass drop=False to keep it both as the index and as a column).
# No rename is needed here: after set_index there is no 'index' column left.
df2 = df1.set_index('lastname')

# drop index: discard the row labels and fall back to the default 0..n-1 index
df3 = df.reset_index(drop=True)

In [62]:
# NOTE: this cell mutates df in place (new rows and new columns), so it is meant
# to run once after the cell that creates df; re-run from that cell to start over.

# insert a row by label: assigning to a new .loc label appends the row
df.loc['April'] = ['Wang', 2013, 5]

# insert a row from a pd.Series: the Series name becomes the row label and its
# keys are aligned to the column names, so the key order does not matter.
# If you don't want to pass a row label, wrap a plain dict as
# pd.DataFrame([row_dict]) and call pd.concat(..., ignore_index=True) instead.
row = pd.Series({'year': 2013, 'name': 'Zhu', 'reports': 30}, name='Maggie')
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# to_frame().T turns the Series into a one-row frame, and infer_objects()
# restores numeric dtypes so 'year'/'reports' are not upcast to object.
df = pd.concat([df, row.to_frame().T.infer_objects()])

# replacing an existing row by position works the same way, e.g.:
#   df.iloc[2] = ['Test', 2014, 9]

# insert a column from a list: needs exactly one value per row
df['age'] = [20, 31, 40, 31, 25, 11, 6]
# a scalar is broadcast to every row, no need to build a repeated list
df['test'] = 'test'

# insert a column at a given position with .insert(); 0 makes it the first column
df.insert(0, 'dept', 'Data Analytics')

In [71]:
# Column labels, obtained two equivalent ways: iterating a DataFrame yields its
# column labels, and .columns exposes them as an Index.
print(list(df))
print(df.columns.tolist())

# columns whose label contains one of the marker substrings
pii_markers = ('name', 'age')
pii = [label for label in df.columns if any(marker in label for marker in pii_markers)]
print(pii)

['dept', 'name', 'year', 'reports', 'age', 'test']
['dept', 'name', 'year', 'reports', 'age', 'test']
['name', 'age']


In [100]:
# Count the names recorded per year, then order the result by group size
# (largest first), breaking ties by the earlier year.
df_agg_df = (
    df.groupby('year')['name']
    .count()
    .reset_index(name='number of people')  # count column gets its final name directly
)
df_agg_sort = df_agg_df.sort_values(
    by=['number of people', 'year'],
    ascending=[False, True],
)
df_agg_sort

Unnamed: 0,year,number of people
1,2013,3
0,2012,2
2,2014,2


In [107]:
# Group, count, then pivot: value_counts() gives one count per (year, name)
# pair, and unstack() moves the inner index level (name) into the columns so
# each name becomes its own column with one row per year.
name_counts_by_year = df.groupby('year')['name'].value_counts()
# (year, name) pairs that never occur come out as missing values; they are
# filled with the literal string 'Nan' (a str placeholder, not a real NaN).
name_counts_by_year.unstack().fillna('Nan')

name,Amy,Jake,Jason,Molly,Tina,Wang,Zhu
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012,Nan,Nan,1,1,Nan,Nan,Nan
2013,Nan,Nan,Nan,Nan,1,1,1
2014,1,1,Nan,Nan,Nan,Nan,Nan


In [74]:
# number of missing values in each column
df.isna().sum()

dept       0
name       0
year       0
reports    0
age        0
test       0
dtype: int64

In [108]:
# display the current state of df (a bare last expression renders as the cell output)
df

Unnamed: 0,dept,name,year,reports,age,test
Cochice,Data Analytics,Jason,2012,4,20,test
Pima,Data Analytics,Molly,2012,24,31,test
Santa Cruz,Data Analytics,Tina,2013,31,40,test
Maricopa,Data Analytics,Jake,2014,2,31,test
Yuma,Data Analytics,Amy,2014,3,25,test
April,Data Analytics,Wang,2013,5,11,test
Maggie,Data Analytics,Zhu,2013,30,6,test


In [109]:
# duplicated() returns a boolean Series: True for every row whose 'age' value
# has already appeared in an earlier row.
duplicates = df.duplicated(subset='age')

# Each drop_duplicates() call below returns a new frame; df itself is not
# modified (no assignment, no inplace=True), and only the final call is the
# cell's displayed result.
# drop fully identical rows, keeping the first occurrence
df.drop_duplicates(keep='first')
# drop fully identical rows, keeping the last occurrence
df.drop_duplicates(keep='last')
# rows count as duplicates when 'name' matches; keep the last of each
df.drop_duplicates(subset=['name'], keep='last')

In [113]:
# bin edges in steps of ten: 0, 10, ..., 100
bins = list(range(0, 101, 10))
# assign each age to its interval; pd.cut closes intervals on the right by
# default, so an age of 20 lands in (10, 20]
df['bin'] = pd.cut(df['age'], bins=bins)