# Simple Data Manipulaition With Pandas

### Creating a dataframe from a dictionary

In [1]:
import pandas as pd
df = pd.DataFrame(data= {'name':['john', 'mary', 'peter','jeff','bill', 'lisa'], 'age':[23, 78, 22, 19, 45, 33],
                         'state': ['iowa', 'dc', 'california', 'texas', 'washington', 'dc'], 'num_children': [2, 2, 0, 1, 2, 1],
                        'num_pets' : [0, 4, 0, 5, 0, 0]})

### Listing out the headers/columns of a dataframe

In [2]:
df.columns

Index(['name', 'age', 'state', 'num_children', 'num_pets'], dtype='object')

### Slicing dataframe columns

In [3]:
df[['name', 'age', 'state']]

Unnamed: 0,name,age,state
0,john,23,iowa
1,mary,78,dc
2,peter,22,california
3,jeff,19,texas
4,bill,45,washington
5,lisa,33,dc


### Slcing dataframe rows

In [4]:
print(df.loc[2:4])
print('-------')
print(df.loc[2:4], 'name')
print('-------')
print(df.loc[3:4, ['name', 'age']])
print('---------')
print(df.loc[:2])

    name  age       state  num_children  num_pets
2  peter   22  california             0         0
3   jeff   19       texas             1         5
4   bill   45  washington             2         0
-------
    name  age       state  num_children  num_pets
2  peter   22  california             0         0
3   jeff   19       texas             1         5
4   bill   45  washington             2         0 name
-------
   name  age
3  jeff   19
4  bill   45
---------
    name  age       state  num_children  num_pets
0   john   23        iowa             2         0
1   mary   78          dc             2         4
2  peter   22  california             0         0


### Slicing columns and rows without names

In [5]:
print(df.iloc[:,0])
print('-------')
print(df.iloc[1])

0     john
1     mary
2    peter
3     jeff
4     bill
5     lisa
Name: name, dtype: object
-------
name            mary
age               78
state             dc
num_children       2
num_pets           4
Name: 1, dtype: object


### Filtered or conditional selection of dataframe

In [6]:
print(df[df['age'] > 30])
print('----OR----')
print(df[df.age > 30])

   name  age       state  num_children  num_pets
1  mary   78          dc             2         4
4  bill   45  washington             2         0
5  lisa   33          dc             1         0
----OR----
   name  age       state  num_children  num_pets
1  mary   78          dc             2         4
4  bill   45  washington             2         0
5  lisa   33          dc             1         0


### Slicing based on people with more pets than children

In [7]:
df[df['num_pets'] > df['num_children']]

Unnamed: 0,name,age,state,num_children,num_pets
1,mary,78,dc,2,4
3,jeff,19,texas,1,5


### Who is older than 40 and has pets. 

In [8]:
df[(df['num_pets']>0) & (df['age']>40)]

Unnamed: 0,name,age,state,num_children,num_pets
1,mary,78,dc,2,4


### You can find the number of rows in a dataframe by using the len function

In [9]:
print(len(df[(df['num_pets']>0) & (df['age']>40)]))
print('_ _ _')
len(df)

1
_ _ _


6

### You can use multiple operators for comarisions

In [10]:
df[(df['age']>40) | (df['num_pets']>0)]

Unnamed: 0,name,age,state,num_children,num_pets
1,mary,78,dc,2,4
3,jeff,19,texas,1,5
4,bill,45,washington,2,0


### You can also drop columns and rows of a dataframe without affecting the original dataframe. (axis=0 are rows while axis=1 are columns)

In [11]:
print(df.drop(['age', 'num_children'], axis=1))
print('--------')
print(df.drop([1, 4], axis=0))
print('--------')
print(df)

    name       state  num_pets
0   john        iowa         0
1   mary          dc         4
2  peter  california         0
3   jeff       texas         5
4   bill  washington         0
5   lisa          dc         0
--------
    name  age       state  num_children  num_pets
0   john   23        iowa             2         0
2  peter   22  california             0         0
3   jeff   19       texas             1         5
5   lisa   33          dc             1         0
--------
    name  age       state  num_children  num_pets
0   john   23        iowa             2         0
1   mary   78          dc             2         4
2  peter   22  california             0         0
3   jeff   19       texas             1         5
4   bill   45  washington             2         0
5   lisa   33          dc             1         0


### You can get more information about the data in dataframe

In [12]:
df.describe()

Unnamed: 0,age,num_children,num_pets
count,6.0,6.0,6.0
mean,36.666667,1.333333,1.5
std,22.384518,0.816497,2.345208
min,19.0,0.0,0.0
25%,22.25,1.0,0.0
50%,28.0,1.5,0.0
75%,42.0,2.0,3.0
max,78.0,2.0,5.0


### Finding the mean of multiple columns

In [13]:
print(len(df))
print('---AverageAge----')
print(df[['age', 'num_pets', 'num_children']].mean())
print('-------')

6
---AverageAge----
age             36.666667
num_pets         1.500000
num_children     1.333333
dtype: float64
-------


### Applying functions to multiple columns/rows

In [14]:
df[['age', 'num_pets', 'num_children']].apply(lambda col: sum(col), axis=0)

age             220
num_pets          9
num_children      8
dtype: int64

### Built in methods for calculations

In [15]:
df['age'].sum()

220

# Activity: Pandas vs. Python

In [16]:
import numpy as np
df = pd.read_csv('Titanic.csv')

FileNotFoundError: [Errno 2] File b'Titanic.csv' does not exist: b'Titanic.csv'

### Finding average female age not using pandas

In [None]:
ls_age = df['Age']
ls_gender = df['Sex']

def female_average_age(ls_age, ls_gender):
    female_ages = [age for age, gender in zip(ls_age, ls_gender) if not np.isnan(age) and gender == 'female']
    return sum(age for age in female_ages)/len(female_ages)
    
female_average_age(ls_age, ls_gender)

### Finding average female age with pandas

In [None]:
print(df[(df['Sex']=='female')]['Age'].mean())

# Pandas Manipulation

In [None]:
df = pd.DataFrame(data= {'name':['john', 'mary', 'peter','jeff','bill', 'lisa'], 'age':[23, 78, 22, 19, 45, 33],
                         'state': ['iowa', 'dc', 'california', 'texas', 'washington', 'dc'], 'num_children': [2, 2, 0, 1, 2, 1],
                        'num_pets' : [0, 4, 0, 5, 0, 0]})

### Serise vs. dataframe (serise are like list)

In [None]:
print(type(df[['age']]))
print(df[['age']])
print('-------')
print(type(df['age']))
print(df['age'])

### Changing vdataframe value using pandas

In [None]:
df[['age']].apply(lambda val: val*2)

### Multiplying columns and creaing new columns

In [None]:
df['new_col'] = df['age']*df['num_children']

### Sorting Dataframe by col values

In [None]:
df.sort_values('age', ascending= True)

### Select rows based on begining with j

In [None]:
df[df.apply(lambda row: row['name'].startswith('j'),axis=1)]

### Creating Dafaframes with tables

In [None]:
from collections import OrderedDict
from pandas import DataFrame
import pandas as pd
import numpy as np

table = OrderedDict((
    ("Item", ['Item0', 'Item0', 'Item1', 'Item1']),
    ('CType',['Gold', 'Bronze', 'Gold', 'Silver']),
    ('USD',  ['1$', '2$', '3$', '4$']),
    ('EU',   ['1€', '2€', '3€', '4€'])
))
d = DataFrame(table)

### Creaitng pivot tables from dataframes

In [None]:
p = d.pivot(index='Item', columns='CType', values='USD')