In [1]:
people = {
    "first": ["Corey", 'Jane', 'John'], 
    "last": ["Schafer", 'Doe', 'Doe'], 
    "email": ["CoreyMSchafer@gmail.com", 'JaneDoe@email.com', 'JohnDoe@email.com']
}

In [2]:
import pandas as pd

In [3]:
df = pd.DataFrame(people)

In [4]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [5]:
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [6]:
df.columns = ['first_name', 'last_name', 'email']

In [7]:
df

Unnamed: 0,first_name,last_name,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [8]:
df.columns = [x.lower() for x in df.columns]
df

Unnamed: 0,first_name,last_name,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [9]:
df.rename(columns={'first_name': 'first', 'last_name': 'last'}, inplace=True)
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [10]:
df.loc[2, 'last']

'Doe'

In [11]:
df.loc[2, 'last'] = 'Smith'

In [12]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Smith,JohnDoe@email.com


In [13]:
df.at[2, 'last'] = 'Doe'
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [14]:
filt = (df['email']=='JohnDoe@email.com')
df.loc[filt, 'last'] = 'Smith'
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Smith,JohnDoe@email.com


In [15]:
df['email'] = df['email'].str.lower()

In [16]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@gmail.com
1,Jane,Doe,janedoe@email.com
2,John,Smith,johndoe@email.com


## apply()

In [17]:
df['email'].apply(len)

0    23
1    17
2    17
Name: email, dtype: int64

In [18]:
def update_email(email):
    return email.upper()

In [19]:
df['email'] = df['email'].apply(update_email)

In [20]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,COREYMSCHAFER@GMAIL.COM
1,Jane,Doe,JANEDOE@EMAIL.COM
2,John,Smith,JOHNDOE@EMAIL.COM


In [21]:
df['email'] = df['email'].apply(lambda x: x.lower())

In [22]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@gmail.com
1,Jane,Doe,janedoe@email.com
2,John,Smith,johndoe@email.com


In [23]:
df.apply(pd.Series.min)

first                      Corey
last                         Doe
email    coreymschafer@gmail.com
dtype: object

## applymap()

In [24]:
df.applymap(len)

  df.applymap(len)


Unnamed: 0,first,last,email
0,5,7,23
1,4,3,17
2,4,5,17


In [25]:
df.applymap(str.upper)

  df.applymap(str.upper)


Unnamed: 0,first,last,email
0,COREY,SCHAFER,COREYMSCHAFER@GMAIL.COM
1,JANE,DOE,JANEDOE@EMAIL.COM
2,JOHN,SMITH,JOHNDOE@EMAIL.COM


## map()

In [26]:
df['first'].map({'Corey': 'Hitesh', 'Jane': 'JannyB'})


0    Hitesh
1    JannyB
2       NaN
Name: first, dtype: object

## replace()

In [27]:
df['first'].replace({'Corey': 'Hitesh', 'Jane': 'JannyB'})

0    Hitesh
1    JannyB
2      John
Name: first, dtype: object

In [28]:
df['first'] + ' ' + df['last']

0    Corey Schafer
1         Jane Doe
2       John Smith
dtype: object

In [29]:
df['full_name'] = df['first'] + ' ' + df['last']

In [30]:
df

Unnamed: 0,first,last,email,full_name
0,Corey,Schafer,coreymschafer@gmail.com,Corey Schafer
1,Jane,Doe,janedoe@email.com,Jane Doe
2,John,Smith,johndoe@email.com,John Smith


In [31]:
df.drop(columns=['first', 'last'], inplace=True)

In [32]:
df


Unnamed: 0,email,full_name
0,coreymschafer@gmail.com,Corey Schafer
1,janedoe@email.com,Jane Doe
2,johndoe@email.com,John Smith


In [33]:
df['full_name'].str.split(' ', expand=True)

Unnamed: 0,0,1
0,Corey,Schafer
1,Jane,Doe
2,John,Smith


In [34]:
df[['first', 'last']] = df['full_name'].str.split(' ', expand=True)

In [35]:
df

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@email.com,Jane Doe,Jane,Doe
2,johndoe@email.com,John Smith,John,Smith


In [36]:
new_row = pd.DataFrame([{'first': 'Tony'}])

# Append the new row using pd.concat
df = pd.concat([df, new_row], ignore_index=True)

In [37]:
df

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@email.com,Jane Doe,Jane,Doe
2,johndoe@email.com,John Smith,John,Smith
3,,,Tony,


In [38]:
people = {
    "first": ["Tony", 'Steve'], 
    "last": ["Stark", 'Rogers'], 
    "email": ["stark@gmail.com", 'rogers@email.com']
}

df2 = pd.DataFrame(people)

In [39]:
df2

Unnamed: 0,first,last,email
0,Tony,Stark,stark@gmail.com
1,Steve,Rogers,rogers@email.com


In [40]:
pd.concat([df, df2], ignore_index=True)

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@email.com,Jane Doe,Jane,Doe
2,johndoe@email.com,John Smith,John,Smith
3,,,Tony,
4,stark@gmail.com,,Tony,Stark
5,rogers@email.com,,Steve,Rogers


In [41]:
df = pd.concat([df, df2], ignore_index=True)

In [42]:
df

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@email.com,Jane Doe,Jane,Doe
2,johndoe@email.com,John Smith,John,Smith
3,,,Tony,
4,stark@gmail.com,,Tony,Stark
5,rogers@email.com,,Steve,Rogers


In [43]:
df.drop(index=3)

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@email.com,Jane Doe,Jane,Doe
2,johndoe@email.com,John Smith,John,Smith
4,stark@gmail.com,,Tony,Stark
5,rogers@email.com,,Steve,Rogers


In [44]:
df.drop(index=df[df['last']=='Doe'].index)

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
2,johndoe@email.com,John Smith,John,Smith
3,,,Tony,
4,stark@gmail.com,,Tony,Stark
5,rogers@email.com,,Steve,Rogers


In [45]:
df

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@email.com,Jane Doe,Jane,Doe
2,johndoe@email.com,John Smith,John,Smith
3,,,Tony,
4,stark@gmail.com,,Tony,Stark
5,rogers@email.com,,Steve,Rogers


In [46]:
people = {
    "first": ["Corey", 'Jane', 'John', 'Adam'], 
    "last": ["Schafer", 'Doe', 'Doe', 'Doe'], 
    "email": ["CoreyMSchafer@gmail.com", 'JaneDoe@email.com', 'JohnDoe@email.com', 'A@gmail.com']
}

In [47]:
people = {
    "first": ["Corey", 'Jane', 'John', 'Adam'], 
    "last": ["Schafer", 'Doe', 'Doe', 'Doe'], 
    "email": ["CoreyMSchafer@gmail.com", 'JaneDoe@email.com', 'JohnDoe@email.com', 'A@gmail.com']
}

df = pd.DataFrame(people)
df.head()

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Adam,Doe,A@gmail.com


In [48]:
# df.sort_values(by='last')
df.sort_values(by='last', ascending=False)

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Adam,Doe,A@gmail.com


In [49]:
df.sort_values(by=['last', 'first'], ascending=False)


Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
2,John,Doe,JohnDoe@email.com
1,Jane,Doe,JaneDoe@email.com
3,Adam,Doe,A@gmail.com


In [50]:
df.sort_values(by=['last', 'first'], ascending=[False, True], inplace=True)
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
3,Adam,Doe,A@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [51]:
df['last'].sort_values()

3        Doe
1        Doe
2        Doe
0    Schafer
Name: last, dtype: object

### Grouping and Aggregating Data

In [52]:
df.head()

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
3,Adam,Doe,A@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


### Cleaning DataSets

In [54]:
import numpy as np

In [69]:
people = {
    "first": ["Corey", 'Jane', 'John', 'Chris', np.nan, None, 'NA'], 
    "last": ["Schafer", 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'], 
    "email": ["CoreyMSchafer@gmail.com", 'JaneDoe@email.com', 'JohnDoe@email.com', None, np.nan, 'Anonymous@gmail.com', 'NA'],
    "age": ['33', '55', '63', '36', None , None, 'Missing']
}

df = pd.DataFrame(people)

df.replace('NA', np.nan, inplace=True)
df.replace('Missing', np.nan, inplace=True)

df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@gmail.com,
6,,,,


In [70]:
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [71]:
df.dropna(axis='index', how='all', subset=['first','email'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@gmail.com,


In [72]:
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [73]:
df.fillna('MISSING')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,MISSING,36
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,Anonymous@gmail.com,MISSING
6,MISSING,MISSING,MISSING,MISSING


In [74]:
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [75]:
type(np.nan)

float

In [76]:
#df['age'] = df['age'].astype(int)
df['age'] = df['age'].astype(float)

In [78]:
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [80]:
df['age'].mean()

46.75