In [1]:
person = {
    "first" : "Corey",
    "last" : "Schafer",
    "email" : "CoreyMSchafer@gmail.com"
}

In [2]:
people = {
    "first" : ["Corey"],
    "last" : ["Schafer"],
    "email" : ["CoreyMSchafer@gmail.com"]
}

In [3]:
people = {
    "first" : ["Corey","Jane", "John"],
    "last" : ["Schafer", "Doe", "Doe"],
    "email" : ["CoreyMSchafer@gmail.com","JaneDoe@email.com", "JohnDoe@email.com"]
}

In [4]:
people["email"]

['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com']

In [5]:
import pandas as pd
df = pd.DataFrame(people)
df
df_original = df

In [6]:
df["first"] #Returns as series

0    Corey
1     Jane
2     John
Name: first, dtype: object

In [7]:
df.email

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
2          JohnDoe@email.com
Name: email, dtype: object

In [8]:
df[["first", "last"]]

Unnamed: 0,first,last
0,Corey,Schafer
1,Jane,Doe
2,John,Doe


In [9]:
df[["first"]] #Returns as DataFrame

Unnamed: 0,first
0,Corey
1,Jane
2,John


In [10]:
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [11]:
df.iloc[[0 , 1]]

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com


In [12]:
df.iloc[:,[2]]

Unnamed: 0,email
0,CoreyMSchafer@gmail.com
1,JaneDoe@email.com
2,JohnDoe@email.com


In [13]:
df.loc[[0,1], ["email"]]      #For now looks the same as .iloc[]

Unnamed: 0,email
0,CoreyMSchafer@gmail.com
1,JaneDoe@email.com


In [14]:
df.set_index('email',inplace=True)         #Set index

In [15]:
df                       #Doesn't change, must use 'inplace' argument

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
CoreyMSchafer@gmail.com,Corey,Schafer
JaneDoe@email.com,Jane,Doe
JohnDoe@email.com,John,Doe


In [16]:
df.index

Index(['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com'], dtype='object', name='email')

In [17]:
df.loc['CoreyMSchafer@gmail.com', 'last']

'Schafer'

In [18]:
df.iloc[0]                   #.iloc still works, .loc does not

first      Corey
last     Schafer
Name: CoreyMSchafer@gmail.com, dtype: object

In [19]:
df.reset_index(inplace = True)           #Resets index to numerical values

In [20]:
df['last'] == 'Doe'

0    False
1     True
2     True
Name: last, dtype: bool

In [21]:
df[['last']] =='Doe'                   #Displays as dataframe due to double brackets

Unnamed: 0,last
0,False
1,True
2,True


In [22]:
filt = (df['last'] == 'Schafer') | (df['first'] == 'John')      #Parenthesis is just for ease of reading

In [23]:
df.loc[filt, 'email']              #Same thing as above. Can grab specific columns from return too!

0    CoreyMSchafer@gmail.com
2          JohnDoe@email.com
Name: email, dtype: object

In [24]:
df.loc[~filt, 'email']            #Tilde means 'does not match'

1    JaneDoe@email.com
Name: email, dtype: object

In [25]:
df.columns = ['email_address','first_name', 'last_name', ]       #Change column names

In [26]:
df.columns = [x.upper() for x in df.columns]
df

Unnamed: 0,EMAIL_ADDRESS,FIRST_NAME,LAST_NAME
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [27]:
df.columns = df.columns.str.replace('_' ,  ' ')
df

Unnamed: 0,EMAIL ADDRESS,FIRST NAME,LAST NAME
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [28]:
df.rename(columns = {'FIRST NAME' : 'first', 'LAST NAME' : 'last', 'EMAIL ADDRESS' : 'email'}, inplace = True)
df

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [29]:
df.loc[2] = ['JohnSmith@email.com','John', 'Smith'] 
df

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnSmith@email.com,John,Smith


In [30]:
df.loc[2,['last', 'email']] = ['Doe', 'JohnDoe@email.com']
df

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [31]:
df.loc[2, 'last'] = 'Smith'                   #can also use '.at[]' as well
df

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Smith


In [32]:
filt = (df['email'] == 'JohnDoe@email.com')               #Throws an error
df[filt]['last'] = 'Smith'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [33]:
filt = (df['email'] == 'JohnDoe@email.com')               #Alternative way using '.loc' that works
df.loc[filt,'last'] = 'Smith'
df

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Smith


In [34]:
df['email'].str.lower()      #Returns series

0    coreymschafer@gmail.com
1          janedoe@email.com
2          johndoe@email.com
Name: email, dtype: object

In [35]:
df['email'] = df['email'].str.lower()        #Assigns to column
df

Unnamed: 0,email,first,last
0,coreymschafer@gmail.com,Corey,Schafer
1,janedoe@email.com,Jane,Doe
2,johndoe@email.com,John,Smith


In [36]:
df['email'].apply(len)                      #You can pass functions/methods to '.apply'

0    23
1    17
2    17
Name: email, dtype: int64

In [37]:
def my_func(email):
    return email.upper()

In [38]:
df['email'].apply(my_func)                 #Use '.apply' and pass function

0    COREYMSCHAFER@GMAIL.COM
1          JANEDOE@EMAIL.COM
2          JOHNDOE@EMAIL.COM
Name: email, dtype: object

In [39]:
df['email'] = df['email'].apply(my_func)
df

Unnamed: 0,email,first,last
0,COREYMSCHAFER@GMAIL.COM,Corey,Schafer
1,JANEDOE@EMAIL.COM,Jane,Doe
2,JOHNDOE@EMAIL.COM,John,Smith


In [40]:
df['email'] = df['email'].apply(lambda x: x.lower())
df

Unnamed: 0,email,first,last
0,coreymschafer@gmail.com,Corey,Schafer
1,janedoe@email.com,Jane,Doe
2,johndoe@email.com,John,Smith


In [41]:
df.apply(len)                             #Note that '.apply' only acts on Series and not DataFrames

email    3
first    3
last     3
dtype: int64

In [42]:
df.apply(pd.Series.min)

email    coreymschafer@gmail.com
first                      Corey
last                         Doe
dtype: object

In [43]:
df.apply(lambda x: x.min())                   #Same thing but with a lambda function

email    coreymschafer@gmail.com
first                      Corey
last                         Doe
dtype: object

In [44]:
df.applymap(len)                            #Works on whole DataFrame.

Unnamed: 0,email,first,last
0,23,5,7
1,17,4,3
2,17,4,5


In [45]:
df.applymap(str.lower)

Unnamed: 0,email,first,last
0,coreymschafer@gmail.com,corey,schafer
1,janedoe@email.com,jane,doe
2,johndoe@email.com,john,smith


In [46]:
df['first'].map({'Corey' : 'Chris', 'Jane': 'Mary'})

0    Chris
1     Mary
2      NaN
Name: first, dtype: object

In [47]:
df['first'].replace({'Corey' : 'Chris', 'Jane': 'Mary'})

0    Chris
1     Mary
2     John
Name: first, dtype: object

In [48]:
df['first'] = df['first'].replace({'Corey' : 'Chris', 'Jane': 'Mary'})
df

Unnamed: 0,email,first,last
0,coreymschafer@gmail.com,Chris,Schafer
1,janedoe@email.com,Mary,Doe
2,johndoe@email.com,John,Smith


In [49]:
df = df_original
df

Unnamed: 0,email,first,last
0,coreymschafer@gmail.com,Chris,Schafer
1,janedoe@email.com,Mary,Doe
2,johndoe@email.com,John,Smith


In [50]:
df['first'] + ' ' + df['last']

0    Chris Schafer
1         Mary Doe
2       John Smith
dtype: object

In [51]:
df['full_name'] = df['first'] + ' ' + df['last']
df

Unnamed: 0,email,first,last,full_name
0,coreymschafer@gmail.com,Chris,Schafer,Chris Schafer
1,janedoe@email.com,Mary,Doe,Mary Doe
2,johndoe@email.com,John,Smith,John Smith


In [52]:
df.drop(columns = ['first','last'], inplace = True)
df

Unnamed: 0,email,full_name
0,coreymschafer@gmail.com,Chris Schafer
1,janedoe@email.com,Mary Doe
2,johndoe@email.com,John Smith


In [53]:
df['full_name'].str.split(' ', expand = True)

Unnamed: 0,0,1
0,Chris,Schafer
1,Mary,Doe
2,John,Smith


In [54]:
df[['first' , 'last']] = df['full_name'].str.split(' ', expand = True)
df

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Chris Schafer,Chris,Schafer
1,janedoe@email.com,Mary Doe,Mary,Doe
2,johndoe@email.com,John Smith,John,Smith


In [55]:
df.append({'first' : 'Tony'}, ignore_index = True)

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Chris Schafer,Chris,Schafer
1,janedoe@email.com,Mary Doe,Mary,Doe
2,johndoe@email.com,John Smith,John,Smith
3,,,Tony,


In [56]:
people = {
    'first' : ['Tony' , 'Steve'],
    'last' : ['Stark' , 'Rogers'],
    'email' : ['IronMan@avenge.com' , 'Cap@avenge.com']
}

df2 = pd.DataFrame(people)
df2

Unnamed: 0,first,last,email
0,Tony,Stark,IronMan@avenge.com
1,Steve,Rogers,Cap@avenge.com


In [57]:
df.append(df2, ignore_index = True)

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Chris Schafer,Chris,Schafer
1,janedoe@email.com,Mary Doe,Mary,Doe
2,johndoe@email.com,John Smith,John,Smith
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


In [58]:
df = df.append(df2, ignore_index = True)
df

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Chris Schafer,Chris,Schafer
1,janedoe@email.com,Mary Doe,Mary,Doe
2,johndoe@email.com,John Smith,John,Smith
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


In [59]:
df.drop(index = 4)

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Chris Schafer,Chris,Schafer
1,janedoe@email.com,Mary Doe,Mary,Doe
2,johndoe@email.com,John Smith,John,Smith
3,IronMan@avenge.com,,Tony,Stark


In [60]:
filt = df['last'] == 'Doe'
df.drop(index = df[filt].index, inplace = True)
df

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Chris Schafer,Chris,Schafer
2,johndoe@email.com,John Smith,John,Smith
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


# Reset DataSeries - Lesson 7

In [61]:
people = {
    "first" : ["Corey","Jane", "John","Adam"],
    "last" : ["Schafer", "Doe", "Doe", "Doe"],
    "email" : ["CoreyMSchafer@gmail.com","JaneDoe@email.com", "JohnDoe@email.com", "AdamDoe@email.com"]
}
df = pd.DataFrame(people)

In [62]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Adam,Doe,AdamDoe@email.com


In [63]:
df.sort_values(by = 'last', ascending = True)

Unnamed: 0,first,last,email
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Adam,Doe,AdamDoe@email.com
0,Corey,Schafer,CoreyMSchafer@gmail.com


In [64]:
df.sort_values(by = ['last', 'first'], ascending = [True,True], inplace = True)   
df

Unnamed: 0,first,last,email
3,Adam,Doe,AdamDoe@email.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
0,Corey,Schafer,CoreyMSchafer@gmail.com


In [65]:
df.sort_index()                           #Resets to order that we added names

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Adam,Doe,AdamDoe@email.com


In [66]:
df['last'].sort_values()

3        Doe
1        Doe
2        Doe
0    Schafer
Name: last, dtype: object

# Cleaning Data

In [67]:
import numpy as np
people = {
    "first" : ["Corey","Jane", "John","Chris", np.nan, None, 'NA'],
    "last" : ["Schafer", "Doe", "Doe", "Schafer", np.nan, np.nan, "Missing"],
    "email" : ["CoreyMSchafer@gmail.com","JaneDoe@email.com", "JohnDoe@email.com", None, np.nan, "Anonymous@email.com", 'NA'],
    "age" : ['33', '55', '63', '36', None, None, 'Missing']
}


In [68]:
df = pd.DataFrame(people)
df.replace('NA', np.nan, inplace = True)
df.replace('Missing', np.nan, inplace = True)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [69]:
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [70]:
df.dropna(axis = 'index', how = 'all', subset = ['last', 'email'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [71]:
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [72]:
df.fillna(0)                         #Useful for numeric data

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,Anonymous@email.com,0
6,0,0,0,0


In [73]:
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [74]:
type(np.nan)

float

In [75]:
df['age'] = df['age'].astype(float)

In [76]:
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [77]:
df['age'].mean()

46.75

# Reading and writing to different sources
