In [1]:
import pandas as pd
import numpy as np

# Introduction to Pandas

Pandas is a high-level data manipulation package built on top of NumPy.


# Series
A Series is a one dimensional array with labels (an index)

In [3]:
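# You will use libraries a lot. As you can see from the top of this notebook, aliasing your imports (pd, np) saves typing.
# You can also import specific names from a library so they can be used without the `pd.` prefix.
# Note: this does not save processing time - the whole package is still loaded either way.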
from pandas import Series, DataFrame

In [4]:
# Build a Series from a plain Python list; pandas supplies a default 0..n-1 index
x = pd.Series(data=[10, 20, 30, 40, 50])
x

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [5]:
# We can access the different components of a Series separately

# Accessing the index (the row labels)
x.index

RangeIndex(start=0, stop=5, step=1)

In [6]:
# Accessing the values as a NumPy array.
# .to_numpy() is what the pandas docs recommend over the older .values attribute.
x.to_numpy()

array([10, 20, 30, 40, 50])

In [7]:
# Accessing the dtype
# A Series has one dtype shared by all of its values; mixed Python objects
# can still be stored, but only under the generic `object` dtype
x.dtype

dtype('int64')

In [9]:
# Creating a Series with an explicit, labelled index:
# each value in `data` is paired with the name at the same position
data = [450, 650, 870]
Sales = Series(data=data, index=["Don", "Mike", "Edwin"])
Sales

Don      450
Mike     650
Edwin    870
dtype: int64

In [10]:
# Check the type: the bare `Series` name imported earlier is the same class as pd.Series
type(Sales)

pandas.core.series.Series

In [12]:
# The index of Sales holds the string labels we supplied (dtype object) rather than a default RangeIndex
Sales.index

Index(['Don', 'Mike', 'Edwin'], dtype='object')

# Accessing Values

In [13]:
# You can access a value using its index label
Sales["Don"]

np.int64(450)

In [44]:
# Position-based access is still available through .iloc
Sales.iloc[0] # keep the .iloc: plain Sales[0] on a label-indexed Series is deprecated in pandas 2.x

np.int64(450)

In [40]:
# Comparing a Series with a scalar is element-wise and returns a boolean Series
Sales>500

Don      False
Mike      True
Edwin     True
dtype: bool

In [20]:
# Use the boolean Series as a mask to keep only the matching values
Sales[Sales>500]

Mike     650
Edwin    870
dtype: int64

In [21]:
# `in` checks membership against the index labels
"Don" in Sales

True

In [43]:
# What about this?
450 in Sales
# 450 is a value, not an index label, and `in` only looks at the index, so this returns False
# (to search the values instead: 450 in Sales.values)

False

# What about Dictionaries?


In [23]:
# A Series can be converted to a dictionary (the index labels become the keys)
Sales_dict = Sales.to_dict()
Sales_dict

{'Don': 450, 'Mike': 650, 'Edwin': 870}

In [24]:
# You can do the reverse: the dict keys become the index labels
Sales_ser = Series(Sales_dict)
Sales_ser

Don      450
Mike     650
Edwin    870
dtype: int64

In [37]:
# A new Series can be built from an existing one by passing the labels we want.
# Labels that are NOT in the original index (here "Sally" and "Lucy") are assigned NaN.
new_sales = Series(data=Sales, index=["Don", "Mike", "Sally", "Edwin", "Lucy"])

In [38]:
# Flag the missing entries: True wherever the value is null (NaN)
new_sales.isna()

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

# Naming components in a Series

In [29]:
# Name the index (this modifies Sales in place)
Sales.index.name = "Sales person"
Sales

Sales person
Don      450
Mike     650
Edwin    870
dtype: int64

In [30]:
# Naming a Series
Sales.name = "Total tv sales"
Sales

Sales person
Don      450
Mike     650
Edwin    870
Name: Total tv sales, dtype: int64

# DataFrames
DataFrames are 2D, mutable, potentially heterogeneous tabular data structures. A DataFrame has two labelled axes (rows and columns).

## Creating a DataFrame (DF)

In [35]:
# Creating a DF from a list of rows: each inner list is one [name, age] record
data = [
    ["Adrian", 20],
    ["Bethany", 23],
    ["Chloe", 41],
]

# `columns` supplies the column labels; no dtype is passed here, so pandas infers it
df = pd.DataFrame(data=data, columns=["Name", "Age"])

In [36]:
# Display the DataFrame
df

Unnamed: 0,Name,Age
0,Adrian,20
1,Bethany,23
2,Chloe,41


## DF from dict



In [57]:
# A dict maps keys to values. Dicts are mutable, keep insertion order, and the
# values may be of any type. Here each key is a column name and each value is
# the list of entries for that column.
members_dict = {
    "Name": ["Richard", "Alan", "Zoe", "Hannah"],
    "ID": ["2501", "2502", "2503", "2504"],
    "Location": ["London", "Newcastle", "Stoke", "Liverpool"],
}


In [58]:
# Print the dict to confirm its contents
print(members_dict)

{'Name': ['Richard', 'Alan', 'Zoe', 'Hannah'], 'ID': ['2501', '2502', '2503', '2504'], 'Location': ['London', 'Newcastle', 'Stoke', 'Liverpool']}


In [52]:
# .get(key) looks up the value stored under a single key
print(members_dict.get('ID'))

['2501', '2502', '2503', '2504']


In [54]:
# List of the dict's keys
print(list(members_dict.keys()))

['Name', 'ID', 'Location']


In [55]:
# List of (key, value) tuples, one per entry in the dict
print(list(members_dict.items()))

[('Name', ['Richard', 'Alan', 'Zoe', 'Hannah']), ('ID', ['2501', '2502', '2503', '2504']), ('Location', ['London', 'Newcastle', 'Stoke', 'Liverpool'])]


In [61]:
# Building the DF takes one line with DataFrame.from_dict: each key becomes a column
df_members = pd.DataFrame.from_dict(members_dict)

In [62]:
# Display the DataFrame
df_members

Unnamed: 0,Name,ID,Location
0,Richard,2501,London
1,Alan,2502,Newcastle
2,Zoe,2503,Stoke
3,Hannah,2504,Liverpool


## Adding custom indexes


In [66]:
# Changing the index is done with the `set_index` method
# To use a column's values as the index instead of the default numbers:
df_members.set_index("ID", inplace = False)
# The argument is the name of the column that becomes the index

Unnamed: 0_level_0,Name,Location
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
2501,Richard,London
2502,Alan,Newcastle
2503,Zoe,Stoke
2504,Hannah,Liverpool


In [67]:
# Because inplace is False, set_index returned a new DataFrame and df_members itself is unchanged.
# To keep the change, assign the result to a variable (or pass inplace=True to modify df_members directly)
df_members 

Unnamed: 0,Name,ID,Location
0,Richard,2501,London
1,Alan,2502,Newcastle
2,Zoe,2503,Stoke
3,Hannah,2504,Liverpool


## Create DF from a list of dicts

In [68]:
# Sample data: a list of dicts, where each dict is one row keyed by column name
list_of_dicts = [
    {"A": "How", "B": "to", "C": "create"},
    {"A": "3", "B": "2", "C": "5"},
]
list_of_dicts

[{'A': 'How', 'B': 'to', 'C': 'create'}, {'A': '3', 'B': '2', 'C': '5'}]

In [70]:
# Use DataFrame.from_records: each dict in the list becomes one row (the keys are the column names)
df_LOD =pd.DataFrame.from_records(list_of_dicts)
print(df_LOD)

     A   B       C
0  How  to  create
1    3   2       5


 # Create DF from a Series
 Using syntax like the previous conversions, you can use
 pd.DataFrame(name_of_series) (or name_of_series.to_frame()) to get a DF from a Series.
 To add a Series to a DF as a new column, use pd.concat([name_of_df, name_of_series], axis=1).
 