# Pandas for Starfleet

In [1]:
import pandas as pd
import numpy as np


This tutorial is aimed at teaching all essential functions for Pandas.
The Numpy library module is the basic foundation for Pandas, so by learning Pandas you are also learning Numpy, although there are some differences between the two. 

## Pandas **Series** 
- a Series is a column in a dataset

In [2]:
pandaSeries = pd.Series([5,10,15,20,25,30])
pandaSeries

# index values 0 to 5

0     5
1    10
2    15
3    20
4    25
5    30
dtype: int64

In [3]:
# to get just the values for the Series
# this returns a Numpy array
pandaSeries.values

array([ 5, 10, 15, 20, 25, 30])

In [4]:
# grab element by index
pandaSeries[3]
# this will grab the 3rd element 

20

## Pandas Series as a dict

In [5]:
# you can create a Pandas Series using a Python dictionary
# note: Pandas allows string index, not allowed in Numpy

pandaSeries = pd.Series([5,10,15,20,25,30],
                       index= ['A1','B2','C3','D4','E5','F6'])

pandaSeries

A1     5
B2    10
C3    15
D4    20
E5    25
F6    30
dtype: int64

In [6]:
# grab the element from the Series
pandaSeries['B2']

10

### Series **non-sequential** index 

In [7]:
# can create weird index 
# Pandas Series (non-sequential indices)

non_seq_indx = pd.Series([40,50,60,70],
                        index= [1.2, 1.0, 1.4, 1.3])

# the index is not in order
non_seq_indx

1.2    40
1.0    50
1.4    60
1.3    70
dtype: int64

In [9]:
non_seq_indx = pd.Series([40,50,60,70],
                        index= ['alpha','bravo','delta','gamma'])
non_seq_indx

alpha    40
bravo    50
delta    60
gamma    70
dtype: int64

In [115]:
# Python dictionary

# some planets of the Delta quadrant and population
population_dict = {'Drayan': 383332521,
                   'Etanian': 126448193,
                   'Garenor': 119651127,
                   'Kazon': 129552860,
                   'Nasari': 142882135,
                   ''
                  }

# TURN DICTIONARY TO PANDAS SERIES
population = pd.Series(population_dict)
#population

# population Series (created from dict) 
# selecting the column 'population'
p = pd.DataFrame(population, columns=['Population'])
p

Unnamed: 0,Population
Drayan,383332521
Etanian,126448193
Garenor,119651127
Kazon,129552860
Nasari,142882135


In [116]:
# access a dictionary element

# get the population for planet Kazon
population['Kazon']

129552860

In [117]:
# Slice the Panda Series for only Garenor to Nasari

population['Garenor':'Nasari']

Garenor    119651127
Kazon      129552860
Nasari     142882135
dtype: int64

In [118]:
# data can also be a dictionary, 
# index is by default sorted by keys

pd.Series( 
    {8:'Episilon-Tango-8',
     6:'Delta-Alpha-6', 
     3:'Tango-Bravo-3'} )

8    Episilon-Tango-8
6       Delta-Alpha-6
3       Tango-Bravo-3
dtype: object

## Making a DataFrame

In [119]:
# drinks served at Quark's bar on DS9
# name and alcohol by volume level %

beverages = pd.Series({
    'Organia Crush': 6.7,
    'Risa Tequila Splash': 7.3,
    'Trill Swill': 4.5,
    'Barzan Man': 8.8,
    'Ardana Banana': 7.8,
    'Remus Rum Drum': 9.6,
    'Arrithean Gin Mint': 8.5,
    'Karemma Berry Punch': 9.9,
    'Dosi Tula-Mosh': 7.5,
    'Brax the Max': 5.5
    })

orders = pd.Series({
    'Organia Crush': 34,
    'Risa Tequila Splash': 247,
    'Trill Swill': 28,
    'Barzan Man': 93,
    'Ardana Banana': 181,
    'Remus Rum Drum': 9,
    'Arrithean Gin Mint': 145,
    'Karemma Berry Punch': 284,
    'Dosi Tula-Mosh': 147,
    'Brax the Max': 58
    })

Quarks_drinks = pd.DataFrame({
    'Alcohol_Level':beverages,
    'Num_of_orders':orders,
    })

Quarks_drinks

Unnamed: 0,Alcohol_Level,Num_of_orders
Organia Crush,6.7,34
Risa Tequila Splash,7.3,247
Trill Swill,4.5,28
Barzan Man,8.8,93
Ardana Banana,7.8,181
Remus Rum Drum,9.6,9
Arrithean Gin Mint,8.5,145
Karemma Berry Punch,9.9,284
Dosi Tula-Mosh,7.5,147
Brax the Max,5.5,58


In [120]:
Quarks_drinks['Num_of_orders']

Organia Crush           34
Risa Tequila Splash    247
Trill Swill             28
Barzan Man              93
Ardana Banana          181
Remus Rum Drum           9
Arrithean Gin Mint     145
Karemma Berry Punch    284
Dosi Tula-Mosh         147
Brax the Max            58
Name: Num_of_orders, dtype: int64

In [121]:
Quarks_drinks['Alcohol_Level']

Organia Crush          6.7
Risa Tequila Splash    7.3
Trill Swill            4.5
Barzan Man             8.8
Ardana Banana          7.8
Remus Rum Drum         9.6
Arrithean Gin Mint     8.5
Karemma Berry Punch    9.9
Dosi Tula-Mosh         7.5
Brax the Max           5.5
Name: Alcohol_Level, dtype: float64

## Pandas **loc** and **iloc**
- get elements from a dataframe using <code>.loc</code> for strings
and <code>.iloc</code> for integer index value

In [122]:
# to get the first 3 rows
# and get the 2 columns of dataframe
Quarks_drinks.iloc[:3, :2]

Unnamed: 0,Alcohol_Level,Num_of_orders
Organia Crush,6.7,34
Risa Tequila Splash,7.3,247
Trill Swill,4.5,28


In [123]:
# get just the drink Trill Swill by name
Quarks_drinks.loc['Trill Swill']

Alcohol_Level     4.5
Num_of_orders    28.0
Name: Trill Swill, dtype: float64

### *Fancy* indexing elements

In [125]:
# Fancy indexing

# let's find drinks that have alcohol levels greater than 7
# also show both columns
Quarks_drinks.loc[Quarks_drinks.Alcohol_Level > 7,
                 ['Alcohol_Level','Num_of_orders']]


Unnamed: 0,Alcohol_Level,Num_of_orders
Risa Tequila Splash,7.3,247
Barzan Man,8.8,93
Ardana Banana,7.8,181
Remus Rum Drum,9.6,9
Arrithean Gin Mint,8.5,145
Karemma Berry Punch,9.9,284
Dosi Tula-Mosh,7.5,147


In [130]:
# custom indexing

# the drink Remus Rum Drum now has 30 more orders
# let's change that value
# row, column = new_value
Quarks_drinks.iloc[5,1] = 9+30
Quarks_drinks

Unnamed: 0,Alcohol_Level,Num_of_orders
Organia Crush,6.7,34
Risa Tequila Splash,7.3,247
Trill Swill,4.5,28
Barzan Man,8.8,93
Ardana Banana,7.8,181
Remus Rum Drum,9.6,39
Arrithean Gin Mint,8.5,145
Karemma Berry Punch,9.9,284
Dosi Tula-Mosh,7.5,147
Brax the Max,5.5,58


In [139]:
# the dataframe slice
Quarks_drinks[3:7]

Unnamed: 0,Alcohol_Level,Num_of_orders
Barzan Man,8.8,93
Ardana Banana,7.8,181
Remus Rum Drum,9.6,39
Arrithean Gin Mint,8.5,145


In [140]:
Quarks_drinks['Barzan Man': 'Arrithean Gin Mint']

Unnamed: 0,Alcohol_Level,Num_of_orders
Barzan Man,8.8,93
Ardana Banana,7.8,181
Remus Rum Drum,9.6,39
Arrithean Gin Mint,8.5,145


In [141]:
# masking

# dataframe [ dataframe dot columnName <condition> ]
# get drinks that have higher than 8 alcohol level
Quarks_drinks[Quarks_drinks.Alcohol_Level >8]

Unnamed: 0,Alcohol_Level,Num_of_orders
Barzan Man,8.8,93
Remus Rum Drum,9.6,39
Arrithean Gin Mint,8.5,145
Karemma Berry Punch,9.9,284


### Create another DataFrame

In [143]:
# create a dataframe that has holosuite usage per holosuite
# we will randomly generate the values

# randint(60, 402) generate integer numbers from 60 to 401
# (4,4) is 4 rows by 4 column matrix

holosuites = pd.DataFrame(np.random.randint(60,402, (4,4)),
                          columns=['holosuite1',
                                   'holosuite2',
                                   'holosuite3',
                                   'holosuite4'] )

holosuites

Unnamed: 0,holosuite1,holosuite2,holosuite3,holosuite4
0,302,88,254,110
1,205,219,263,347
2,329,134,163,234
3,61,372,285,294


In [152]:
# dataframe [dataframe . column <condition>]
holosuites[holosuites.holosuite1 > 100]

Unnamed: 0,holosuite1,holosuite2,holosuite3,holosuite4
0,302,88,254,110
1,205,219,263,347
2,329,134,163,234


## Pandas DataFrame index with list

In [165]:
# using a list for making a dataframe
# use random integers for values, 3x3 matrix

# * note: number of items in lists must match in length! *

# the Cargo Bay is a busy place, 
# make a dataframe for listing the number of items
# for each Cargo Bay and type of item

CargoBay = pd.DataFrame(
    # generate integers from 2 to 30, 4x4 matrix
    np.random.randint(2,30, (4,4)),
    columns= ['CargoBay_1','CargoBay_2','CargoBay_3','CargoBay_4'],
    index=['weaponry','food','technology','other']
)
CargoBay

Unnamed: 0,CargoBay_1,CargoBay_2,CargoBay_3,CargoBay_4
weaponry,28,28,20,28
food,12,11,7,4
technology,15,14,20,18
other,17,21,27,3


### multi index Series or DataFrame

In [177]:
# let's make a docking log dataframe
# 3 docking stations
# 3 sections 
# 2 doors
# random values for count of ships that have docked

df = pd.DataFrame(np.random.randint(0,200,(6, 3)),
                  index=[['Section A', 
                          'Section A', 
                          'Section B', 
                          'Section B',
                          'Section C',
                          'Section C'], 
                         [1, 2, 1, 2, 1,2]],
                  columns=['dock_1', 'dock_2','dock_3'])
df

Unnamed: 0,Unnamed: 1,dock_1,dock_2,dock_3
Section A,1,106,97,59
Section A,2,41,122,49
Section B,1,192,50,132
Section B,2,68,186,84
Section C,1,138,172,102
Section C,2,87,154,170


## **Merge & Join** dataframes

Merge & Join a Pandas Series or DataFrame. <code>pd.merge()</code>

- 1 to 1 join
- many to 1 join
- many to many join
- Merge options: 
    - left_on=None, 
    - right_on=None, 
    - left_index=False, 
    - right_index=False

## one to one **Join**

In [184]:
# one to one dataframe join

df1 = pd.DataFrame({
    'Officers': [
        'Dax', 'Sisko', 'Nerys','Worf', 'OBrien','Bashir','Odo'], # = 7
    'Department': [
        'Science', 
        'Command', 
        'Command',
        'Operations', 
        'Engineering',
        'Medical',
        'Security'] #= 7
    })

df2 = pd.DataFrame({
    #= 7
    'Officers': ['Dax', 'Sisko','Nerys', 'Worf', 'OBrien','Bashir','Odo'],
    #= 7
    'Honours': [14, 3, 0, 5, 4, 2,0]})

# NOTE: the Series/column 'Officers' is the key for joining tables
df3 = pd.merge(df1, df2) 
df3

Unnamed: 0,Officers,Department,Honours
0,Dax,Science,14
1,Sisko,Command,3
2,Nerys,Command,0
3,Worf,Operations,5
4,OBrien,Engineering,4
5,Bashir,Medical,2
6,Odo,Security,0


## Many to one **Join**

In [208]:
df4 = pd.DataFrame({
    'Officers': [
        'Dax', 'Sisko', 'Nerys','Worf', 'OBrien','Bashir','Odo'],
    'Department': [
        'Science', 
        'Command', 
        'Command',
        'Operations', 
        'Engineering',
        'Medical',
        'Security'
    ],
    'Combat_Experience': [
        1,1,1,1,1,0,0]})

#print(df3)


pd.merge(df3,df4)

Unnamed: 0,Officers,Department,Honours,Combat_Experience
0,Dax,Science,14,1
1,Sisko,Command,3,1
2,Nerys,Command,0,1
3,Worf,Operations,5,1
4,OBrien,Engineering,4,1
5,Bashir,Medical,2,0
6,Odo,Security,0,0


## Many to Many **Join**

In [207]:
df5 = pd.DataFrame({
    'Officers': [
        'Dax', 'Sisko', 'Nerys','Worf', 'OBrien','Bashir','Odo'],
    'Department': [
        'Science', 
        'Command', 
        'Command',
        'Operations', 
        'Engineering',
        'Medical',
        'Security'],
    
    'Skills': [
        'Geospatial Calculus, Problem Solving',
        'Political & Religious diplomacy',
        'Planning & Execution',
        'Combat & Training',
        'Systems Problem Solving',
        'Diagnostics',
        'Criminal Justice'
    ]}
    )

#print(df1)
pd.merge(df3, df5)

Unnamed: 0,Officers,Department,Honours,Skills
0,Dax,Science,14,"Geospatial Calculus, Problem Solving"
1,Sisko,Command,3,Political & Religious diplomacy
2,Nerys,Command,0,Planning & Execution
3,Worf,Operations,5,Combat & Training
4,OBrien,Engineering,4,Systems Problem Solving
5,Bashir,Medical,2,Diagnostics
6,Odo,Security,0,Criminal Justice


### Pandas merge **on** keyword
- you can explicitly specify the name of the key column using the on keyword, which takes a column name or a list of column names:

In [209]:
pd.merge(df1, df2, on='Officers')

Unnamed: 0,Officers,Department,Honours
0,Dax,Science,14
1,Sisko,Command,3
2,Nerys,Command,0
3,Worf,Operations,5
4,OBrien,Engineering,4
5,Bashir,Medical,2
6,Odo,Security,0


In [215]:
df6 = pd.DataFrame({
    'Officers': [
        'Dax', 'Sisko', 'Nerys','Worf', 
        "O'Brien",'Bashir','Odo'],
    'Home_Planet': [
        'Trill','Earth','Bajor','Earth',
        'Earth','Earth','Bajor'] })

pd.merge(df5, df6, left_on='Officers', right_on='Officers')

Unnamed: 0,Officers,Department,Skills,Home_Planet
0,Dax,Science,"Geospatial Calculus, Problem Solving",Trill
1,Sisko,Command,Political & Religious diplomacy,Earth
2,Nerys,Command,Planning & Execution,Bajor
3,Worf,Operations,Combat & Training,Earth
4,Bashir,Medical,Diagnostics,Earth
5,Odo,Security,Criminal Justice,Bajor


In [221]:
# to align the officers to the left
# assign officers to set_index 
# this results in a stacked dataframe
df1a = df1.set_index('Officers')
df2a = df2.set_index('Officers')
df3a = df3.set_index('Officers')
df5a = df5.set_index('Officers')

df1a

Unnamed: 0_level_0,Department
Officers,Unnamed: 1_level_1
Dax,Science
Sisko,Command
Nerys,Command
Worf,Operations
OBrien,Engineering
Bashir,Medical
Odo,Security


In [217]:
df2a

Unnamed: 0_level_0,Honours
Officers,Unnamed: 1_level_1
Dax,14
Sisko,3
Nerys,0
Worf,5
OBrien,4
Bashir,2
Odo,0


In [220]:
df3a

Unnamed: 0_level_0,Department,Honours
Officers,Unnamed: 1_level_1,Unnamed: 2_level_1
Dax,Science,14
Sisko,Command,3
Nerys,Command,0
Worf,Operations,5
OBrien,Engineering,4
Bashir,Medical,2
Odo,Security,0


In [222]:
df5a

Unnamed: 0_level_0,Department,Skills
Officers,Unnamed: 1_level_1,Unnamed: 2_level_1
Dax,Science,"Geospatial Calculus, Problem Solving"
Sisko,Command,Political & Religious diplomacy
Nerys,Command,Planning & Execution
Worf,Operations,Combat & Training
OBrien,Engineering,Systems Problem Solving
Bashir,Medical,Diagnostics
Odo,Security,Criminal Justice


### **left_index** & **right_index** keywords

In [223]:
# LEFT & RIGHT INDEX 
# using the set_index dataframes above
pd.merge(df1a, df2a, left_index=True, right_index=True)

Unnamed: 0_level_0,Department,Honours
Officers,Unnamed: 1_level_1,Unnamed: 2_level_1
Dax,Science,14
Sisko,Command,3
Nerys,Command,0
Worf,Operations,5
OBrien,Engineering,4
Bashir,Medical,2
Odo,Security,0


In [224]:
# use the DataFrame to join table
df1a.join(df2a)

Unnamed: 0_level_0,Department,Honours
Officers,Unnamed: 1_level_1,Unnamed: 2_level_1
Dax,Science,14
Sisko,Command,3
Nerys,Command,0
Worf,Operations,5
OBrien,Engineering,4
Bashir,Medical,2
Odo,Security,0


In [225]:
# RIGHT ON
pd.merge(df1a, df6, left_index=True, right_on='Officers')

Unnamed: 0,Department,Officers,Home_Planet
0,Science,Dax,Trill
1,Command,Sisko,Earth
2,Command,Nerys,Bajor
3,Operations,Worf,Earth
5,Medical,Bashir,Earth
6,Security,Odo,Bajor


### INNER JOIN 
- column value that is shared between ALL tables

In [253]:
df7 = pd.DataFrame({
    'ENT_1701': [
        'LaForge',
        "O'Brien", 
        'Worf',
        'Crusher'
    ],  
    'Conflicts': [
        'Romulans', 'Cardassians','Borg','Borg']},
    columns=['ENT_1701', 'Conflicts'])


df8 = pd.DataFrame({
    'DS9': [
        "O'Brien",
        'Bashir',
        'Worf',
        'Dax',
      ], 
    'Conflicts': [
        'Cardassians','Klingons', 'Dominion','Tosk']},
    columns=['DS9', 'Conflicts'])


pd.merge(df7, df8) 
# shows only common items between both tables

Unnamed: 0,ENT_1701,Conflicts,DS9
0,O'Brien,Cardassians,O'Brien


### **inner, outer, left, right Join**

In [254]:
pd.merge(df7, df8, how='inner')

Unnamed: 0,ENT_1701,Conflicts,DS9
0,O'Brien,Cardassians,O'Brien


In [255]:
pd.merge(df7, df8, how='outer')

Unnamed: 0,ENT_1701,Conflicts,DS9
0,LaForge,Romulans,
1,O'Brien,Cardassians,O'Brien
2,Worf,Borg,
3,Crusher,Borg,
4,,Klingons,Bashir
5,,Dominion,Worf
6,,Tosk,Dax


In [256]:
pd.merge(df7, df8, how='left')

Unnamed: 0,ENT_1701,Conflicts,DS9
0,LaForge,Romulans,
1,O'Brien,Cardassians,O'Brien
2,Worf,Borg,
3,Crusher,Borg,


In [257]:
pd.merge(df7, df8, how='right')

Unnamed: 0,ENT_1701,Conflicts,DS9
0,O'Brien,Cardassians,O'Brien
1,,Klingons,Bashir
2,,Dominion,Worf
3,,Tosk,Dax


In [261]:
df9 = pd.DataFrame({'name': [
    'Picard', 'Riker', 'Worf', 'LaForge'],
                    'clearance': [1, 2, 3, 4]})
df10 = pd.DataFrame({'name': [
    'Picard', 'Riker', 'Worf', 'LaForge'],
                    'clearance': [3, 1, 4, 2]})

# both DataFrames have 'rank' 
# Pandas appends a suffix to avoid errors
pd.merge(df9, df10, on="name")

Unnamed: 0,name,clearance_x,clearance_y
0,Picard,1,3
1,Riker,2,1
2,Worf,3,4
3,LaForge,4,2


In [262]:
# customize the suffix 

pd.merge(df9, df10, on="name", suffixes=['_L','_R'])

Unnamed: 0,name,clearance_L,clearance_R
0,Picard,1,3
1,Riker,2,1
2,Worf,3,4
3,LaForge,4,2


this is the end of the Pandas basics. Next is working with data. There are many more things to do with Pandas that I left out and so for further discovery, read the Pandas documentation with a simple web search.