In [1]:
#Importing the pandas library and aliasing it as 'pd' for quicker coding
import pandas as pd

In [2]:
# A Series is one of the core pandas structures: a one-dimensional, labelled
# sequence that behaves much like a regular Python list.
names = ['Noah', 'Hari', 'Michael', 'Zach', 'Pralad']
mySeries = pd.Series(names)
print(mySeries)

# A single element is retrieved by indexing with its label (0..4 by default here).
first_name = mySeries[0]
print(first_name)

# A slice returns a smaller Series; as with lists, the end position is excluded.
middle_names = mySeries[1:3]
print(middle_names)

# A list of labels inside the brackets (hence the double brackets) selects
# elements that are not next to each other.
picked_names = mySeries[[0, 1, 3]]
print(picked_names)

0       Noah
1       Hari
2    Michael
3       Zach
4     Pralad
dtype: object
Noah
1       Hari
2    Michael
dtype: object
0    Noah
1    Hari
3    Zach
dtype: object


In [3]:
# A boolean mask (one True/False per element) keeps only the elements where the
# condition holds; here it drops every entry equal to 'Noah'.
parameter = mySeries.ne('Noah')
# The condition could also be written directly inside the brackets instead of
# being stored in a variable first.
print(mySeries[parameter])

1       Hari
2    Michael
3       Zach
4     Pralad
dtype: object


In [4]:
# A dictionary converts directly into a Series: the keys become the index
# labels and the values become the data.
myDict = {
    'New York': 500,
    'Chicago': 700,
    'San Francisco': 900,
    'Los Angeles': 300,
}
cities = pd.Series(myDict)
print(cities)

# Selecting several entries by label works with string labels too.
selected_cities = cities[['Chicago', 'San Francisco']]
print(selected_cities)

# Boolean conditions filter in exactly the same way as before.
below_600 = cities < 600
print(cities[below_600])

New York         500
Chicago          700
San Francisco    900
Los Angeles      300
dtype: int64
Chicago          700
San Francisco    900
dtype: int64
New York       500
Los Angeles    300
dtype: int64


In [5]:
# A boolean mask can also be the target of an assignment: every entry whose
# value is below 600 is overwritten with 15000 (this modifies `cities` itself).
needs_update = cities < 600
cities[needs_update] = 15000
print(cities)

New York         15000
Chicago            700
San Francisco      900
Los Angeles      15000
dtype: int64


In [6]:
# The other main pandas structure is the DataFrame. Think of it as a database table.

# When starting from scratch, a DataFrame can be built from a dict that maps
# each column name to its list of values: two columns (apples, oranges), 4 rows.
data = {
    'apples': [3, 2, 0, 1],
    'oranges': [4, 5, 0, 2],
}
myDataFrame = pd.DataFrame(data)
print(myDataFrame)

# By default the index on the left-hand side is just incrementing numbers,
# but custom row labels can be supplied through the `index` argument.
months = ['January', 'February', 'March', 'April']
monthlyPurchases = pd.DataFrame(data, index=months)
print(monthlyPurchases)

# `loc` looks a row up by its label; here, the purchases made in April.
april_purchases = monthlyPurchases.loc['April']
print(april_purchases)

   apples  oranges
0       3        4
1       2        5
2       0        0
3       1        2
          apples  oranges
January        3        4
February       2        5
March          0        0
April          1        2
apples     1
oranges    2
Name: April, dtype: int64


In [7]:
# The next logical step is loading a CSV-style file into a DataFrame. This
# example uses the Titanic dataset, which holds data about the people aboard
# the Titanic. Point the path below at your own dataset to follow along.
titanic_path = r'C:\Users\noahe\Documents\Titanic.txt'
titanic = pd.read_csv(titanic_path, sep='\t')  # This file is tab-separated

# head() shows the first 5 rows of the data set
first_rows = titanic.head()
print(first_rows)

  pclass    age   sex survived
0    1st  adult  male      yes
1    1st  adult  male      yes
2    1st  adult  male      yes
3    1st  adult  male      yes
4    1st  adult  male      yes


In [8]:
# Every column of a DataFrame is a Series, so indexing with a single column
# name returns a Series.
age_column = titanic['age']
print(age_column)

# Indexing with a list of column names returns a DataFrame instead.
age_and_class = titanic[['age', 'pclass']]
print(age_and_class)

# Boolean masks work on DataFrames as well. Two conditions are combined with
# `&` to keep only the children aboard the ship who did not survive.
is_child = titanic['age'] == 'child'
did_not_survive = titanic['survived'] == 'no'
print(titanic[is_child & did_not_survive])

0       adult
1       adult
2       adult
3       adult
4       adult
        ...  
2196    adult
2197    adult
2198    adult
2199    adult
2200    adult
Name: age, Length: 2201, dtype: object
        age pclass
0     adult    1st
1     adult    1st
2     adult    1st
3     adult    1st
4     adult    1st
...     ...    ...
2196  adult   crew
2197  adult   crew
2198  adult   crew
2199  adult   crew
2200  adult   crew

[2201 rows x 2 columns]
     pclass    age     sex survived
1250    3rd  child    male       no
1251    3rd  child    male       no
1252    3rd  child    male       no
1253    3rd  child    male       no
1254    3rd  child    male       no
1255    3rd  child    male       no
1256    3rd  child    male       no
1257    3rd  child    male       no
1258    3rd  child    male       no
1259    3rd  child    male       no
1260    3rd  child    male       no
1261    3rd  child    male       no
1262    3rd  child    male       no
1263    3rd  child    male       no
1264    3rd  c

In [9]:
# Rows can be sliced with the usual list slicing syntax; the end position is
# excluded, so this shows rows 320 through 329.
rows_320_to_329 = titanic[320:330]
print(rows_320_to_329)

    pclass    age     sex survived
320    1st  child    male      yes
321    1st  child    male      yes
322    1st  child    male      yes
323    1st  child    male      yes
324    1st  child  female      yes
325    2nd  adult    male      yes
326    2nd  adult    male      yes
327    2nd  adult    male      yes
328    2nd  adult    male      yes
329    2nd  adult    male      yes


In [10]:
# pandas offers a variety of SQL-like operations, such as GROUP BY.
# Here the rows are grouped by age group so they can be counted.
byAge = titanic.groupby('age')

# size() returns the number of rows in each group
age_counts = byAge.size()
print(age_counts)

# The same count, this time grouped by sex
sex_counts = titanic.groupby('sex').size()
print(sex_counts)

# Passing a list of column names groups by several columns at once
grouping_columns = ['survived', 'sex']
survival_by_sex = titanic.groupby(grouping_columns).size()
print(survival_by_sex)

age
adult    2092
child     109
dtype: int64
sex
female     470
male      1731
dtype: int64
survived  sex   
no        female     126
          male      1364
yes       female     344
          male       367
dtype: int64


In [11]:
#There's much more to pandas, but this should be enough to get you started