In [2]:
import numpy as np
import pandas as pd

# Building Series

In [3]:
#Series are built on top of numpy arrays
#To create a series you need data (list) and labels (indicies)
#Series are 1 dimensional, dataframes are 2 dimensional
our_list = ['s', 'e', 'r', 'i', 'e', 's']
our_labels = [1,2,3,4,5,6]

#Creating the series:
series = pd.Series(data=our_list, index = our_labels)
series


1    s
2    e
3    r
4    i
5    e
6    s
dtype: object

In [4]:
#Making another series that automatically indexes your entries
array1 = np.array(['a', 'b', 'c', 'd', 'e', 'f', 
                  'g', 'h', 'i', 'j', 'k', 'l', 
                  'm', 'n', 'o', 'p', 'q', 'r', 
                  's', 't', 'u', 'v', 'w', 'x', 
                  'y', 'z', ])
series2 = pd.Series(array1)
series2

0     a
1     b
2     c
3     d
4     e
5     f
6     g
7     h
8     i
9     j
10    k
11    l
12    m
13    n
14    o
15    p
16    q
17    r
18    s
19    t
20    u
21    v
22    w
23    x
24    y
25    z
dtype: object

In [5]:
#You can make series out of dictionaries:

dictionary = {"planet":"Earth", "species":"human", "matter":"organic",}
series3 = pd.Series(dictionary)
series3


series3["planet"] #You can get data by using the label as your index
series3.dtype #Gets the data type for that series

##Can perform operations on series
data2 = np.array([1,2,3,4,5])
series5 = pd.Series(data2)

print("Addition:", series5 + series5)
print("Subtraction: ", series5 - series5)
print("Multiplication: ", series5 * series5)
print("Division: ", series5 / series5)

#Main difference between series and numpy array is series align by label. 
#You can also name a series 

data3 = data2*5
series6 = pd.Series(data3, name = 'Name of series 6')
series6.name


Addition: 0     2
1     4
2     6
3     8
4    10
dtype: int32
Subtraction:  0    0
1    0
2    0
3    0
4    0
dtype: int32
Multiplication:  0     1
1     4
2     9
3    16
4    25
dtype: int32
Division:  0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
dtype: float64


'Name of series 6'

# Building Dataframes
These are made up of multiple series that share the same index or label, and may contain multiple different data types. 

In [6]:
#If you use the print function to call a dataframe, it will not look as nice as when you just call the dataframe as the
#last statement in the cell. It will instead look like raw text in need of formatting. 
#Making another data frame
a = np.random.randint(10,50, size=(2,3))
dataframe1 = pd.DataFrame(a, ['A', 'B'], ['C', 'D','E']) #AB are the row labels, CDE are the column labels. 
dataframe1

Unnamed: 0,C,D,E
A,29,33,19
B,49,24,33


In [7]:
#Can make a dataframe with multiple series

d ={'one':pd.Series([1.,2.,3.,4.], index = ['a','b','c','d']),
    'two':pd.Series([2,4,6,8], index = ['a','b','c','d'])}
dataframe2 = pd.DataFrame(d)
dataframe2

#from_dict will accept column labels and lists
pd.DataFrame.from_dict(dict([('a', [1,2,3]), ('b', [4,5,6])]))
#orient (to the index) allows you to assign the keys as row labels and column labels seperately
pd.DataFrame.from_dict(dict([('a', [1,2,3]), ('b', [4,5,6])]),
                      orient = 'index', columns = [ 'one', 'two', 'three'])
                            
#finding out number of rows and columns in a dataframe
print(dataframe2.shape)

(4, 2)


# Editing and Retrieving Entries in a Dataframe

In [8]:
dataframe1

Unnamed: 0,C,D,E
A,29,33,19
B,49,24,33


In [9]:
dataframe1['C'] #Retrieving a column by calling its key

A    29
B    49
Name: C, dtype: int32

In [10]:
dataframe1[['C', 'D']] #Retrieving multiple columns by calling their keys in a list

Unnamed: 0,C,D
A,29,33
B,49,24


In [11]:
dataframe1.loc['A'] #Retrieving a row by calling its label 

C    29
D    33
E    19
Name: A, dtype: int32

In [12]:
dataframe1.iloc[0] #Retrieving the first row by calling its index position, 0

C    29
D    33
E    19
Name: A, dtype: int32

In [13]:
dataframe1.loc['A', 'C'] #Grab a cell with a row and a column

29

In [14]:
dataframe1.loc[['A', 'B'], ['D', 'E']]#Get multiple cells by defining the rows and columns wanted 

Unnamed: 0,D,E
A,33,19
B,24,33


In [15]:
#Can build new columns, let's say you want the total of the columns 
dataframe1['Total'] = dataframe1['C']+dataframe1['E']
dataframe1 #contains new column we just built

Unnamed: 0,C,D,E,Total
A,29,33,19,48
B,49,24,33,82


In [16]:
#Again but with multiplication
dataframe2['Mult'] = dataframe2['one'] * dataframe2['two']
dataframe2

Unnamed: 0,one,two,Mult
a,1.0,2,2.0
b,2.0,4,8.0
c,3.0,6,18.0
d,4.0,8,32.0


In [17]:
#Doing the same but with a row 
dict2 = {'C':44, 'D':45, 'E':46}
new_row = pd.Series(dict2, name='F')
dataframe1 = dataframe1.append(new_row)
dataframe1 #This only has a nan because our previously defined Total column wasn't set up to work with this new row

Unnamed: 0,C,D,E,Total
A,29.0,33.0,19.0,48.0
B,49.0,24.0,33.0,82.0
F,44.0,45.0,46.0,


In [18]:
#Can delete columns 
dataframe1.drop('Total',axis=1, inplace = True)
dataframe1 #Can only run this cell once obviously because once you delete Total you can't delete it again

Unnamed: 0,C,D,E
A,29.0,33.0,19.0
B,49.0,24.0,33.0
F,44.0,45.0,46.0


In [19]:
dataframe1.drop('B', axis=0, inplace = True)
dataframe1

Unnamed: 0,C,D,E
A,29.0,33.0,19.0
F,44.0,45.0,46.0


In [20]:
#Create a new column and make it the index
dataframe1['Sex'] = ['Men', 'Women']
dataframe1.set_index('Sex', inplace = True)
dataframe1

Unnamed: 0_level_0,C,D,E
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Men,29.0,33.0,19.0
Women,44.0,45.0,46.0


In [21]:
#How to reset index values back to numbers
#dataframe1.reset_index(inplace=True)
#dataframe1

In [22]:
dataframe2.assign(division=dataframe2['one'] / dataframe2['two'])

Unnamed: 0,one,two,Mult,division
a,1.0,2,2.0,0.5
b,2.0,4,8.0,0.5
c,3.0,6,18.0,0.5
d,4.0,8,32.0,0.5


In [23]:
#You can use ".assign()" to pass in a function so we'll do an anonymous function lambda here
dataframe2.assign(division=lambda x: (x['one'] / x['two']))
dataframe2



Unnamed: 0,one,two,Mult
a,1.0,2,2.0
b,2.0,4,8.0
c,3.0,6,18.0
d,4.0,8,32.0


In [24]:
#Can combine dataframes
dataframe3 = pd.DataFrame({'A': [1., np.nan, 3., np.nan]})
dataframe4 = pd.DataFrame({'A': [8., 9., 2., 4. ]})
dataframe3.combine_first(dataframe4)

Unnamed: 0,A
0,1.0
1,9.0
2,3.0
3,4.0


# Conditional Selection of Data

In [25]:
#Starting with a new dataframe
arr_2 = np.random.randint(10, 50, size = (2,3))
dataframe1 = pd.DataFrame(arr_2, ['A', 'B'], ['C', 'D', 'E'])
print (dataframe1)

    C   D   E
A  40  43  42
B  24  48  26


<!-- Tables -->
| Comparison Op. | Shorthand|
| -------- | --------------  |
|  Greater than | gt|                     
| Less than | lt |  
| Greater than or equal | ge|   
| Less than or equal | le|
|   Equal | eq|  
|Not Equal| ne|  


In [26]:
print('Greater than 40 \n', dataframe1 > 40)
print('Greater than 40 \n', dataframe1.gt(40.)      
#Works with lt, ge, le, eq, ne (less than, greater or equal, less or equal, equal, not equal)

SyntaxError: unexpected EOF while parsing (Temp/ipykernel_6432/3984872200.py, line 3)

In [27]:
#Can place conditions in brackets
bool_1 = dataframe1 >=45.0
dataframe1[bool_1]

Unnamed: 0,C,D,E
A,,,
B,,48.0,


In [28]:
#can also get booleans for a column
dataframe1['E'] > 40

A     True
B    False
Name: E, dtype: bool

In [29]:
#Suppose we want to know if the cell value for a column matches a condition
#stack up the dataframes and ask for the column and your condition:
dataframe1[dataframe1['E'] > 30]

Unnamed: 0,C,D,E
A,40,43,42


In [30]:
#Can focus on a column based on a resulting dataframe:
dataframe2 = dataframe1[dataframe1['E'] > 30]
dataframe2['C']

A    40
Name: C, dtype: int32

In [31]:
#Can stack the commands
print(dataframe1[dataframe1['E'] > 20]['C'])

A    40
B    24
Name: C, dtype: int32


In [32]:
#Can grab multiple different columns
print(dataframe1[dataframe1['E'] > 20][['C', 'D']])

    C   D
A  40  43
B  24  48


In [33]:
#Can select for multiple different conditions
arr_3 = np.array([[1,2,3], [4,5,6], [7,8,9]])
df_2= pd.DataFrame(arr_3, ['A', 'B', 'C'], ['X', 'Y', 'Z'])
print(df_2)
df_2[(df_2['X'] > 3) & (df_2['X'] < 7)] #Using and
# df_2[(df_2['X'] > 3) | (df_2['X'] < 7)] #Using or, which returns all values

   X  Y  Z
A  1  2  3
B  4  5  6
C  7  8  9


Unnamed: 0,X,Y,Z
B,4,5,6


# File I/O

In [34]:
cs_df = pd.read_csv('ComputerSales.csv')
cs_df

#If you make changes and want to save it to a csv file 
cs_df.to_csv('ComputerSales_backup.csv')

#Can also read data from an excel file
pd.read_excel('Financial Sample.xlsx', 0)

#Can also write to an excel file
cs_df.to_excel('ComputerSales_backup.xlsx')
#Can check if its been written
pd.read_excel('ComputerSales_backup.xlsx') #and indeed it was written

Unnamed: 0.1,Unnamed: 0,Sale ID,Contact,Sex,Age,State,Product ID,Product Type,Sale Price,Profit,Lead,Month,Year
0,0,1,Paul Thomas,M,43,OH,M01-F0024,Desktop,479.99,143.39,Website,January,2018
1,1,2,Margo Simms,F,37,WV,GT13-0024,Desktop,1249.99,230.89,Flyer 4,January,2018
2,2,3,Sam Stine,M,26,PA,I3670,Desktop,649.99,118.64,Website,February,2018
3,3,4,Moe Eggert,M,35,PA,I3593,Laptop,399.99,72.09,Website,March,2018
4,4,5,Jessica Elk,F,55,PA,15M-ED,Laptop,699.99,98.09,Flyer 4,March,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,97,98,Michelle Samms,F,46,NY,17-BY3613DX,Laptop,609.99,140.34,Flyer 3,November,2019
98,98,99,Mick Roberts,M,23,PA,BB980,Desktop,889.99,110.89,Flyer 2,May,2019
99,99,100,Ed Klondike,M,52,OH,15M-ED0023DX,Laptop,989.99,111.34,Email,November,2019
100,100,101,Moe Eggert,M,35,PA,GMA4000BST,Desktop,589.99,138.64,Website,May,2019


# Basics and Math

In [35]:
cs_df.head() #first 5 results
cs_df.tail() #last 5 results
cs_df[:2] #first two results, just use a slice
cs_df[:5:2] #first five results in steps of two
cs_df.index.array #gets indices
cs_df.to_numpy() #converts it all into a numpy array, which is very large
series2.array #covnerting a series to an array

<PandasArray>
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Length: 26, dtype: object

In [36]:
#Creating another dataframe to work with
dict_3 = {'one':pd.Series([1,2,3], index=['a', 'b', 'c']),
         'two':pd.Series([1,2,3,4], index=['a', 'b', 'c','d'])}
df_2 = pd.DataFrame(dict_3)
print(df_2)
#Can replace any nan values using fillna function
print(df_2.fillna(0, inplace = True)) #if there are any nan values they'll be replaced with zeros

#Get the values in row 2
row  = df_2.iloc[1]
#Can add items in row 2 to all the other rows including row 2
df_2.add(row, axis='columns')

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4
None


Unnamed: 0,one,two
a,3.0,3.0
b,4.0,4.0
c,5.0,5.0
d,2.0,6.0


In [37]:
#get column two and subtract that from all other columns
col = df_2['two']
df_2.sub(col, axis = 0)

Unnamed: 0,one,two
a,0.0,0
b,0.0,0
c,0.0,0
d,-4.0,0


In [38]:
#Creating a new dataframe to work with and use transform
df_5 = pd.DataFrame({'A':range(3), 'B': range(1,4)})
df_5

Unnamed: 0,A,B
0,0,1
1,1,2
2,2,3


In [39]:
#Add 1 to every value in df_5
#df_5.transform(lambda x: x+1) #Adds 1 to every value 
#df_5.transform(lambda x: x**2) #squares every value 
#df_5.transform(lambda x: np.sqrt(x)) #Takes the square root of every value

#Can also transform using multiple different functions
df_5.transform([lambda x: x**2, lambda x: x**3])


Unnamed: 0_level_0,A,B
Unnamed: 0_level_1,<lambda>,<lambda>
0,0,1
1,1,8
2,8,27


In [40]:
#Passing a dictionary is going to allow you to different calculations on different columns::
df_5.transform({'A': lambda x: x**2, 'B': lambda x: x**3})

Unnamed: 0,A,B
0,0,1
1,1,8
2,4,27


In [41]:
#Map allows you to perform a similar function on a series 
df_5['A'].map(lambda x: x**2)

0    0
1    1
2    4
Name: A, dtype: int64

In [42]:
#applymap is going to be able to do the same thing for a dataframe
df_5.applymap(lambda x: x**2)

Unnamed: 0,A,B
0,0,1
1,1,4
2,4,9


In [43]:
#Can get unique values, lets try column 2
df_2['two'].unique()
df_2['two'].nunique() #Gives total number of unique values
df_2['two'].value_counts()#Can also find out the number of times a value occurs in a column
df_2.columns #can also ask for column names
df_2.index #Gives index information
df_2.isnull #return a dataframe that returns any null values as True


<bound method DataFrame.isnull of    one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  0.0    4>

# Group Data

In [71]:
dict_5 = {'Planet': [1,2,3,4,5,6,7,8,9], 'Number of Moons': [0,0,1,2,79,82,27,14,5]}

df_11 = pd.DataFrame(dict_5)
#Grouping data by the number of moons per planet
by_store = df_11.groupby('Number of Moons')
by_store.mean()
by_store.sum().loc[1]#moon total for planet by index
by_store.describe() #gets a whole bunch of data as seen below

Unnamed: 0_level_0,Planet,Planet,Planet,Planet,Planet,Planet,Planet,Planet
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Number of Moons,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0
1,1.0,3.0,,3.0,3.0,3.0,3.0,3.0
2,1.0,4.0,,4.0,4.0,4.0,4.0,4.0
5,1.0,9.0,,9.0,9.0,9.0,9.0,9.0
14,1.0,8.0,,8.0,8.0,8.0,8.0,8.0
27,1.0,7.0,,7.0,7.0,7.0,7.0,7.0
79,1.0,5.0,,5.0,5.0,5.0,5.0,5.0
82,1.0,6.0,,6.0,6.0,6.0,6.0,6.0


# Concatenate, Merge, and Join Data

In [45]:
df_12 = pd.DataFrame({'A': [1,2,3],
                     'B': [4,5,6]},
                     index = [1,2,3])
df_13 = pd.DataFrame({'A': [7,8,9],
                     'B': [10,11,12]},
                     index = [4,5,6])
df_12

Unnamed: 0,A,B
1,1,4
2,2,5
3,3,6


In [46]:
df_13

Unnamed: 0,A,B
4,7,10
5,8,11
6,9,12


In [47]:
#Concatenating these together
pd.concat([df_12, df_13])

Unnamed: 0,A,B
1,1,4
2,2,5
3,3,6
4,7,10
5,8,11
6,9,12


In [48]:
df_12 = pd.DataFrame({'A': [1,2,3],
                     'B': [4,5,6],
                     'key' : [1,2,3]})
df_13 = pd.DataFrame({'A': [7,8,9],
                     'B': [10,11,12],
                     'key' : [1,2,3]})
#merging two dataframes when their key column is the same

pd.merge(df_12, df_13, how = 'right', on = 'key') #inner merges at the intersection of the join
#if how = left or right, then you're saying you want the keys from the left or right frame
#outer merges by the union of the keys

Unnamed: 0,A_x,B_x,key,A_y,B_y
0,1,4,1,7,10
1,2,5,2,8,11
2,3,6,3,9,12


In [49]:
#How to join two data frames with different indices, instead of keys you use columns
df_12 = pd.DataFrame({'A': [1,2,3],
                     'B': [4,5,6]},
                     index = [1,2,3])
df_13 = pd.DataFrame({'C': [7,8,9],
                     'D': [10,11,12]},
                     index =  [1,4,5])
df_12.join(df_13, how = 'outer') #You'll get nan values wherever they do not line up

Unnamed: 0,A,B,C,D
1,1.0,4.0,7.0,10.0
2,2.0,5.0,,
3,3.0,6.0,,
4,,,8.0,11.0
5,,,9.0,12.0


# Statistics

In [54]:
#This is just a spreadsheet about the planets in our solar system, including PLuto, regarind the
#length of their day in hours and the length of their year in Earth-days
planets_df = pd.read_csv('dayvsyear.csv')
planets_df

Unnamed: 0,Day (hours),Year (days)
0,4222.6,88.0
1,2802.0,224.7
2,24.0,365.2
3,24.7,687.0
4,9.9,4331.0
5,10.7,10747.0
6,17.2,30589.0
7,16.1,59800.0
8,153.3,90560.0


In [58]:
planets_df.count()# Get count of both columns
planets_df.sum(skipna=True)#skips nan values if there are any

planets_df["Year (days)"].mean()
planets_df["Year (days)"].median()
planets_df["Year (days)"].mode()
planets_df["Year (days)"].min()
planets_df["Year (days)"].max()
planets_df["Year (days)"].prod() #Product of values
planets_df["Year (days)"].std() #Standard deviation
planets_df["Year (days)"].var() #Variance 
planets_df["Year (days)"].sem() #Standard error
print("Mean days in a year: ", planets_df["Year (days)"].mean())
print("Median days in a year: ", planets_df["Year (days)"].median())
print("Mode days in a year: ", planets_df["Year (days)"].mode())
print("Min days in a year: " , planets_df["Year (days)"].min())
print("Max days in a year: ", planets_df["Year (days)"].max())
print("Product of all values in Year (days): ", planets_df["Year (days)"].prod())
print("Standard Deviation: ", planets_df["Year (days)"].std()) 
print("Variance: ", planets_df["Year (days)"].var())
print("Standard Error: ", planets_df["Year (days)"].sem()) 
planets_df["Year (days)"].skew() #If negative, there exists a long tail towards the left; if positive then right.

planets_df["Year (days)"].kurt() #Tells you how many outliers you have in your data
# less than three: low, exactly 3: normal, greater than 3: high

planets_df["Year (days)"].quantile(.25) #Need to specify which quantile you want, this is the 25% quantile
planets_df["Year (days)"].cumsum() #Cumulative sum of your data
planets_df["Year (days)"].cumprod() #Cumulative product, overloads so I don't reccomend printing it
planets_df["Year (days)"].cummax() #Cumulative max
planets_df["Year (days)"].cummin() #Cumulative min

Mean days in a year:  21932.433333333334
Median days in a year:  4331.0
Mode days in a year:  0       88.0
1      224.7
2      365.2
3      687.0
4     4331.0
5    10747.0
6    30589.0
7    59800.0
8    90560.0
dtype: float64
Min days in a year:  88.0
Max days in a year:  90560.0
Product of all values in Year (days):  3.825177216887456e+31
Standard Deviation:  32631.416950999846
Variance:  1064809372.23
Standard Error:  10877.138983666615


0    88.0
1    88.0
2    88.0
3    88.0
4    88.0
5    88.0
6    88.0
7    88.0
8    88.0
Name: Year (days), dtype: float64