In [1]:
import pandas as pd

In [2]:
# Load the EU Superstore sample from its CSV export into a DataFrame.
superstore = pd.read_csv(filepath_or_buffer="Sample - EU Superstore.csv")
# Load the orders workbook (Excel) into a second DataFrame.
orders = pd.read_excel(io="orders.xlsx")

In [None]:
display(superstore.head())
superstore.tail()

In [None]:
superstore

# Slicing

In [None]:
# shows the first 5 rows of the table
superstore.head()

In [None]:
#shows the last 10 rows of the table
superstore.tail(10)

In [None]:
# `columns` is an iterable Index object holding the column labels.
# A bare expression in the middle of a cell is evaluated and then thrown away
# (only the last expression of a cell is rendered), so display() is used here.
display(superstore.columns)
display(list(superstore.columns))

for col in superstore.columns:
    # do something with a column name
    print(col)


In [None]:
%%time
# this returns an iterable index object
superstore.index

for rownum in superstore.index:
    # do something with a row
    print(rownum)

# # not the best way to iterate through all the rows. 

In [None]:
superstore.head()

In [None]:
%%time
# vector operations are the fastest way to calculate someting for each row

superstore['unit price'] = superstore['Sales']/superstore['Quantity']

# other alternatives are:
# df['col'] = list(map(func, df['col']))
# df['col'] = df['col'].apply(func)
# for row in df.iterrows():
# for row in df.itertuples():


In [None]:
superstore.head()

In [None]:
# how can I slice one or more columns

# slicing one column
superstore["City"]

In [None]:
#Slicing multiple columns

superstore[['City', "Country",'Region']]
# also useful for reordering columns: superstore = superstore[['new column 1', 'new column 2', ..., 'new column last']]

In [None]:
# Slicing one column as a DataFrame instead of a Series
superstore[["City"]]

# Basic exploration & computation

In [None]:
# Some useful column (Series) methods.
# Every result except the last is wrapped in display(): only the final bare
# expression of a cell is rendered, the others would be computed and dropped.
display(superstore['Country'].nunique())                    # COUNT(DISTINCT Country)
display(superstore['Country'].unique())                     # DISTINCT Country
display(superstore['Country'].value_counts())               # COUNT(*) + GROUP BY Country
display(superstore['Country'].value_counts(dropna=False))   # same, with missing values counted too
superstore['Country'].isna().sum()                          # number of missing values


In [None]:
# Some useful column aggregations

# The sale with the highest value?
# display() is required because this is not the last expression of the cell.
display(superstore['Sales'].max())       # SELECT MAX(Sales)

# What are the average sales and profit values?
superstore[['Sales', 'Profit']].mean()   # SELECT AVG(Sales), AVG(Profit)


In [None]:
#on a Dataframe, a column aggregation function applies to all columns
superstore.max()

In [None]:
# ...but an average only makes sense for numeric columns.
# Since pandas 2.0, DataFrame.mean() no longer skips non-numeric columns on
# its own and raises a TypeError on the text columns, so the restriction is
# made explicit.
superstore.mean(numeric_only=True)


In [None]:
# Arithmetic between columns is element-wise and produces a new Series.
# display() is used for the intermediate results because only the last
# expression of a cell is rendered automatically.
display(superstore['Sales'] / superstore['Quantity'])

# The result can be stored in a new column
superstore['Unitary price'] = superstore['Sales'] / superstore['Quantity']
display(superstore[['Unitary price', 'Sales', 'Quantity']].head())

# Comparisons work the same way and produce a True/False column
superstore['Large sale'] = superstore['Sales'] > 100
superstore[['Large sale', 'Sales']].head()


# Filtering & boolean masks

In [None]:
# A column like the one above (only True/False) is called a boolean mask.
# Boolean masks are very useful for filtering the rows of a DataFrame, and the
# condition does not have to be stored as a column first.
display(superstore['Sales'] > 100)             # the mask itself (would be discarded without display)
superstore[superstore['Sales'] > 100].head()   # WHERE Sales > 100 LIMIT 5

In [None]:
# do it yourself
# find the superstore row with the maximum sales








In [None]:
#solution
superstore[ superstore['Sales'] == superstore['Sales'].max() ]


In [None]:
# Boolean masks can be combined with boolean operators (and, or, not),
# which pandas spells element-wise as &, | and ~.

superstore['Large Sales'] = superstore['Sales'] > superstore['Sales'].mean()
superstore['Large Quantity'] = superstore['Quantity'] > 5
superstore['Salesmean'] = superstore['Sales'].mean()

# display() because a bare expression in the middle of a cell is never rendered
display(superstore[['Sales', 'Quantity', 'Salesmean', 'Large Sales', 'Large Quantity']].head())

# Combine the two masks stored above rather than recomputing both comparisons
# (and the mean) for every combination.
superstore['Large Quantity and Sales'] = superstore['Large Quantity'] & superstore['Large Sales']   # & : AND
superstore['Large Quantity or Sales'] = superstore['Large Quantity'] | superstore['Large Sales']    # | : OR
superstore['not Large Quantity'] = ~superstore['Large Quantity']                                    # ~ : NOT

superstore[[
    'Sales', 'Quantity', 'Salesmean',
    'Large Sales', 'Large Quantity',
    'Large Quantity and Sales', 'Large Quantity or Sales', 'not Large Quantity',
]].head(10)


In [None]:
# Masks, and combinations of masks, can be used directly as row filters.
# The first two results are wrapped in display() because only the last
# expression of a cell is rendered automatically.
display(superstore[superstore['City'] == 'Lisbon'])

# Select rows and columns in a single .loc lookup instead of chaining two [] lookups
display(superstore.loc[superstore['City'] == 'Lisbon', ['Row ID', 'Ship Date', 'Segment']])

superstore[(superstore['City'] == 'Lisbon') & (superstore['Segment'] == 'Corporate')]

In [None]:
# interview question -> which condition typically returns more rows: an AND condition or an OR condition?

# Binning

In [None]:
# changing a numeric variable into a categorical one ('low', 'medium', 'high')
# how to perform binning -> bins on a variable

superstore['Sales']

In [None]:
# Cut Sales into 5 equally spaced (equal-width) bins
binnames = ["Very Low", "Low", "Moderate", "High", "Very High"]
bins = pd.cut(superstore['Sales'], 5, labels=binnames)
display(bins)  # a bare `bins` here would be evaluated and discarded

# Reuse the Series computed above instead of running the identical cut a second time
superstore['BinnedSales'] = bins
superstore['BinnedSales'].value_counts()


In [None]:
# Check the equal-width binning by hand.
# The bounds are derived from the data rather than typed in as literals, so the
# check stays valid if the file changes.
sales_min = superstore['Sales'].min()
sales_max = superstore['Sales'].max()
bin_width = (sales_max - sales_min) / 5   # 5 = number of bins passed to pd.cut
print(sales_min)
display(sales_max)
print('size of bins = ', bin_width)
# pd.cut intervals are closed on the right by default, so the first bin holds
# every sale up to and including min + width.
display(superstore[superstore['Sales'] <= sales_min + bin_width].shape)   # rows in the lowest bin
display(superstore[superstore['Sales'] > sales_max - bin_width].shape)    # rows in the highest bin

In [None]:
import numpy as np
# qcut builds 5 bins holding (roughly) the same number of rows.
# Why are the counts still unbalanced? Because of repeated values.
qbins = pd.qcut(superstore['Sales'], q=5, labels=binnames)
qbins.value_counts()

# To investigate the imbalance:
# np.quantile(superstore['Sales'],0.20)
# superstore[superstore['Sales']==41.64]

In [None]:
# Cut into 5 hand-designed bins by passing the bin edges explicitly
bins = pd.cut(superstore['Sales'], bins=[0, 10, 100, 1000, 2000, 8000], labels=binnames)
# dropna=False also counts the rows that did not fall into any bin
bins.value_counts(dropna=False)

In [None]:
%matplotlib inline
superstore['Sales'].hist(bins=100)

In [None]:
# do it yourself: create 3 equally spaced bins for profit
# do it yourself: create one bin for negative profit and 3 equally sized/spaced bins for positive profits

# Row operations - Apply

In [None]:
# Combining columns already gives us one computed value per row
superstore['profit ratio'] = superstore['Profit'].div(superstore['Sales'])
superstore.head()

In [None]:
# what if I want to perform more complicated operations on the rows?
# I want to calculate if a row is not profitable (profit < 0), profitable (0 <= profit < 100) or very profitable (profit >= 100)

# define a function that creates your logic
# the input will be the value of the profit of each row

def profitable(x):
    """Classify a single profit value into a profitability label.

    Parameters
    ----------
    x : number
        Profit of one row.

    Returns
    -------
    str
        'Not Profitable' when x < 0, 'Profitable' when 0 <= x < 100 and
        'Very Profitable' when x >= 100.

    Notes
    -----
    A NaN profit fails both comparisons and therefore falls through to
    'Very Profitable'.
    """
    # Strict comparison: a profit of exactly 0 belongs to the
    # 0 <= profit < 100 band, not to the loss-making one.
    if x < 0:
        return 'Not Profitable'
    if x < 100:
        return 'Profitable'
    return 'Very Profitable'

profitable(110)

In [None]:
# Plain Python can already do this: call the function once per profit value
superstore['Profit-Flag'] = [profitable(profit) for profit in superstore['Profit']]
superstore.head()

In [None]:
#but we can use a more general method: apply
superstore['Profit-Flag2'] = superstore['Profit'].apply(profitable)
superstore.head()

# the apply method calls the given function once for each value of the column (one value per row) and collects the results

In [None]:
# What if I want a function that uses data from multiple columns?
# For example I want to see if the delay between shipping and ordering is larger than 5 days and flag the order as 'delayed' in that case

# use a function that take a whole row as an argument

#this is a good time to learn about dates and timestamps. You can find some info here:
#https://docs.python.org/3/library/datetime.html
from datetime import datetime
from datetime import timedelta

def delayed(row, max_days=5, date_format='%d/%m/%Y'):
    """Flag an order as 'Delayed' or 'On-time' from its shipping lag.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'Ship Date' and 'Order Date', e.g. the row that
        ``DataFrame.apply(..., axis=1)`` passes in. Both values must be date
        strings.
    max_days : int, default 5
        Longest lag, in days, that still counts as on time.
    date_format : str, default '%d/%m/%Y'
        ``datetime.strptime`` format shared by both date strings.

    Returns
    -------
    str
        'Delayed' when the lag is strictly greater than ``max_days`` days,
        'On-time' otherwise.

    Raises
    ------
    ValueError
        If a date string does not match ``date_format``.
    """
    # Convert the date strings into datetime objects so they can be subtracted
    shipped = datetime.strptime(row['Ship Date'], date_format)
    ordered = datetime.strptime(row['Order Date'], date_format)
    lag = shipped - ordered  # difference of two datetimes is a timedelta

    # timedelta(days=n) represents a period of n days
    return 'Delayed' if lag > timedelta(days=max_days) else 'On-time'

In [None]:
# to run your function over all rows, you have to pass along the argument "axis=1"
superstore['Delayed?'] = superstore.apply(delayed, axis=1)
superstore.head()

In [None]:
superstore['Delayed?'].value_counts()

In [None]:
superstore['Ship Mode'].value_counts()

In [None]:
#do it yourself: create a function that categorizes an order as "Priority 1" if 
# - the Segment is Corporate and ship mode is First or Second Class
# - the ship mode is First Class
#and "Priority 2" otherwise 

# Merging

In [None]:
# Merging is the way to combine information from multiple datasources.
#It is functionally the same as JOIN from SQL, even in the names of the methods
returns = pd.read_excel("Sample - EU Superstore.xlsx", sheet_name = "Returns")

In [None]:
returns

In [None]:
# SELECT *
# FROM superstore
# JOIN returns ON superstore."Order ID" = returns."Order ID"

In [None]:
superstore.head()

In [None]:
# If we want details only on the returned orders, keep just the rows whose
# Order ID is present in both tables (SQL: INNER JOIN).
# The key has the same name on both sides, so a single `on` is enough.
just_returned_details = pd.merge(superstore, returns, how='inner', on="Order ID")
# display() is needed: a bare expression mid-cell is evaluated and discarded.
display(just_returned_details.head())

# Shape of the order columns plus the 'Returned' column
# (presumably contributed by the returns sheet - verify against the workbook).
just_returned_details[[
    'Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
    'Customer ID', 'Customer Name', 'Segment', 'City', 'State',
    'Country', 'Region', 'Product ID', 'Category', 'Sub-Category',
    'Product Name', 'Sales', 'Quantity', 'Discount', 'Profit',
    'Returned',
]].shape

In [None]:
# SELECT *
# FROM superstore
# LEFT JOIN returns ON returns."Order ID" = superstore."Order ID"

In [None]:
# A left merge keeps every order and only flags the ones that were returned
just_returned_addition = pd.merge(
    superstore,
    returns,
    how='left',
    left_on="Order ID",
    right_on="Order ID",
)
display(just_returned_addition.shape)

# Preview the order columns together with the 'Returned' column
just_returned_addition[[
    'Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
    'Customer ID', 'Customer Name', 'Segment', 'City', 'State',
    'Country', 'Region', 'Product ID', 'Category', 'Sub-Category',
    'Product Name', 'Sales', 'Quantity', 'Discount', 'Profit',
    'Returned',
]].head()

In [None]:
just_returned_addition.Returned.value_counts(dropna=False)

In [None]:
just_returned_addition.Returned