# Pandas
Pandas is a package for storing and managing data.

Pandas create two new data types for storing data:

    - Dataframe -> Just like an excel spreadsheet of rows and columns, with each column having it's own data-type and can be named.
    - Series -> Each column of a data-frame.

Pandas has SQL functions like join, merge, sort etc.

In [1]:
import numpy as np # importing for doing maths
import pandas as pd # importing pandas

In [2]:
# SERIES
# create by converting a list or a numpy array
myList = [5.4,6.1,7,99.9]
myArray = np.array(myList)

In [8]:
mySeriesFromList = pd.Series(data=myList,name='from List')
print(mySeriesFromList)
mySeriesFromArray = pd.Series(data=myArray,name='from Array')
print(mySeriesFromArray)

0     5.4
1     6.1
2     7.0
3    99.9
Name: from List, dtype: float64
0     5.4
1     6.1
2     7.0
3    99.9
Name: from Array, dtype: float64


In [10]:
# access elements on index
print(mySeriesFromArray[2])

7.0


In [20]:
# labels to the entries of series -- should be an iterable of smae size
my_labels = ['first','second','third','fourth']
series_with_labels = pd.Series(data=myList,index=my_labels,name='series with labels')
print(series_with_labels)

first      5.4
second     6.1
third      7.0
fourth    99.9
Name: series with labels, dtype: float64


In [21]:
# we can skip passing parameter names, but then they need to be in the order as specified in function
my_series = pd.Series(myList,my_labels)
print(my_series)

first      5.4
second     6.1
third      7.0
fourth    99.9
dtype: float64


In [23]:
# access using labels
print(series_with_labels['second'])

6.1


In [30]:
# math on series
my_series2 = pd.Series(data=[5.5,1.1,8.8,1.6],index=['first','third','fourth','fifth'])
print(my_series2)
print()
'''
Adds elements together having same index/label.
thus, elements not having not same index give NaN(Not a number)
'''
print(my_series + my_series2) 

first     5.5
third     1.1
fourth    8.8
fifth     1.6
dtype: float64

fifth       NaN
first      10.9
fourth    108.7
second      NaN
third       8.1
dtype: float64


In [37]:
# creating a Dataframe by combining Series
# axis=1 ->combine along columns -- places side-by-side
df1 = pd.concat([my_series,my_series2],axis=1,sort=False)
df1

Unnamed: 0,0,1
first,5.4,5.5
second,6.1,
third,7.0,1.1
fourth,99.9,8.8
fifth,,1.6


In [40]:
# sort=True -> sort alphabetically on index/label
df2 = pd.concat([my_series,my_series2],axis=1,sort=True)
df2

Unnamed: 0,0,1
fifth,,1.6
first,5.4,5.5
fourth,99.9,8.8
second,6.1,
third,7.0,1.1


In [103]:
# axis=0 -> combines along rows/index -- stacks on top
series_axis0 = pd.concat([my_series,my_series2],axis=0,sort=False)
series_axis0

first      5.4
second     6.1
third      7.0
fourth    99.9
first      5.5
third      1.1
fourth     8.8
fifth      1.6
dtype: float64

In [42]:
# create a dataframe with default index/row and column labels
df3 = pd.DataFrame(data=np.random.randn(5,5))
df3

Unnamed: 0,0,1,2,3,4
0,1.695146,-2.087588,-0.815378,1.061404,-0.271869
1,0.531227,0.105782,0.761326,-0.755452,0.769386
2,0.779376,0.355605,-0.018314,0.814914,0.933566
3,1.575359,1.179072,2.053695,0.055586,-1.876485
4,-0.297418,1.038913,-3.039022,0.841758,0.73631


In [3]:
df4 = pd.DataFrame(data=np.random.randn(5,5),
                   index=['first row','second row','third row','fourth row','fifth row'],
                  columns=['first*','second col','third col','fourth col','fifth col'])
df4

Unnamed: 0,first*,second col,third col,fourth col,fifth col
first row,1.262188,0.969096,-1.136827,0.545876,-0.554342
second row,-0.20448,-0.120206,-0.109616,-0.327276,1.361297
third row,0.172702,1.565013,-0.792202,-1.042377,-1.057418
fourth row,-1.046654,-0.274334,-0.16336,0.119762,-0.018483
fifth row,-0.917799,-0.646684,0.083133,0.369725,-0.73523


In [5]:
df4.first*

SyntaxError: invalid syntax (<ipython-input-5-4ebf476172c6>, line 1)

In [109]:
# create dataframes using dictionary syntax
'''
pd.DataFrame({'col_name_1':['row value1','row value2'],
'col_name_2':['row value1','row value2']})
'''
df7 = pd.DataFrame(
{
   'customer_id':[1,2,3],
    1:['a','b','c']
    
}
)
df7

Unnamed: 0,customer_id,1
0,1,a
1,2,b
2,3,c


In [61]:
# access series in a dataframe
# using this method we can only use columns to access, because dataframe is list of series
print(df4['first col'])  # --> series
print()

first row    -1.339089
second row    1.927943
third row    -0.930330
fourth row    0.219155
fifth row     0.599705
Name: first col, dtype: float64



Unnamed: 0,third col,second col
first row,-0.771253,0.376734
second row,-3.20236,0.67627
third row,0.355698,0.410228
fourth row,1.238794,-0.413323
fifth row,-0.691377,-1.070305


In [207]:
# for multiple columns use array of columns
df4[['third col','second col']]  # --> multiple series = dataframe

Unnamed: 0,third col,second col
first row,1.148445,0.642563
second row,0.320497,-1.988576
third row,-1.143864,0.489975
fourth row,-0.559088,-0.52552
fifth row,0.157592,-0.237828


In [68]:
# access rows of dataframe
df4.loc['fourth row']  # location 

first col     0.219155
second col   -0.413323
third col     1.238794
fourth col    1.955222
fifth col    -1.454954
Name: fourth row, dtype: float64

In [69]:
# access rows with index location -- because index are saved still after labelling
df4.iloc[2]

first col    -0.930330
second col    0.410228
third col     0.355698
fourth col    0.641310
fifth col     1.283334
Name: third row, dtype: float64

In [73]:
# accessing specific rows and columns
# .loc[[index],[columns]]
df4.loc[['fourth row','first row'],['second col','fourth col']]

Unnamed: 0,second col,fourth col
fourth row,-0.413323,1.955222
first row,0.376734,-1.565313


In [77]:
# using logicals to access
df4>0

Unnamed: 0,first col,second col,third col,fourth col,fifth col
first row,False,True,False,False,False
second row,True,True,False,False,False
third row,False,True,True,True,True
fourth row,True,False,True,True,False
fifth row,True,False,False,False,True


In [78]:
# structure remains of data frame, but for False gives NaN
df4[df4>0]

Unnamed: 0,first col,second col,third col,fourth col,fifth col
first row,,0.376734,,,
second row,1.927943,0.67627,,,
third row,,0.410228,0.355698,0.64131,1.283334
fourth row,0.219155,,1.238794,1.955222,
fifth row,0.599705,,,,0.642124


In [82]:
# add columns to dataframe or else edit previous ones if exist
df4['new col'] = np.random.randn(5,1)
df4

Unnamed: 0,first col,second col,third col,fourth col,fifth col,new col
first row,-1.339089,0.376734,-0.771253,-1.565313,-1.49805,1.059146
second row,1.927943,0.67627,-3.20236,-0.061329,-0.272226,-0.523248
third row,-0.93033,0.410228,0.355698,0.64131,1.283334,-0.949554
fourth row,0.219155,-0.413323,1.238794,1.955222,-1.454954,0.095871
fifth row,0.599705,-1.070305,-0.691377,-0.624884,0.642124,0.047295


In [86]:
# drop a row --> axis=0
df4.drop('first row',axis=0)

Unnamed: 0,first col,second col,third col,fourth col,fifth col,new col
second row,1.927943,0.67627,-3.20236,-0.061329,-0.272226,-0.523248
third row,-0.93033,0.410228,0.355698,0.64131,1.283334,-0.949554
fourth row,0.219155,-0.413323,1.238794,1.955222,-1.454954,0.095871
fifth row,0.599705,-1.070305,-0.691377,-0.624884,0.642124,0.047295


In [7]:
# drop a column --> axis=1
df4.drop(['second col','fifth col'],axis=0)

KeyError: "['second col' 'fifth col'] not found in axis"

In [90]:
df4
'''
Hence, drop returns a new dataframe without altering the original dataframe.
Thus to store we can:
df5= df4.drop()
or
df4.drop(inplace=True)
'''

Unnamed: 0,first col,second col,third col,fourth col,fifth col,new col
first row,-1.339089,0.376734,-0.771253,-1.565313,-1.49805,1.059146
second row,1.927943,0.67627,-3.20236,-0.061329,-0.272226,-0.523248
third row,-0.93033,0.410228,0.355698,0.64131,1.283334,-0.949554
fourth row,0.219155,-0.413323,1.238794,1.955222,-1.454954,0.095871
fifth row,0.599705,-1.070305,-0.691377,-0.624884,0.642124,0.047295


In [91]:
df5 = df4.drop('first row',axis=0)
df5

Unnamed: 0,first col,second col,third col,fourth col,fifth col,new col
second row,1.927943,0.67627,-3.20236,-0.061329,-0.272226,-0.523248
third row,-0.93033,0.410228,0.355698,0.64131,1.283334,-0.949554
fourth row,0.219155,-0.413323,1.238794,1.955222,-1.454954,0.095871
fifth row,0.599705,-1.070305,-0.691377,-0.624884,0.642124,0.047295


In [92]:
df6=df4.drop('new col',axis=1)
df6

Unnamed: 0,first col,second col,third col,fourth col,fifth col
first row,-1.339089,0.376734,-0.771253,-1.565313,-1.49805
second row,1.927943,0.67627,-3.20236,-0.061329,-0.272226
third row,-0.93033,0.410228,0.355698,0.64131,1.283334
fourth row,0.219155,-0.413323,1.238794,1.955222,-1.454954
fifth row,0.599705,-1.070305,-0.691377,-0.624884,0.642124


In [93]:
# removing dataframe index labels
'''
Removes our index labels but creates a new series/column as 'index'
with all index labels.
'''
df5.reset_index()

Unnamed: 0,index,first col,second col,third col,fourth col,fifth col,new col
0,second row,1.927943,0.67627,-3.20236,-0.061329,-0.272226,-0.523248
1,third row,-0.93033,0.410228,0.355698,0.64131,1.283334,-0.949554
2,fourth row,0.219155,-0.413323,1.238794,1.955222,-1.454954,0.095871
3,fifth row,0.599705,-1.070305,-0.691377,-0.624884,0.642124,0.047295


In [94]:
df5
'''
returns a new dataframe, hence original is not altered.
we can do df5=df5.reset_index()
or inplace=True for saving the result in same dataframe
'''

Unnamed: 0,first col,second col,third col,fourth col,fifth col,new col
second row,1.927943,0.67627,-3.20236,-0.061329,-0.272226,-0.523248
third row,-0.93033,0.410228,0.355698,0.64131,1.283334,-0.949554
fourth row,0.219155,-0.413323,1.238794,1.955222,-1.454954,0.095871
fifth row,0.599705,-1.070305,-0.691377,-0.624884,0.642124,0.047295


In [95]:
# inplace=True
df5.reset_index(inplace=True)

In [96]:
df5

Unnamed: 0,index,first col,second col,third col,fourth col,fifth col,new col
0,second row,1.927943,0.67627,-3.20236,-0.061329,-0.272226,-0.523248
1,third row,-0.93033,0.410228,0.355698,0.64131,1.283334,-0.949554
2,fourth row,0.219155,-0.413323,1.238794,1.955222,-1.454954,0.095871
3,fifth row,0.599705,-1.070305,-0.691377,-0.624884,0.642124,0.047295


In [101]:
# assigning new names to index
# Step-1: to add a new series
df5['new index']=['This','is','the','row']
df5

Unnamed: 0_level_0,index,first col,second col,third col,fourth col,fifth col,new col,new index
new index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
This,second row,1.927943,0.67627,-3.20236,-0.061329,-0.272226,-0.523248,This
is,third row,-0.93033,0.410228,0.355698,0.64131,1.283334,-0.949554,is
the,fourth row,0.219155,-0.413323,1.238794,1.955222,-1.454954,0.095871,the
row,fifth row,0.599705,-1.070305,-0.691377,-0.624884,0.642124,0.047295,row


In [99]:
# Step-2: to set new series as index
df5.set_index('new index',inplace=True)
df5

Unnamed: 0_level_0,index,first col,second col,third col,fourth col,fifth col,new col
new index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
This,second row,1.927943,0.67627,-3.20236,-0.061329,-0.272226,-0.523248
is,third row,-0.93033,0.410228,0.355698,0.64131,1.283334,-0.949554
the,fourth row,0.219155,-0.413323,1.238794,1.955222,-1.454954,0.095871
row,fifth row,0.599705,-1.070305,-0.691377,-0.624884,0.642124,0.047295


# Combining data frames
Dataframes in pandas are combined in similar way to SQL:
1. concat
2. join
3. merge

In [110]:
df8 = pd.DataFrame(
{
    'customer':['101','102','103','104'],
    'category':['cat2','cat2','cat1','cat3'],
    'important':['yes','no','yes','yes'],
    'sales':[123,52,214,663]
}
)
df8

Unnamed: 0,customer,category,important,sales
0,101,cat2,yes,123
1,102,cat2,no,52
2,103,cat1,yes,214
3,104,cat3,yes,663


In [129]:
df9 = pd.DataFrame(
{
    'customer':['101','103','104','105'],
    'color':['yellow','green','green','blue'],
    'distance':[12,9,44,21],
    'sales':[123,214,663,331]
}
)
df9

Unnamed: 0,customer,color,distance,sales
0,101,yellow,12,123
1,103,green,9,214
2,104,green,44,663
3,105,blue,21,331


In [120]:
# CONCAT
'''
Stacks records/rows on top of each other and uses common columns from both, but if 
column is absent in another, adds it and assigns NaN
'''
# by rows --> axis=0
pd.concat([df8,df9],axis=0,sort=False) # doesn't sort column alphabetically

Unnamed: 0,customer,category,important,sales,color,distance
0,101,cat2,yes,123,,
1,102,cat2,no,52,,
2,103,cat1,yes,214,,
3,104,cat3,yes,663,,
0,101,,,123,yellow,12.0
1,103,,,214,green,9.0
2,104,,,663,green,44.0
3,105,,,331,blue,21.0


In [121]:
# by rows --> axis=0
pd.concat([df8,df9],axis=0,sort=True) # sort column alphabetically

Unnamed: 0,category,color,customer,distance,important,sales
0,cat2,,101,,yes,123
1,cat2,,102,,no,52
2,cat1,,103,,yes,214
3,cat3,,104,,yes,663
0,,yellow,101,12.0,,123
1,,green,103,9.0,,214
2,,green,104,44.0,,663
3,,blue,105,21.0,,331


In [130]:
# by columns --> axis=1
'''
Just puts dataframes next to each other, but if we specify index too then they become individual rows.
'''
pd.concat([df8,df9],axis=1,sort=False) # doesn't sort column alphabetically

Unnamed: 0,customer,category,important,sales,customer.1,color,distance,sales.1
0,101,cat2,yes,123,101,yellow,12,123
1,102,cat2,no,52,103,green,9,214
2,103,cat1,yes,214,104,green,44,663
3,104,cat3,yes,663,105,blue,21,331


In [131]:
df10 = pd.DataFrame(
{
    'customer':['101','102','103','104'],
    'category':['cat2','cat2','cat1','cat3'],
    'important':['yes','no','yes','yes'],
    'sales':[123,52,214,663]
},index=[0,1,2,3]
)

In [138]:
df11 = pd.DataFrame(
{
    'customer':['101','103','104','105'],
    'color':['yellow','green','green','blue'],
    'distance':[12,9,44,21],
    'sales':[123,214,663,331]
},index=[4,5,6,7]
)

In [135]:
pd.concat([df10,df11],axis=0,sort=False)

Unnamed: 0,customer,category,important,sales,color,distance
0,101,cat2,yes,123,,
1,102,cat2,no,52,,
2,103,cat1,yes,214,,
3,104,cat3,yes,663,,
4,101,,,123,yellow,12.0
5,103,,,214,green,9.0
6,104,,,663,green,44.0
7,105,,,331,blue,21.0


In [133]:
pd.concat([df10,df11],axis=1,sort=False)
# see NaN

Unnamed: 0,customer,category,important,sales,customer.1,color,distance,sales.1
0,101.0,cat2,yes,123.0,,,,
1,102.0,cat2,no,52.0,,,,
2,103.0,cat1,yes,214.0,,,,
3,104.0,cat3,yes,663.0,,,,
4,,,,,101.0,yellow,12.0,123.0
5,,,,,103.0,green,9.0,214.0
6,,,,,104.0,green,44.0,663.0
7,,,,,105.0,blue,21.0,331.0


# Merge and Join
Merge combines dataframes using column's values to identify common entries.

Join combines dataframes using index' values to identify common entries.


In [139]:
# MERGE
# union on `customer`
pd.merge(df10,df11,how='outer',on='customer')

Unnamed: 0,customer,category,important,sales_x,color,distance,sales_y
0,101,cat2,yes,123.0,yellow,12.0,123.0
1,102,cat2,no,52.0,,,
2,103,cat1,yes,214.0,green,9.0,214.0
3,104,cat3,yes,663.0,green,44.0,663.0
4,105,,,,blue,21.0,331.0


In [140]:
# inner on `customer` -- intersection
pd.merge(df10,df11,how='inner',on='customer')

Unnamed: 0,customer,category,important,sales_x,color,distance,sales_y
0,101,cat2,yes,123,yellow,12,123
1,103,cat1,yes,214,green,9,214
2,104,cat3,yes,663,green,44,663


In [141]:
# left on `customer` -- common and left
pd.merge(df10,df11,how='left',on='customer')

Unnamed: 0,customer,category,important,sales_x,color,distance,sales_y
0,101,cat2,yes,123,yellow,12.0,123.0
1,102,cat2,no,52,,,
2,103,cat1,yes,214,green,9.0,214.0
3,104,cat3,yes,663,green,44.0,663.0


In [142]:
# right on `customer` -- common and right
pd.merge(df10,df11,how='right',on='customer')

Unnamed: 0,customer,category,important,sales_x,color,distance,sales_y
0,101,cat2,yes,123.0,yellow,12,123
1,103,cat1,yes,214.0,green,9,214
2,104,cat3,yes,663.0,green,44,663
3,105,,,,blue,21,331


In [144]:
# JOIN
df12 = pd.DataFrame({
    'Q1':[101,102,103],
    'Q2':[201,202,203]
},index=['r0','r1','r2'])
df12

Unnamed: 0,Q1,Q2
r0,101,201
r1,102,202
r2,103,203


In [151]:
df13 = pd.DataFrame({
    'Q3':[301,302,303],
    'Q4':[401,402,403]
},index=['r0','r2','r3'])
df13

Unnamed: 0,Q3,Q4
r0,301,401
r2,302,402
r3,303,403


In [152]:
# outer
df12.join(df13,how='outer')

Unnamed: 0,Q1,Q2,Q3,Q4
r0,101.0,201.0,301.0,401.0
r1,102.0,202.0,,
r2,103.0,203.0,302.0,402.0
r3,,,303.0,403.0


In [153]:
# inner
df12.join(df13,how='inner')

Unnamed: 0,Q1,Q2,Q3,Q4
r0,101,201,301,401
r2,103,203,302,402


In [155]:
# left
df12.join(df13,how='left')

Unnamed: 0,Q1,Q2,Q3,Q4
r0,101,201,301.0,401.0
r1,102,202,,
r2,103,203,302.0,402.0


In [156]:
# right
df12.join(df13,how='right')

Unnamed: 0,Q1,Q2,Q3,Q4
r0,101.0,201.0,301,401
r2,103.0,203.0,302,402
r3,,,303,403


### More functionality of Pandas

In [160]:
df11['color'].unique() # shows unique values in a series

array(['yellow', 'green', 'blue'], dtype=object)

In [161]:
df11['color'].value_counts()  # count numbers of each value in a series

green     2
yellow    1
blue      1
Name: color, dtype: int64

In [168]:
df12.mean()    # shows mean of each column/series values which are numbers
# omits strings

Q1    102.0
Q2    202.0
dtype: float64

In [175]:
df11.columns  # column names

Index(['customer', 'color', 'distance', 'sales'], dtype='object')

In [177]:
df11.index   # row/index names

Int64Index([4, 5, 6, 7], dtype='int64')

In [202]:
# creating a dataframe by applying condition on other dataframe
# BUT INDEX SHOULD BE SAME
# Actually it extracts an index by logic
new_df = df8[(df9['customer'] != '105') & (df9['color']!='green')]
new_df

Unnamed: 0,customer,category,important,sales
0,101,cat2,yes,123
1,102,cat2,no,52
2,103,cat1,yes,214
3,104,cat3,yes,663


In [199]:
# apply functions to a particular series
print(df9['sales'].mean())
print(df9['distance'].min())

332.75
9


# Apply our defined function to the series/dataframes
1. apply()
2. applymap()

In [221]:
def profit(series):
    return series*0.5

In [204]:
# apply our function to each value in a series from dataframe
df9['sales'].apply(profit)

0     61.5
1    107.0
2    331.5
3    165.5
Name: sales, dtype: float64

In [205]:
df9['color'].apply(len) # apply built-in functions

0    6
1    5
2    5
3    4
Name: color, dtype: int64

In [208]:
df14 = df9[['distance','sales']]
df14.applymap(profit)  # apply function on each element of a dataframe

Unnamed: 0,distance,sales
0,6.0,61.5
1,4.5,107.0
2,22.0,331.5
3,10.5,165.5


In [222]:
def col_sum(column):
    return sum(column)    # sum --> adds up all entries in an iterable

In [211]:
df14.apply(col_sum)

distance      86
sales       1331
dtype: int64

In [223]:
df14.applymap(col_sum) # gives error because it will pass element by element

TypeError: 'int' object is not iterable

In [220]:
# delete a series
del df9['color']
df9

Unnamed: 0,customer,distance,sales
0,101,12,123
1,103,9,214
2,104,44,663
3,105,21,331


In [225]:
df9.sort_values(by='distance',inplace=True)
df9

Unnamed: 0,customer,distance,sales
1,103,9,214
0,101,12,123
3,105,21,331
2,104,44,663


In [226]:
# if some series has multiple of the same value then we can group all the unique entries together
mydict = {'customer': ['Customer 1','Customer 1','Customer2','Customer2','Customer3','Customer3'], 
          'product1': [1.1,2.1,3.8,4.2,5.5,6.9],
          'product2': [8.2,9.1,11.1,5.2,44.66,983]}
df6 = pd.DataFrame(mydict,index=['Purchase 1','Purchase 2','Purchase 3','Purchase 4','Purchase 5','Purchase 6'])
df6

Unnamed: 0,customer,product1,product2
Purchase 1,Customer 1,1.1,8.2
Purchase 2,Customer 1,2.1,9.1
Purchase 3,Customer2,3.8,11.1
Purchase 4,Customer2,4.2,5.2
Purchase 5,Customer3,5.5,44.66
Purchase 6,Customer3,6.9,983.0


In [227]:
'''
groups data by a particular series/column with similar values.
calculates count, mean, standard deviation, min ,max
describe() --> pivot table --> generate descriptive statistics
'''
grouped_data = df6.groupby('customer')
print(grouped_data)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001D92E21A130>


In [228]:
grouped_data.describe()  # whole descriptive statistics

Unnamed: 0_level_0,product1,product1,product1,product1,product1,product1,product1,product1,product2,product2,product2,product2,product2,product2,product2,product2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
customer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Customer 1,2.0,1.6,0.707107,1.1,1.35,1.6,1.85,2.1,2.0,8.65,0.636396,8.2,8.425,8.65,8.875,9.1
Customer2,2.0,4.0,0.282843,3.8,3.9,4.0,4.1,4.2,2.0,8.15,4.17193,5.2,6.675,8.15,9.625,11.1
Customer3,2.0,6.2,0.989949,5.5,5.85,6.2,6.55,6.9,2.0,513.83,663.506577,44.66,279.245,513.83,748.415,983.0


In [231]:
grouped_data.mean()  # only a specific column from descriptive statistics

Unnamed: 0_level_0,product1,product2
customer,Unnamed: 1_level_1,Unnamed: 2_level_1
Customer 1,1.6,8.65
Customer2,4.0,8.15
Customer3,6.2,513.83


### Saving and loading data

In [233]:
df11

Unnamed: 0,customer,color,distance,sales
4,101,yellow,12,123
5,103,green,9,214
6,104,green,44,663
7,105,blue,21,331


In [249]:
# save to a CSV
df11.to_csv('df11.csv', index=True,index_label='index col') # saves to csv with our index labels in a new column

In [238]:
# read a csv
'''
If we don't specify `index_col`, it will specify it as a new column in the dataframe
and give it's own index.
'''
read_df11_csv = pd.read_csv('df11.csv',index_col=0) 
read_df11_csv

Unnamed: 0_level_0,customer,color,distance,sales
index col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,101,yellow,12,123
5,103,green,9,214
6,104,green,44,663
7,105,blue,21,331


In [250]:
# save to excel
df11.to_excel('df11.xlsx',index=True,sheet_name='first sheet',index_label='index col')

In [251]:
# read an excel
read_df11_xlsx = pd.read_excel('df11.xlsx',index_col=0,sheet_name='first sheet')
read_df11_xlsx

Unnamed: 0_level_0,customer,color,distance,sales
index col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,101,yellow,12,123
5,103,green,9,214
6,104,green,44,663
7,105,blue,21,331
