In [52]:
#import pandas and give it an alias
import numpy as np
import pandas as pd

In [53]:
#Build a numeric pandas Series from a Python list
s = pd.Series(data=[1, 2, 3, 4, 5])
#Show the Series followed by its type, one per line
print(s, type(s), sep='\n')

0    1
1    2
2    3
3    4
4    5
dtype: int64
<class 'pandas.core.series.Series'>


Note that each element in a series has an index, and by default the index starts at 0.

In [54]:
#Accessing an element in a series
#Accessing the 4th element
s[3] #The element is accessed by its index label (here the default labels 0..4 coincide with the positions)

4

In [55]:
#Accessing the elements from index 3 onwards
s[3:] #Notice that the output of a slice is also a series

3    4
4    5
dtype: int64

In [56]:
#Accessing the 2nd and the 5th elements
#Notice that s[1,4] won't work, as we need to pass the indices [1,4] as a list inside the
#original []
s[[1,4]]

1    2
4    5
dtype: int64

In [57]:
#Apply a lambda function to every element of the series
#Note: a series has an apply() method that takes the function directly and calls it on each element,
#whereas a numpy array has no apply() method.
#Also, lambda functions are predominantly used this way with series and dataframes.
s.apply(lambda x:x**2)

0     1
1     4
2     9
3    16
4    25
dtype: int64

# Let's go to Dataframes!!

In [58]:
#A dataframe stores data in a row/column (tabular) format.
#Every column has a header (its name) and a type, and all the elements in a column have the same type.
#Every row corresponds to one object (record).

#Creating a dataframe
#There are various ways to create dataframes; here one is built from a dictionary whose keys
#become the column names and whose lists become the column values.
df = pd.DataFrame({'name':['Vinay','Kushal','Aman','Saif'],
                   'age':[22,35,56,49],
                   'occupation':['lieutenant','doctor','engineer','teacher']})
print(df)
print(type(df))
#'name', 'age' and 'occupation' are the column names of the dataframe; each row holds one value per column.

     name  age  occupation
0   Vinay   22  lieutenant
1  Kushal   35      doctor
2    Aman   56    engineer
3    Saif   49     teacher
<class 'pandas.core.frame.DataFrame'>


In [59]:
#Notice that, while creating a series, pandas automatically indexes it from 0 to (n-1),
#with n being the number of elements. If you want, you can also set the index explicitly,
#using the 'index' argument of pd.Series():
pd.Series([1, 2, 3], index = ['a', 'b', 'c'])
#Note that the index is now user-defined, with labels 'a', 'b', 'c'

a    1
b    2
c    3
dtype: int64

In [60]:
#Load the market data from its csv file into a dataframe
market_df = pd.read_csv(
    'C:/Users/Pratik Nath/Desktop/DS/Introduction to Pandas/global_sales_data/market_fact.csv'
)

In [61]:
#Looking at the top and the bottom entries of the dataframe
market_df.head()
#head() returns the top 5 rows by default

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
0,Ord_5446,Prod_16,SHP_7609,Cust_1818,136.81,0.01,23,-30.51,3.6,0.56
1,Ord_5406,Prod_13,SHP_7549,Cust_1818,42.27,0.01,13,4.56,0.93,0.54
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.69,0.0,26,1148.9,2.5,0.59
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.89,0.09,43,729.34,14.3,0.37
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.15,0.08,35,1219.87,26.3,0.38


In [62]:
market_df.tail()
#tail() returns the bottom 5 rows by default

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
8394,Ord_5353,Prod_4,SHP_7479,Cust_1798,2841.4395,0.08,28,374.63,7.69,0.59
8395,Ord_5411,Prod_6,SHP_7555,Cust_1798,127.16,0.1,20,-74.03,6.92,0.37
8396,Ord_5388,Prod_6,SHP_7524,Cust_1798,243.05,0.02,39,-70.85,5.35,0.4
8397,Ord_5348,Prod_15,SHP_7469,Cust_1798,3872.87,0.03,23,565.34,30.0,0.62
8398,Ord_5459,Prod_6,SHP_7628,Cust_1798,603.69,0.0,47,131.39,4.86,0.38


In [63]:
#Overview of the dataframe
market_df.info()

#Note that each column in the dataframe is actually a pandas series of length 8399.
#Every column has 8399 non-null values except the last one, which has 8336.
#The ID columns are 'objects' i.e., they are being read as strings.
#The other columns are numeric (float or int)
#The last column, Product_Base_Margin, has 8336 non-null values, which is fewer than the 8399 rows;
#this means some of its values are missing (blank), unlike in the other columns.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8399 entries, 0 to 8398
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Ord_id               8399 non-null   object 
 1   Prod_id              8399 non-null   object 
 2   Ship_id              8399 non-null   object 
 3   Cust_id              8399 non-null   object 
 4   Sales                8399 non-null   float64
 5   Discount             8399 non-null   float64
 6   Order_Quantity       8399 non-null   int64  
 7   Profit               8399 non-null   float64
 8   Shipping_Cost        8399 non-null   float64
 9   Product_Base_Margin  8336 non-null   float64
dtypes: float64(5), int64(1), object(4)
memory usage: 656.3+ KB


In [64]:
#describe() gives summary statistics (count, mean, std, min, quartiles, max)
#of all the numeric columns in the dataset,
#i.e. columns which hold only numeric values
market_df.describe()

Unnamed: 0,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
count,8399.0,8399.0,8399.0,8399.0,8399.0,8336.0
mean,1775.878179,0.049671,25.571735,181.184424,12.838557,0.512513
std,3585.050525,0.031823,14.481071,1196.653371,17.264052,0.135589
min,2.24,0.0,1.0,-14140.7,0.49,0.35
25%,143.195,0.02,13.0,-83.315,3.3,0.38
50%,449.42,0.05,26.0,-1.5,6.07,0.52
75%,1709.32,0.08,38.0,162.75,13.99,0.59
max,89061.05,0.25,50.0,27220.69,164.73,0.85


In [65]:
#List out the column names
market_df.columns

Index(['Ord_id', 'Prod_id', 'Ship_id', 'Cust_id', 'Sales', 'Discount',
       'Order_Quantity', 'Profit', 'Shipping_Cost', 'Product_Base_Margin'],
      dtype='object')

In [66]:
#Shape of the dataframe as (number of rows, number of columns)
market_df.shape

(8399, 10)

In [67]:
#One can extract all the values of a dataframe as a numpy array using df.values
#(the array has dtype=object here because the columns mix strings and numbers)
market_df.values

array([['Ord_5446', 'Prod_16', 'SHP_7609', ..., -30.51, 3.6, 0.56],
       ['Ord_5406', 'Prod_13', 'SHP_7549', ..., 4.56, 0.93, 0.54],
       ['Ord_5446', 'Prod_4', 'SHP_7610', ..., 1148.9, 2.5, 0.59],
       ...,
       ['Ord_5388', 'Prod_6', 'SHP_7524', ..., -70.85, 5.35, 0.4],
       ['Ord_5348', 'Prod_15', 'SHP_7469', ..., 565.34, 30.0, 0.62],
       ['Ord_5459', 'Prod_6', 'SHP_7628', ..., 131.39, 4.86, 0.38]],
      dtype=object)

In [68]:
market_df.head()

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
0,Ord_5446,Prod_16,SHP_7609,Cust_1818,136.81,0.01,23,-30.51,3.6,0.56
1,Ord_5406,Prod_13,SHP_7549,Cust_1818,42.27,0.01,13,4.56,0.93,0.54
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.69,0.0,26,1148.9,2.5,0.59
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.89,0.09,43,729.34,14.3,0.37
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.15,0.08,35,1219.87,26.3,0.38


In the above, we can notice that Ord_id has a more meaningful value than the default integer index, and 
it can be used to fetch and identify the data in a row more easily.
Let's change the index to Ord_id (unique id of each order), so that one can select rows using the order id directly

Meaningful row labels help to select(subset) dataframes easily.

In [69]:
#Setting index to Ord_id
#The index is set without inplace=True and only while 'Ord_id' is still a column, so that
#re-running this cell does not raise a KeyError once the index has already been set.
if 'Ord_id' in market_df.columns:
    market_df = market_df.set_index('Ord_id')

In [70]:
market_df.head()

Unnamed: 0_level_0,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
Ord_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ord_5446,Prod_16,SHP_7609,Cust_1818,136.81,0.01,23,-30.51,3.6,0.56
Ord_5406,Prod_13,SHP_7549,Cust_1818,42.27,0.01,13,4.56,0.93,0.54
Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.69,0.0,26,1148.9,2.5,0.59
Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.89,0.09,43,729.34,14.3,0.37
Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.15,0.08,35,1219.87,26.3,0.38


### Let's sort the dataframes

In [71]:
#One can sort a dataframe either 1) by its index or 2) by its values

#Let's do the sorting by index, in descending order
#The Ord_id labels are strings, so they are ordered as text rather than numerically
#(e.g. 'Ord_999' comes before 'Ord_1000' in descending order)
market_df.sort_index(ascending=False)

Unnamed: 0_level_0,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
Ord_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ord_999,Prod_15,SHP_1383,Cust_361,5661.08,0.00,33,1055.47,30.00,0.62
Ord_998,Prod_8,SHP_1380,Cust_372,750.66,0.00,33,120.05,4.00,0.60
Ord_998,Prod_5,SHP_1382,Cust_372,2149.37,0.03,42,217.87,19.99,0.55
Ord_998,Prod_8,SHP_1381,Cust_372,254.32,0.01,8,-117.39,6.50,0.79
Ord_997,Prod_14,SHP_1379,Cust_365,28761.52,0.04,8,285.11,24.49,0.37
...,...,...,...,...,...,...,...,...,...
Ord_1001,Prod_5,SHP_1385,Cust_374,1981.26,0.07,49,100.80,8.66,0.76
Ord_1000,Prod_6,SHP_1384,Cust_373,334.71,0.01,25,31.74,6.47,0.38
Ord_100,Prod_8,SHP_138,Cust_58,121.12,0.10,3,-118.82,1.99,0.44
Ord_10,Prod_3,SHP_13,Cust_10,80.61,0.02,15,-4.72,2.99,0.37


In [72]:
#Sorting can also be done by the values of a column
#Let's sort in increasing order of Sales and look at the 5 smallest values
market_df.sort_values(by='Sales').head()

Unnamed: 0_level_0,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
Ord_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ord_704,Prod_7,SHP_964,Cust_242,2.24,0.01,1,-1.97,0.7,0.37
Ord_149,Prod_3,SHP_7028,Cust_1712,3.2,0.09,1,-3.16,1.49,0.37
Ord_4270,Prod_7,SHP_5959,Cust_1450,3.23,0.06,2,-2.73,0.7,0.81
Ord_4755,Prod_13,SHP_6628,Cust_1579,3.41,0.06,1,-1.78,0.7,0.56
Ord_2252,Prod_3,SHP_3064,Cust_881,3.42,0.05,1,-2.91,1.49,0.37


In [73]:
#Sorting in decreasing order of the shipping cost
market_df.sort_values(by='Shipping_Cost', ascending=False).head()

Unnamed: 0_level_0,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
Ord_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ord_1751,Prod_15,SHP_2426,Cust_597,14740.51,0.0,46,3407.73,164.73,0.56
Ord_839,Prod_11,SHP_1361,Cust_364,12689.87,0.04,44,-169.23,154.12,0.76
Ord_1741,Prod_11,SHP_2411,Cust_595,15168.82,0.02,26,-1096.78,147.12,0.8
Ord_417,Prod_11,SHP_561,Cust_156,20333.816,0.02,45,-1430.45,147.12,0.8
Ord_1581,Prod_15,SHP_2184,Cust_519,2573.92,0.07,17,117.23,143.71,0.55


In [74]:
#Sorting can also be done on more than one column
#Sorting in ascending order of Sales within each product
market_df.sort_values(by=['Prod_id','Sales'],ascending=True).head()
#Notice the columns are applied in the order given: rows are sorted by 'Prod_id' first and,
#within the same 'Prod_id', by 'Sales'

Unnamed: 0_level_0,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
Ord_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ord_286,Prod_1,SHP_387,Cust_90,18.15,0.04,1,-7.26,6.13,0.57
Ord_2314,Prod_1,SHP_3171,Cust_899,18.16,0.03,1,-7.25,6.13,0.57
Ord_439,Prod_1,SHP_587,Cust_136,18.73,0.05,1,-6.68,6.13,0.57
Ord_2746,Prod_1,SHP_3767,Cust_1030,22.61,0.03,1,-8.4,7.51,0.57
Ord_3713,Prod_1,SHP_5145,Cust_1307,27.83,0.09,2,-22.14,9.45,0.6


In [75]:
#Load the forest fires data from its csv file into a dataframe
forest_df = pd.read_csv(
    'C:/Users/Pratik Nath/Desktop/DS/forestfires.csv'
)

In [76]:
forest_df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [77]:
forest_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       517 non-null    int64  
 1   Y       517 non-null    int64  
 2   month   517 non-null    object 
 3   day     517 non-null    object 
 4   FFMC    517 non-null    float64
 5   DMC     517 non-null    float64
 6   DC      517 non-null    float64
 7   ISI     517 non-null    float64
 8   temp    517 non-null    float64
 9   RH      517 non-null    int64  
 10  wind    517 non-null    float64
 11  rain    517 non-null    float64
 12  area    517 non-null    float64
dtypes: float64(8), int64(3), object(2)
memory usage: 52.6+ KB


In [78]:
forest_df.shape

(517, 13)

In [79]:
#Sort the dataframe on 'month' and then 'day' in ascending order
forest_df.sort_values(by=['month','day'], ascending=True).head()
#Notice that both 'month' and 'day' are strings, so they are sorted alphabetically (not in calendar order)

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
241,4,4,apr,fri,83.0,23.3,85.3,2.3,16.7,20,3.1,0.0,0.0
442,6,5,apr,mon,87.9,24.9,41.6,3.7,10.9,64,3.1,0.0,3.35
19,6,4,apr,sat,86.3,27.4,97.1,5.1,9.3,44,4.5,0.0,0.0
239,7,5,apr,sun,81.9,3.0,7.9,3.5,13.4,75,1.8,0.0,0.0
469,6,3,apr,sun,91.0,14.6,25.6,12.3,13.7,33,9.4,0.0,61.13


In [80]:
#Selecting a set of rows, let's say from indices 2 to 6 (the end of the slice, 7, is excluded)
forest_df[2:7]

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0
5,8,6,aug,sun,92.3,85.3,488.0,14.7,22.2,29,5.4,0.0,0.0
6,8,6,aug,mon,92.3,88.9,495.6,8.5,24.1,27,3.1,0.0,0.0


In [81]:
#Selecting alternate rows (every 2nd row) starting from index, let's say, 5
forest_df[5::2].head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
5,8,6,aug,sun,92.3,85.3,488.0,14.7,22.2,29,5.4,0.0,0.0
7,8,6,aug,mon,91.5,145.4,608.2,10.7,8.0,86,2.2,0.0,0.0
9,7,5,sep,sat,92.5,88.0,698.6,7.1,22.8,40,4.0,0.0,0.0
11,7,5,sep,sat,92.8,73.2,713.0,22.6,19.3,38,4.0,0.0,0.0
13,6,5,sep,mon,90.9,126.5,686.5,7.0,21.3,42,2.2,0.0,0.0


### Position (Integer) Based Indexing

#### Pandas provides the *df.iloc[]* functionality to index dataframes using integer positions 

In [82]:
#Show the built-in documentation of the iloc indexer
help(pd.DataFrame.iloc)

Help on property:

    Purely integer-location based indexing for selection by position.
    
    ``.iloc[]`` is primarily integer position based (from ``0`` to
    ``length-1`` of the axis), but may also be used with a boolean
    array.
    
    Allowed inputs are:
    
    - An integer, e.g. ``5``.
    - A list or array of integers, e.g. ``[4, 3, 0]``.
    - A slice object with ints, e.g. ``1:7``.
    - A boolean array.
    - A ``callable`` function with one argument (the calling Series or
      DataFrame) and that returns valid output for indexing (one of the above).
      This is useful in method chains, when you don't have a reference to the
      calling object, but would like to base your selection on some value.
    
    ``.iloc`` will raise ``IndexError`` if a requested indexer is
    out-of-bounds, except *slice* indexers which allow out-of-bounds
    indexing (this conforms with python/numpy *slice* semantics).
    
    See more at :ref:`Selection by Position <indexing.inte

In [83]:
#Selecting a single element
#Note that 2, 4 corresponds to the third row and fifth column
forest_df.iloc[2,4] #Positions are 0-based, so 4 refers to the 5th column (FFMC)

90.6

In [84]:
#forest_df[2,4]
#The above throws an error, as with plain [] pandas cannot tell whether 2 is an integer position (the third row)
#or a row with label=2
#The ambiguity is removed by iloc[2,4], which explicitly states that integer positions are meant

In [85]:
# Selecting a single row, and all columns
# Select the 6th row; here its label and its integer position are both 5
forest_df.iloc[5]

X            8
Y            6
month      aug
day        sun
FFMC      92.3
DMC       85.3
DC       488.0
ISI       14.7
temp      22.2
RH          29
wind       5.4
rain       0.0
area       0.0
Name: 5, dtype: object

In [86]:
#The above can also be equivalently written as,
forest_df.iloc[5,:] #The ":" here selects all the columns of that row

X            8
Y            6
month      aug
day        sun
FFMC      92.3
DMC       85.3
DC       488.0
ISI       14.7
temp      22.2
RH          29
wind       5.4
rain       0.0
area       0.0
Name: 5, dtype: object

In [87]:
#Multiple rows using a list of positions, taking the column at position 5 (the 6th column, DMC)
forest_df.iloc[[3,7,8],5]

3     33.3
7    145.4
8    129.5
Name: DMC, dtype: float64

In [88]:
#Selecting a single column
#Notice the column index starts at 0, and 2 represents the third column
#Also, ":" represents all the rows
forest_df.iloc[:,2]

0      mar
1      oct
2      oct
3      mar
4      mar
      ... 
512    aug
513    aug
514    aug
515    aug
516    nov
Name: month, Length: 517, dtype: object

In [89]:
#Selecting multiple columns: positions 3 to 7, i.e. 5 columns (8-3), as the end of the slice is excluded
forest_df.iloc[:,3:8]

Unnamed: 0,day,FFMC,DMC,DC,ISI
0,fri,86.2,26.2,94.3,5.1
1,tue,90.6,35.4,669.1,6.7
2,sat,90.6,43.7,686.9,6.7
3,fri,91.7,33.3,77.5,9.0
4,sun,89.3,51.3,102.2,9.6
...,...,...,...,...,...
512,sun,81.6,56.7,665.6,1.9
513,sun,81.6,56.7,665.6,1.9
514,sun,81.6,56.7,665.6,1.9
515,sat,94.4,146.0,614.7,11.3


In [90]:
#Selecting multiple rows and columns
forest_df.iloc[3:8,2:5]

Unnamed: 0,month,day,FFMC
3,mar,fri,91.7
4,mar,sun,89.3
5,aug,sun,92.3
6,aug,mon,92.3
7,aug,mon,91.5


In [91]:
#Selecting the rows corresponding to True using a boolean list
#NOTE(review): left commented out - presumably because the list has 5 entries while the dataframe has 517 rows; verify before enabling
#forest_df.iloc[[True,False,True,True,True]]

## Label Based Indexing

### Pandas provides the *df.loc[]* functionality to index dataframes using labels.

In [92]:
#forest_df.loc[2:,1] #Select all rows starting from 2(inclusive)
#The above will give an error because the column is given as a number: "1" is not a column label,
#so pandas cannot tell which column it refers to.
#Hence, the column name should be given explicitly

In [93]:
forest_df.loc[2:,"X"] #Rows from label 2 (inclusive) onwards; "X" refers to the column name

2      7
3      8
4      8
5      8
6      8
      ..
512    4
513    2
514    7
515    1
516    6
Name: X, Length: 515, dtype: int64

In [94]:
#Selecting rows using a range of labels
#Here, both 4 and 8 are inclusive, unlike in iloc where the end of the slice (8) is excluded
forest_df.loc[4:8]

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0
5,8,6,aug,sun,92.3,85.3,488.0,14.7,22.2,29,5.4,0.0,0.0
6,8,6,aug,mon,92.3,88.9,495.6,8.5,24.1,27,3.1,0.0,0.0
7,8,6,aug,mon,91.5,145.4,608.2,10.7,8.0,86,2.2,0.0,0.0
8,8,6,sep,tue,91.0,129.5,692.6,7.0,13.1,63,5.4,0.0,0.0


In [95]:
#Selecting a single element by its row label and column name
forest_df.loc[45,'month']

'sep'

## Subsetting Rows based on Conditions

In [96]:
#Comparing a column with a value gives a boolean series (True where FFMC is greater than 80)
forest_df.FFMC>80

0       True
1       True
2       True
3       True
4       True
       ...  
512     True
513     True
514     True
515     True
516    False
Name: FFMC, Length: 517, dtype: bool

In [97]:
forest_df.loc[forest_df.FFMC>80] #Gives all the rows (with all their columns) that satisfy the condition

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
511,8,6,aug,sun,81.6,56.7,665.6,1.9,27.8,35,2.7,0.0,0.00
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16


In [98]:
#Multiple conditions can be combined with & (each condition is wrapped in its own parentheses)
forest_df.loc[(forest_df.FFMC>80) & (forest_df.DC<101)]

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
16,5,5,mar,sat,91.7,35.8,80.8,7.8,15.1,27,5.4,0.0,0.00
18,6,4,mar,wed,89.2,27.9,70.8,6.3,15.9,35,4.0,0.0,0.00
19,6,4,apr,sat,86.3,27.4,97.1,5.1,9.3,44,4.5,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,3,4,mar,wed,90.2,18.5,41.1,7.3,11.2,41,5.4,0.0,5.55
468,6,5,mar,thu,91.3,20.6,43.5,8.5,13.3,27,3.6,0.0,6.61
469,6,3,apr,sun,91.0,14.6,25.6,12.3,13.7,33,9.4,0.0,61.13
470,5,4,apr,sun,91.0,14.6,25.6,12.3,17.6,27,5.8,0.0,0.00


In [99]:
#Multiple conditions can be applied while also selecting specific columns (here 'DC' and 'temp')
forest_df.loc[(forest_df.FFMC>80) & (forest_df.DC<101),['DC','temp']]

Unnamed: 0,DC,temp
0,94.3,8.2
3,77.5,8.3
16,80.8,15.1
18,70.8,15.9
19,97.1,9.3
...,...,...
467,41.1,11.2
468,43.5,13.3
469,25.6,13.7
470,25.6,17.6


In [100]:
#Selecting the rows whose X value is one of the listed values
values_of_x = [7,5,8]
forest_df.loc[forest_df.X.isin(values_of_x)]

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,8,6,aug,tue,96.1,181.1,671.2,14.3,21.6,65,4.9,0.8,0.00
501,7,5,aug,tue,96.1,181.1,671.2,14.3,21.6,65,4.9,0.8,0.00
509,5,4,aug,fri,91.0,166.9,752.6,7.1,21.1,71,7.6,1.4,2.17
511,8,6,aug,sun,81.6,56.7,665.6,1.9,27.8,35,2.7,0.0,0.00


## Merging and Concatenation of Data

In [101]:
market_df.head()

Unnamed: 0_level_0,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
Ord_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ord_5446,Prod_16,SHP_7609,Cust_1818,136.81,0.01,23,-30.51,3.6,0.56
Ord_5406,Prod_13,SHP_7549,Cust_1818,42.27,0.01,13,4.56,0.93,0.54
Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.69,0.0,26,1148.9,2.5,0.59
Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.89,0.09,43,729.34,14.3,0.37
Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.15,0.08,35,1219.87,26.3,0.38


In [102]:
forest_df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [104]:
##Merging dataframes - done column-wise, on a column which is common to both the dataframes
#NOTE(review): left commented out - forest_df shows no 'Sales' column, so this merge key looks invalid for these two dataframes
#df_merged = pd.merge(market_df, forest_df, how = 'inner',on='Sales')

In [105]:
#Concatenation of dataframes - done between dataframes which have exactly the same columns and column names.
#An independent copy is used: plain assignment would only create a second name for the same object,
#so any later in-place change to one would also show up in the other.
forest_df2 = forest_df.copy()

In [106]:
forest_df2.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [109]:
pd.concat([forest_df,forest_df2],axis=1) #Placed side by side (the columns of both)

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,...,day.1,FFMC.1,DMC.1,DC.1,ISI.1,temp.1,RH.1,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,...,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,...,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,...,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,...,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,...,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,...,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,...,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,...,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,...,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


In [110]:
pd.concat([forest_df,forest_df2],axis=0) #Stacked one on top of the other (the rows of both)

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


## Grouping and Summarizing

In [112]:
#Operations similar to SQL's GROUP BY (and the aggregations that follow it) are done with groupby.
forestByDay = forest_df.groupby('day')

In [113]:
forestByDay #It just shows the GroupBy object itself, not any grouped data

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000216CA01F8B0>

In [116]:
forestByDay['FFMC'].sum() #Sum of FFMC grouped by days of the week

day
fri    7725.2
mon    6648.4
sat    7593.5
sun    8512.4
thu    5595.3
tue    5811.8
wed    4976.7
Name: FFMC, dtype: float64

In [118]:
#Gives the sum of all the numerical columns per day of the week.
#numeric_only=True is passed explicitly so that the text column ('month') is left out of the
#aggregation regardless of the default of the installed pandas version.
forestByDay.sum(numeric_only=True)

Unnamed: 0_level_0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fri,388,354,7725.2,9276.9,46374.5,807.6,1525.6,3966,382.0,1.6,447.24
mon,362,323,6648.4,6954.3,38169.7,536.8,1254.4,3305,276.6,0.0,706.53
sat,403,362,7593.5,9268.7,44363.8,724.3,1625.3,3646,315.7,0.0,2144.86
sun,430,410,8512.4,10857.7,51983.3,854.1,1811.2,4652,391.7,1.0,959.93
thu,285,268,5595.3,7698.8,35567.8,533.3,1202.0,2367,226.4,0.0,997.1
tue,288,264,5811.8,7096.0,36265.2,630.6,1244.1,2795,273.4,8.4,807.79
wed,258,242,4976.7,6168.6,30560.7,577.5,1103.1,2166,211.3,0.2,578.6


In [121]:
#The result of aggregating a group-by object is already a dataframe, so it can be stored directly
#(numeric_only=True keeps the text column 'month' out of the sums)
forestByDay_df = forestByDay.sum(numeric_only=True)

In [122]:
forestByDay_df

Unnamed: 0_level_0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fri,388,354,7725.2,9276.9,46374.5,807.6,1525.6,3966,382.0,1.6,447.24
mon,362,323,6648.4,6954.3,38169.7,536.8,1254.4,3305,276.6,0.0,706.53
sat,403,362,7593.5,9268.7,44363.8,724.3,1625.3,3646,315.7,0.0,2144.86
sun,430,410,8512.4,10857.7,51983.3,854.1,1811.2,4652,391.7,1.0,959.93
thu,285,268,5595.3,7698.8,35567.8,533.3,1202.0,2367,226.4,0.0,997.1
tue,288,264,5811.8,7096.0,36265.2,630.6,1244.1,2795,273.4,8.4,807.79
wed,258,242,4976.7,6168.6,30560.7,577.5,1103.1,2166,211.3,0.2,578.6


In [124]:
#Grouping can also be done on multiple columns
forestByDayAndarea = forest_df.groupby(['day','area'])
#Sum only the numeric columns, so that the text column ('month') is excluded explicitly
forestByDayAndarea.sum(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain
day,area,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fri,0.00,182,172,3794.3,4171.7,20285.2,382.9,768.2,1854,185.5,0.2
fri,0.33,1,3,91.1,91.3,738.1,7.2,19.1,46,2.2,0.0
fri,0.43,6,5,91.0,166.9,752.6,7.1,18.2,62,5.4,0.0
fri,0.52,7,4,88.2,55.2,732.3,11.6,15.2,64,3.1,0.0
fri,0.61,8,6,90.1,108.0,529.8,12.5,21.2,51,8.9,0.0
...,...,...,...,...,...,...,...,...,...,...,...
wed,37.71,7,4,90.1,82.9,735.7,6.2,15.4,57,4.5,0.0
wed,49.59,4,3,94.5,139.4,689.1,20.0,28.9,29,4.9,0.0
wed,82.75,1,4,91.7,191.4,635.9,7.8,19.9,50,4.0,0.0
wed,88.49,4,4,92.9,133.3,699.6,9.2,26.4,21,4.5,0.0


In [125]:
#describe() gives the statistical aggregates (count, mean, std, min, quartiles, max) for each group
forestByDay.describe()

Unnamed: 0_level_0,X,X,X,X,X,X,X,X,Y,Y,...,rain,rain,area,area,area,area,area,area,area,area
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
fri,85.0,4.564706,2.206306,1.0,3.0,5.0,6.0,9.0,85.0,4.164706,...,0.0,1.4,85.0,5.261647,10.012083,0.0,0.0,0.33,5.97,43.32
mon,74.0,4.891892,2.326545,1.0,3.0,5.0,7.0,9.0,74.0,4.364865,...,0.0,0.0,74.0,9.547703,33.703562,0.0,0.0,0.745,6.0325,278.53
sat,84.0,4.797619,2.383577,1.0,3.0,5.0,7.0,9.0,84.0,4.309524,...,0.0,0.0,84.0,25.534048,122.69884,0.0,0.0,0.34,7.55,1090.84
sun,95.0,4.526316,2.342222,1.0,2.0,4.0,7.0,9.0,95.0,4.315789,...,0.0,1.0,95.0,10.104526,26.076032,0.0,0.0,0.0,6.815,196.48
thu,61.0,4.672131,2.378524,1.0,3.0,4.0,6.0,9.0,61.0,4.393443,...,0.0,0.0,61.0,16.345902,95.351052,0.0,0.0,0.9,4.95,746.28
tue,64.0,4.5,2.295613,1.0,2.75,4.0,6.0,9.0,64.0,4.125,...,0.0,6.4,64.0,12.621719,33.568193,0.0,0.0,0.655,8.85,212.88
wed,54.0,4.777778,2.336476,1.0,3.0,4.0,6.75,9.0,54.0,4.481481,...,0.0,0.2,54.0,10.714815,30.285914,0.0,0.0,0.76,5.7825,185.76
