# Pandas

pandas (all lowercase) is a popular Python-based data analysis toolkit, conventionally imported with `import pandas as pd`. It offers a wide range of utilities, from parsing many file formats to converting an entire data table into a NumPy array. This makes pandas a trusted ally in data science and machine learning.

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore') 
# Silence every warning raised after this point. This has nothing to do with the
# data analysis itself; it only keeps the cell outputs free of warning noise.

In [4]:
# Create an empty Series. The dtype is passed explicitly because the default
# dtype of an empty Series depends on the pandas version (float64 in older
# releases, object in newer ones).
ser = pd.Series(dtype='float64')

ser

Series([], dtype: float64)

In [5]:
ser = pd.Series([0,10,20,30,40,50,60,70,80,90])

ser 
# No index was passed, so pandas labels the values with a default integer index (0, 1, 2, ...).

0     0
1    10
2    20
3    30
4    40
5    50
6    60
7    70
8    80
9    90
dtype: int64

In [6]:
# Notice the dtype: the list mixes an int, a float and a string, so the
# resulting Series is stored with dtype `object` rather than a numeric dtype.
ser = pd.Series([20,15.7,'16'])

ser

0      20
1    15.7
2      16
dtype: object

In [7]:
# Create a dictionary whose keys are column names and whose values are lists of
# column values.
# NOTE(review): the name `dict` shadows Python's built-in `dict` type from this
# point on; a name such as `data` would avoid that.
dict = {'A':[1,2,3],'B':[4,5,6],'C':[7,8,9]}


In [8]:
df = pd.DataFrame(dict)
df

Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


In [9]:
# Add new column and add values into it
df['D']="One Two Ten".split()

In [10]:
df

Unnamed: 0,A,B,C,D
0,1,4,7,One
1,2,5,8,Two
2,3,6,9,Ten


In [11]:
df['E']=[10,25,50]
df

Unnamed: 0,A,B,C,D,E
0,1,4,7,One,10
1,2,5,8,Two,25
2,3,6,9,Ten,50


In [12]:
# Build data that contains NaN (missing) values, to demonstrate how to handle them.
# NOTE(review): the name `dict` shadows Python's built-in `dict` type.
dict = {'A':[1,2,np.nan],'B':[3,np.nan,np.nan],'C':[4,5,6],'D':[8,9,10]}

In [13]:
df = pd.DataFrame(dict)    #NaN is a blank data which we are putting purposefully
df

Unnamed: 0,A,B,C,D
0,1.0,3.0,4,8
1,2.0,,5,9
2,,,6,10


In [14]:
print("\nDropping any column with a NaN value\n",'-'*35)

print(df.dropna(axis=1))  # drop every column that contains at least one NaN value
# axis=1 refers to columns; axis=0 refers to rows


Dropping any column with a NaN value
 -----------------------------------
   C   D
0  4   8
1  5   9
2  6  10


In [15]:
df       # dropna() returned a new DataFrame; the original df is unchanged

# Pass inplace=True to modify df itself instead of getting a modified copy

Unnamed: 0,A,B,C,D
0,1.0,3.0,4,8
1,2.0,,5,9
2,,,6,10


In [16]:
print("\nFilling value with a default value\n",'-'*35)    # Method to fill values in NaN
print(df.fillna(value=50))


Filling value with a default value
 -----------------------------------
      A     B  C   D
0   1.0   3.0  4   8
1   2.0  50.0  5   9
2  50.0  50.0  6  10


In [17]:
df   # fillna() returned a new DataFrame; the original df still contains the NaN values

# Pass inplace=True to modify df itself instead of getting a modified copy

Unnamed: 0,A,B,C,D
0,1.0,3.0,4,8
1,2.0,,5,9
2,,,6,10


In [19]:
print("\nfilling values with a computed value (mean of column C here)\n",'-'*60, sep='')
df.fillna(value=df['C'].mean(),inplace=True)
print(df)

# sep='' makes print() join its arguments with no separator, so the dashed line
# starts at the beginning of its line instead of after a space
# .mean() computes the arithmetic mean of the column (5.0 for column C here)
# inplace=True modifies df itself rather than returning a new DataFrame


filling values with a computed value (mean of column C here)
------------------------------------------------------------
     A    B  C   D
0  1.0  3.0  4   8
1  2.0  5.0  5   9
2  5.0  5.0  6  10


In [20]:
df # inplace=True has made permanent changes

Unnamed: 0,A,B,C,D
0,1.0,3.0,4,8
1,2.0,5.0,5,9
2,5.0,5.0,6,10


In [21]:
# Creating a DataFrame
data = {'Company':['Google','Google','Microsoft','Microsoft','FaceBook','FaceBook'],
        'Person':['Demon','Lucifer','Devil','Beelzebub','Leviathan','Satan'],
        'Sales':[666,333,999,369,963,639],
        'Profit':[33,66,99,96,69,63]}


df = pd.DataFrame(data)
df

Unnamed: 0,Company,Person,Sales,Profit
0,Google,Demon,666,33
1,Google,Lucifer,333,66
2,Microsoft,Devil,999,99
3,Microsoft,Beelzebub,369,96
4,FaceBook,Leviathan,963,69
5,FaceBook,Satan,639,63


# GroupBy

In [23]:
avg_Sales = df.groupby('Company',as_index=False)['Sales'].mean()  # as_index=False keeps 'Company' as a regular column instead of making it the index

print("\nGrouping by 'Company' column and listing mean sales\n",'-'*55)

print(avg_Sales)


Grouping by 'Company' column and listing mean sales
 -------------------------------------------------------
     Company  Sales
0   FaceBook  801.0
1     Google  499.5
2  Microsoft  684.0


In [24]:
# Sum of Sales and Profit per company. as_index is left at its default here, so
# 'Company' becomes the index of the result.
total_Sales = df.groupby('Company')[['Sales','Profit']].sum()

# NOTE(review): the closing quote in the heading below is misplaced -- it should
# follow 'Company', not the end of the sentence.
print("\nGrouping by 'Company column and listing sum of sales and profit'\n",'-'*55)
print (total_Sales)



Grouping by 'Company column and listing sum of sales and profit'
 -------------------------------------------------------
           Sales  Profit
Company                 
FaceBook    1602     132
Google       999      99
Microsoft   1368     195


In [27]:
sales_stats = df.groupby('Company')[['Sales','Profit']].agg(['mean','sum']) 
print("\nGrouping by 'Company' column and both sum and average sales\n",'-'*55)
print (sales_stats)

# .agg(['mean','sum']) applies both aggregations in a single call; the result has
# one (column, aggregation) pair per output column


Grouping by 'Company' column and both sum and average sales
 -------------------------------------------------------
           Sales       Profit     
            mean   sum   mean  sum
Company                           
FaceBook   801.0  1602   66.0  132
Google     499.5   999   49.5   99
Microsoft  684.0  1368   97.5  195


In [28]:
# Find the largest Sales value within each company. The result keeps the
# original row index alongside the group key.
df.groupby('Company')['Sales'].nlargest(1) # 1 = keep only the top value per group

Company     
FaceBook   4    963
Google     0    666
Microsoft  2    999
Name: Sales, dtype: int64

# Some other concepts
- Concat
- Merge
- Join

### Concatenating DataFrames

In [34]:
# Creating data frames

Class_a = pd.DataFrame({'Math': [45,36,95,66],
                        'Eng': [54,65,45,31],
                        'Sci': [56,66,52,21],
                        'Chem': [66,33,21,54]},
                       index=['Jack','Jones','Jimmy','Jessy'])

Class_a

Unnamed: 0,Math,Eng,Sci,Chem
Jack,45,54,56,66
Jones,36,65,66,33
Jimmy,95,45,52,21
Jessy,66,31,21,54


In [35]:
Class_b = pd.DataFrame({'Math': [44,26,65,55],
                        'Eng': [64,55,85,84],
                        'Sci': [56,55,66,77],
                        'Chem': [46,53,31,42]}, 
                        index=['Rick','Ronald','Richie','Ralf'])

Class_b

Unnamed: 0,Math,Eng,Sci,Chem
Rick,44,64,56,46
Ronald,26,55,55,53
Richie,65,85,66,31
Ralf,55,84,77,42


In [36]:
Class_c = pd.DataFrame({'Math': [56,36,15,45],
                        'Eng': [65,45,75,36],
                        'Sci': [46,25,36,45],
                        'Chem': [16,63,41,52]},
                        index=['Debby','Danny','Drue','Dorny'])

Class_c

Unnamed: 0,Math,Eng,Sci,Chem
Debby,56,65,46,16
Danny,36,45,25,63
Drue,15,75,36,41
Dorny,45,36,45,52


In [37]:
# Concatenation

all_students_marks = pd.concat([Class_a,Class_b,Class_c], axis=0)

print("\nAfter concatenation along row\n",'-'*30, sep='')

print(all_students_marks)


After concatenation along row
------------------------------
        Math  Eng  Sci  Chem
Jack      45   54   56    66
Jones     36   65   66    33
Jimmy     95   45   52    21
Jessy     66   31   21    54
Rick      44   64   56    46
Ronald    26   55   55    53
Richie    65   85   66    31
Ralf      55   84   77    42
Debby     56   65   46    16
Danny     36   45   25    63
Drue      15   75   36    41
Dorny     45   36   45    52


### Merge

In [43]:
Player_id_name = pd.DataFrame({'ID': [25,30,35,40,45,50,55],
                               'Name':['Ronaldo','Messi','Pele','Maradona','Harry','Pogba','Chhetri'],
                              })

print ('\nPlayer ID and Name \n', '='*18)
Player_id_name


Player ID and Name 


Unnamed: 0,ID,Name
0,25,Ronaldo
1,30,Messi
2,35,Pele
3,40,Maradona
4,45,Harry
5,50,Pogba
6,55,Chhetri


In [47]:
player_details = pd.DataFrame({'ID': [25,30,45,50,55],
                               'Age':[36,34,28,28,37],
                               'Country': ['Portugal','Argentina','England','France','India']
                              })

print ('Player details \n', '='*18)

player_details

Player details 


Unnamed: 0,ID,Age,Country
0,25,36,Portugal
1,30,34,Argentina
2,45,28,England
3,50,28,France
4,55,37,India


In [48]:
print ("===Datasets will be merged based on the matching ID's===")

pd.merge(Player_id_name,player_details,on='ID')  # on='ID' names the key column; with the default how='inner', only IDs present in both DataFrames are kept

===Datasets will be merged based on the matching ID's===


Unnamed: 0,ID,Name,Age,Country
0,25,Ronaldo,36,Portugal
1,30,Messi,34,Argentina
2,45,Harry,28,England
3,50,Pogba,28,France
4,55,Chhetri,37,India


# Join

In [50]:
# join operators

left = pd.DataFrame({'A': ['a0','a1','a2'],
                     'B': ['b0','b1','b2']}, 
                    index= ['k0','k1','k2'])

right = pd.DataFrame({'C': ['c0','c1','c2'],
                      'D': ['d0','d1','d2']}, 
                     index= ['k0','k2','k3'])

In [51]:
left

Unnamed: 0,A,B
k0,a0,b0
k1,a1,b1
k2,a2,b2


In [52]:
right

Unnamed: 0,C,D
k0,c0,d0
k2,c1,d1
k3,c2,d2


In [53]:
left.join(right)

Unnamed: 0,A,B,C,D
k0,a0,b0,c0,d0
k1,a1,b1,,
k2,a2,b2,c1,d1


In [54]:
left.join(right, how='outer') # how defaults to 'left'; how='outer' keeps the index labels from both DataFrames

Unnamed: 0,A,B,C,D
k0,a0,b0,c0,d0
k1,a1,b1,,
k2,a2,b2,c1,d1
k3,,,c2,d2


# Applying functions in pandas


In [3]:
df = pd.DataFrame({'Dept': [1,2,3,4,5,6,7,8,9,10], 
                   'Emp_name': ['Rajesh', 'Shankar', 'Sindhu', 'Akshay', 'Vivaan', 'Monika', 'Salman', 'Aisha', 'Remo', 'Amrit'], 
                   'Salary': [4995,4524,4523,6648,5462,6548,5123,4652,6645,4541]})

df

Unnamed: 0,Dept,Emp_name,Salary
0,1,Rajesh,4995
1,2,Shankar,4524
2,3,Sindhu,4523
3,4,Akshay,6648
4,5,Vivaan,5462
5,6,Monika,6548
6,7,Salman,5123
7,8,Aisha,4652
8,9,Remo,6645
9,10,Amrit,4541


In [7]:
# Define a function

def salary_hike(x):
    """Return the raised salary for *x*, rounded up to a whole number.

    Salaries above 4000 are raised by 30%; all other salaries are
    raised by 50%.
    """
    raised = x + (x * .30) if x > 4000 else x * 1.5
    return np.ceil(raised)


In [8]:
df['New_salary'] = df['Salary'].apply(salary_hike)
print(df)

   Dept Emp_name  Salary  New_salary
0     1   Rajesh    4995      6494.0
1     2  Shankar    4524      5882.0
2     3   Sindhu    4523      5880.0
3     4   Akshay    6648      8643.0
4     5   Vivaan    5462      7101.0
5     6   Monika    6548      8513.0
6     7   Salman    5123      6660.0
7     8    Aisha    4652      6048.0
8     9     Remo    6645      8639.0
9    10    Amrit    4541      5904.0


In [9]:
# Increased salary
df['Increased_Salary'] = df['New_salary'] - df['Salary']

df

Unnamed: 0,Dept,Emp_name,Salary,New_salary,Increased_Salary
0,1,Rajesh,4995,6494.0,1499.0
1,2,Shankar,4524,5882.0,1358.0
2,3,Sindhu,4523,5880.0,1357.0
3,4,Akshay,6648,8643.0,1995.0
4,5,Vivaan,5462,7101.0,1639.0
5,6,Monika,6548,8513.0,1965.0
6,7,Salman,5123,6660.0,1537.0
7,8,Aisha,4652,6048.0,1396.0
8,9,Remo,6645,8639.0,1994.0
9,10,Amrit,4541,5904.0,1363.0


In [10]:
print("\nSum of the colum 'New_salary' is: ", df['New_salary'].sum())


Sum of the colum 'New_salary' is:  69764.0


In [11]:
print("Min and Max of the column 'New_salary' are: ", df['New_salary'].min(),"and",df['New_salary'].max())

Min and Max of the column 'New_salary' are:  5880.0 and 8643.0


### Deletion, Sorting, List of column and row names

In [12]:
print("\nName of column\n",'-'*20)

print(df.columns)


Name of column
 --------------------
Index(['Dept', 'Emp_name', 'Salary', 'New_salary', 'Increased_Salary'], dtype='object')


In [13]:
 l = list(df.columns)
print("\nColumn names in a list of strings for later manipulation\n", l)


Column names in a list of strings for later manipulation
 ['Dept', 'Emp_name', 'Salary', 'New_salary', 'Increased_Salary']


In [15]:
print("\nDeleting column\n", '-'*50)

df.drop('Salary',axis=1,inplace=True)
df


Deleting column
 --------------------------------------------------


Unnamed: 0,Dept,Emp_name,New_salary,Increased_Salary
0,1,Rajesh,6494.0,1499.0
1,2,Shankar,5882.0,1358.0
2,3,Sindhu,5880.0,1357.0
3,4,Akshay,8643.0,1995.0
4,5,Vivaan,7101.0,1639.0
5,6,Monika,8513.0,1965.0
6,7,Salman,6660.0,1537.0
7,8,Aisha,6048.0,1396.0
8,9,Remo,8639.0,1994.0
9,10,Amrit,5904.0,1363.0


In [17]:
# A column can also be removed with Python's `del` statement.

print("\nDeleting column by 'del' command\n",'-'*50)
del df['Dept']
df


Deleting column by 'del' command
 --------------------------------------------------


Unnamed: 0,Emp_name,New_salary,Increased_Salary
0,Rajesh,6494.0,1499.0
1,Shankar,5882.0,1358.0
2,Sindhu,5880.0,1357.0
3,Akshay,8643.0,1995.0
4,Vivaan,7101.0,1639.0
5,Monika,8513.0,1965.0
6,Salman,6660.0,1537.0
7,Aisha,6048.0,1396.0
8,Remo,8639.0,1994.0
9,Amrit,5904.0,1363.0


In [18]:
df.sort_values(by='New_salary',ascending=False) # ascending defaults to True; False sorts from highest to lowest

Unnamed: 0,Emp_name,New_salary,Increased_Salary
3,Akshay,8643.0,1995.0
8,Remo,8639.0,1994.0
5,Monika,8513.0,1965.0
4,Vivaan,7101.0,1639.0
6,Salman,6660.0,1537.0
0,Rajesh,6494.0,1499.0
7,Aisha,6048.0,1396.0
9,Amrit,5904.0,1363.0
1,Shankar,5882.0,1358.0
2,Sindhu,5880.0,1357.0
