# Pandas

Pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.

pandas is a NumFOCUS sponsored project. This will help ensure the success of development of pandas as a world-class open-source project, and makes it possible to donate to the project.

Pandas is used for data manipulation, analysis and cleaning. Python pandas is well suited for different kinds of data, such as: 

<ul><li>Tabular data with heterogeneously-typed columns</li>
    <li>Ordered and unordered time series data</li>
    <li>Arbitrary matrix data with row & column labels</li>
    <li>Unlabelled data</li>
    <li>Any other form of observational or statistical data sets</li></ul>

# Python Operations

In [1]:
import pandas as pd
 
XYZ_web= {'Day':[1,2,3,4,5,6], "Visitors":[1000, 700,6000,1000,400,350], "Bounce_Rate":[20,20, 23,15,10,34]}
 
df= pd.DataFrame(XYZ_web)
 
print(df)

   Day  Visitors  Bounce_Rate
0    1      1000           20
1    2       700           20
2    3      6000           23
3    4      1000           15
4    5       400           10
5    6       350           34


# create Pandas Series

In [3]:
import pandas as pd
a = pd.Series([1,2,3,4,5])
print(a)
print("\n")
print(type(a))

0    1
1    2
2    3
3    4
4    5
dtype: int64


<class 'pandas.core.series.Series'>


In [4]:
a[2]

3

In [5]:
a = pd.Series(['a','b','c'])
print(a)

0    a
1    b
2    c
dtype: object


In [8]:
a = pd.date_range(start = '01-01-2018', end = '23-5-2018')
print(a)
print("\n")
print(type(a))

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08',
               '2018-01-09', '2018-01-10',
               ...
               '2018-05-14', '2018-05-15', '2018-05-16', '2018-05-17',
               '2018-05-18', '2018-05-19', '2018-05-20', '2018-05-21',
               '2018-05-22', '2018-05-23'],
              dtype='datetime64[ns]', length=143, freq='D')


<class 'pandas.core.indexes.datetimes.DatetimeIndex'>


# Pandas dataframe

In [37]:
import numpy as np

temp = np.random.randint(low = 20, high =100, size = [20,])
name = np.random.choice(['Abhay','Teclov','Geekshub','Ankit'],20)
random = np.random.choice([10,11,13,12,14],20)

a = list(zip(temp, name, random))

df = pd.DataFrame(data = a, columns=['temp','name','random'])
print(df)
print("\n")
print(type(df))

    temp      name  random
0     60     Abhay      13
1     79     Abhay      14
2     77     Ankit      11
3     86     Ankit      14
4     79  Geekshub      13
5     28  Geekshub      14
6     56  Geekshub      11
7     98     Abhay      13
8     28  Geekshub      10
9     91    Teclov      11
10    45    Teclov      10
11    43  Geekshub      12
12    50  Geekshub      13
13    83    Teclov      14
14    39     Abhay      13
15    41    Teclov      14
16    56    Teclov      11
17    45    Teclov      13
18    67    Teclov      12
19    41     Ankit      12


<class 'pandas.core.frame.DataFrame'>


In [38]:
temp = np.random.randint(low = 20, high =100, size = [20,])
name = np.random.choice(['Abhay','Teclov','Geekshub','Ankit'],20)
random = np.random.choice([10,11,13,12,14],20)

df = pd.DataFrame({'temp':temp, 'name':name, 'random':random})
print(type(df))
print("\n")
print(df.head())
print("\n")
print(print(df.tail()))

<class 'pandas.core.frame.DataFrame'>


   temp      name  random
0    75     Abhay      13
1    45     Abhay      10
2    21     Abhay      14
3    89  Geekshub      12
4    79     Abhay      14


    temp      name  random
15    96     Ankit      13
16    61    Teclov      11
17    60     Ankit      10
18    42  Geekshub      11
19    91     Ankit      10
None


In [39]:
print(df.shape)
print("\n")
print(df.columns)

(20, 3)


Index(['temp', 'name', 'random'], dtype='object')


In [40]:
df.name

0        Abhay
1        Abhay
2        Abhay
3     Geekshub
4        Abhay
5        Ankit
6     Geekshub
7        Ankit
8       Teclov
9        Ankit
10    Geekshub
11       Ankit
12      Teclov
13       Abhay
14       Ankit
15       Ankit
16      Teclov
17       Ankit
18    Geekshub
19       Ankit
Name: name, dtype: object

In [41]:
df['name']

0        Abhay
1        Abhay
2        Abhay
3     Geekshub
4        Abhay
5        Ankit
6     Geekshub
7        Ankit
8       Teclov
9        Ankit
10    Geekshub
11       Ankit
12      Teclov
13       Abhay
14       Ankit
15       Ankit
16      Teclov
17       Ankit
18    Geekshub
19       Ankit
Name: name, dtype: object

In [42]:
df['temp'].describe()

count    20.000000
mean     60.200000
std      24.483292
min      20.000000
25%      41.750000
50%      63.500000
75%      78.250000
max      96.000000
Name: temp, dtype: float64

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
temp      20 non-null int32
name      20 non-null object
random    20 non-null int32
dtypes: int32(2), object(1)
memory usage: 400.0+ bytes


In [44]:
df.values


array([[75, 'Abhay', 13],
       [45, 'Abhay', 10],
       [21, 'Abhay', 14],
       [89, 'Geekshub', 12],
       [79, 'Abhay', 14],
       [66, 'Ankit', 14],
       [92, 'Geekshub', 13],
       [78, 'Ankit', 10],
       [68, 'Teclov', 12],
       [41, 'Ankit', 11],
       [44, 'Geekshub', 12],
       [29, 'Ankit', 13],
       [20, 'Teclov', 12],
       [32, 'Abhay', 11],
       [75, 'Ankit', 12],
       [96, 'Ankit', 13],
       [61, 'Teclov', 11],
       [60, 'Ankit', 10],
       [42, 'Geekshub', 11],
       [91, 'Ankit', 10]], dtype=object)

In [45]:
print(df)

    temp      name  random
0     75     Abhay      13
1     45     Abhay      10
2     21     Abhay      14
3     89  Geekshub      12
4     79     Abhay      14
5     66     Ankit      14
6     92  Geekshub      13
7     78     Ankit      10
8     68    Teclov      12
9     41     Ankit      11
10    44  Geekshub      12
11    29     Ankit      13
12    20    Teclov      12
13    32     Abhay      11
14    75     Ankit      12
15    96     Ankit      13
16    61    Teclov      11
17    60     Ankit      10
18    42  Geekshub      11
19    91     Ankit      10


In [46]:
df.sort_index(axis =0, ascending=False)

Unnamed: 0,temp,name,random
19,91,Ankit,10
18,42,Geekshub,11
17,60,Ankit,10
16,61,Teclov,11
15,96,Ankit,13
14,75,Ankit,12
13,32,Abhay,11
12,20,Teclov,12
11,29,Ankit,13
10,44,Geekshub,12


In [47]:
df.drop(['random'], axis =1)

Unnamed: 0,temp,name
0,75,Abhay
1,45,Abhay
2,21,Abhay
3,89,Geekshub
4,79,Abhay
5,66,Ankit
6,92,Geekshub
7,78,Ankit
8,68,Teclov
9,41,Ankit


In [48]:
df.head()

Unnamed: 0,temp,name,random
0,75,Abhay,13
1,45,Abhay,10
2,21,Abhay,14
3,89,Geekshub,12
4,79,Abhay,14


In [49]:
df.iloc[[0,1]]

Unnamed: 0,temp,name,random
0,75,Abhay,13
1,45,Abhay,10


In [50]:
df.iloc[1:3,1]

1    Abhay
2    Abhay
Name: name, dtype: object

In [51]:
df.iloc[[True,True,False,True]]

Unnamed: 0,temp,name,random
0,75,Abhay,13
1,45,Abhay,10
3,89,Geekshub,12


In [52]:
df.head()

Unnamed: 0,temp,name,random
0,75,Abhay,13
1,45,Abhay,10
2,21,Abhay,14
3,89,Geekshub,12
4,79,Abhay,14


In [54]:
df.loc[9,:]

temp         41
name      Ankit
random       11
Name: 9, dtype: object

In [56]:
df.loc[[3,8,4]]

Unnamed: 0,temp,name,random
3,89,Geekshub,12
8,68,Teclov,12
4,79,Abhay,14


In [57]:
df.loc[[3,4],'name':'random']

Unnamed: 0,name,random
3,Geekshub,12
4,Abhay,14


In [58]:
df.loc[[True, True, False, True]]

Unnamed: 0,temp,name,random
0,75,Abhay,13
1,45,Abhay,10
3,89,Geekshub,12


In [59]:
df.loc[df.random > 13]

Unnamed: 0,temp,name,random
2,21,Abhay,14
4,79,Abhay,14
5,66,Ankit,14


In [60]:
df.loc[(df.random > 13) | (df.random == 10),:]

Unnamed: 0,temp,name,random
1,45,Abhay,10
2,21,Abhay,14
4,79,Abhay,14
5,66,Ankit,14
7,78,Ankit,10
17,60,Ankit,10
19,91,Ankit,10


In [61]:
# Merging & concat
d1 = pd.DataFrame([['a', 1], ['b', 2]],columns=['col1', 'number'])
d2 = pd.DataFrame([['c', 3, 'lion'], ['d', 4, 'tiger']],columns=['letter', 'number', 'animal'])

In [63]:
print(d1)
print("\n")
print(d2)

  col1  number
0    a       1
1    b       2


  letter  number animal
0      c       3   lion
1      d       4  tiger


In [64]:
pd.concat([d1,d2],axis =0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,animal,col1,letter,number
0,,a,,1
1,,b,,2
0,lion,,c,3
1,tiger,,d,4


In [65]:
pd.concat([d1,d2], axis =0, ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,animal,col1,letter,number
0,,a,,1
1,,b,,2
2,lion,,c,3
3,tiger,,d,4


In [66]:
pd.concat([d1,d2], axis = 1)

Unnamed: 0,col1,number,letter,number.1,animal
0,a,1,c,3,lion
1,b,2,d,4,tiger


In [68]:
d1 = pd.DataFrame({
    "city" : ["lucknow","kanpur","agra","delhi"],
    "temperature" : [32,45,30,40]
})

print(d1)

      city  temperature
0  lucknow           32
1   kanpur           45
2     agra           30
3    delhi           40


In [69]:
d2 = pd.DataFrame({
    "city" : ["delhi","lucknow","kanpur"],
    "humidity" : [68,65,75]
})

print(d2)

      city  humidity
0    delhi        68
1  lucknow        65
2   kanpur        75


In [70]:
df = pd.merge(d1,d2, on='city')
print(df)

      city  temperature  humidity
0  lucknow           32        65
1   kanpur           45        75
2    delhi           40        68


In [71]:
df1=pd.merge(d1,d2, on=['city'], how ='outer')
print(df1)

      city  temperature  humidity
0  lucknow           32      65.0
1   kanpur           45      75.0
2     agra           30       NaN
3    delhi           40      68.0


In [72]:
pd.merge(d1, d2, on =['city'], how='left')

Unnamed: 0,city,temperature,humidity
0,lucknow,32,65.0
1,kanpur,45,75.0
2,agra,30,
3,delhi,40,68.0


In [73]:
# dataset from https://github.com/codebasics/py/blob/master/pandas/6_handling_missing_data_replace/weather_data.csv