# Pandas
### Pandas is defined as an open-source library that provides high-performance data manipulation in Python. 
#### The name of Pandas is derived from the word Panel Data, which means an Econometrics from Multidimensional data. 
#### It is used for data analysis in Python and developed by Wes McKinney in 2008.

### we prefer Pandas because working with Pandas is fast, simple and more expressive than other tools.

### Pandas is built on top of the Numpy package, means Numpy is required for operating the Pandas.

### Benefits of Pandas

#### Data Representation: It represents the data in a form that is suited for data analysis through its DataFrame and Series.
#### Clear code: The clear API of the Pandas allows you to focus on the core part of the code. So, it provides clear and concise code for the user.

## Python Pandas Data Structure
#series   - 1d
#Dataframe -2d
#panel - 3d
### Series - It is defined as a one-dimensional array that is capable of storing various data types.
#### The row labels of series are called the index. We can easily convert the list, tuple, and dictionary into series using "series' method. 
### A Series cannot contain multiple columns. It has one parameter:

In [1]:
import pandas as pd  
import numpy as np  
info = np.array(['P','a','n','d','a','s'])   #ndarray
a = pd.Series(info)  
 
print(info)

print(a)

['P' 'a' 'n' 'd' 'a' 's']
0    P
1    a
2    n
3    d
4    a
5    s
dtype: object


In [19]:
d = {'a' : 0, 'b' : 1, 'c' : 2} #dict
print(d)
a=pd.Series(d)
print(a)
type(a)

{'a': 0, 'b': 1, 'c': 2}
a    0
b    1
c    2
dtype: int64


pandas.core.series.Series

In [7]:
a.index

Index(['a', 'b', 'c'], dtype='object')

In [15]:
print(a.values) 

[0. 1. 2.]


In [8]:
pd.Series(5, index=['a', 'b', 'c', 'd', 'e'])  #scalar

a    5
b    5
c    5
d    5
e    5
dtype: int64

In [21]:
s = pd.Series(np.random.randn(5), index=['python', 'b', 'c', 'd', 'e'])
s

python    2.872840
b        -0.983118
c        -2.551905
d         0.818367
e         1.236409
dtype: float64

In [3]:
print("type of data",type(s))

type of data <class 'pandas.core.series.Series'>


In [4]:
print(s.shape) 
print(s.ndim)   
print(s.size)   
print(s.nbytes)  #5*8

(5,)
1
5
40


In [22]:
# Indexing and slicing
print(s)
s[0]

python    2.872840
b        -0.983118
c        -2.551905
d         0.818367
e         1.236409
dtype: float64


2.872840230648128

In [23]:
s[:3]#0:3

python    2.872840
b        -0.983118
c        -2.551905
dtype: float64

In [25]:
print( s.median())
s[s > s.median()]  #condition inside is  true 

0.8183666835030668


python    2.872840
e         1.236409
dtype: float64

In [27]:
s > s.median()

python     True
b         False
c         False
d         False
e          True
dtype: bool

In [28]:
s[[2, 3, 1]]

e    1.236409
d    0.818367
b   -0.983118
dtype: float64

In [29]:
s + s

python    5.745680
b        -1.966235
c        -5.103811
d         1.636733
e         2.472819
dtype: float64

In [30]:
s * 2

python    5.745680
b        -1.966235
c        -5.103811
d         1.636733
e         2.472819
dtype: float64

In [32]:
print(s)
np.exp(s)

python    2.872840
b        -0.983118
c        -2.551905
d         0.818367
e         1.236409
dtype: float64


python    17.687183
b          0.374143
c          0.077933
d          2.266794
e          3.443228
dtype: float64

In [27]:
s[1:] + s[:-1]

a         NaN
b    1.140608
c   -1.263471
d   -1.230120
e         NaN
dtype: float64

In [33]:
s['e'] = 12 #update
s

python     2.872840
b         -0.983118
c         -2.551905
d          0.818367
e         12.000000
dtype: float64

In [33]:
'e' in s

True

In [34]:
'f' in s

False

In [4]:
s = pd.Series(np.random.random(5), name='random series')
s

0    0.172345
1    0.778102
2    0.340674
3    0.380302
4    0.085255
Name: random series, dtype: float64

In [36]:
s.name

'random series'

In [13]:
# Checking Emptiness and Presence of NaNs
import numpy as np   
import pandas as pd   
a=pd.Series(data=[1,2,3,np.NaN])   
b=pd.Series(data=[4.9,8.2,5.6],index=['x','y','z'])   
c=pd.Series()  
print(a)
print(b)
print(c)
print(a.empty,b.empty,c.empty)   
print(a.hasnans,b.hasnans,c.hasnans)   
print(len(a),len(b))   
print(a.count( ),b.count( ))  

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64
x    4.9
y    8.2
z    5.6
dtype: float64
Series([], dtype: float64)
False False True
True False False
4 3
3 3


  


### DataFrame - It is a widely used data structure of pandas and works with a two-dimensional array with labeled axes (rows and columns).
### DataFrame is defined as a standard way to store data and has two different indexes, i.e., row index and column index.
#### We can perform basic operations on rows/columns like selecting, deleting, adding, and renaming.

In [4]:
import pandas as pd  
# a list of strings  
x = ['Python', 'Pandas']  
  
# Calling DataFrame constructor on list  
df = pd.DataFrame(x)  
print(df)  

        0
0  Python
1  Pandas


In [12]:
d = {'one' : [1., 2., 3., 4.], 'two' : [4., 3., 2., 1.]}
pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [5]:
#print(d)
pd.DataFrame([[1,2],[5,6]], index=['a', 'b'],columns=['x','y'])


Unnamed: 0,x,y
a,1,2
b,5,6


In [38]:
d = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
      'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}    #ndarray,dict and series
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [40]:
# Import pandas package
import pandas as pd
 
# Define a dictionary containing employee data
data = {'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'],
        'Age':[27, 24, 22, 32],
        'Address':['Delhi', 'Kanpur', 'Allahabad', 'Kannauj'],
        'Qualification':['Msc', 'MA', 'MCA', 'Phd']}
print(data)
# Convert the dictionary into DataFrame 
df = pd.DataFrame(data)
print(df)
# select two columns
print(df[['Name', 'Qualification']])

{'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'], 'Age': [27, 24, 22, 32], 'Address': ['Delhi', 'Kanpur', 'Allahabad', 'Kannauj'], 'Qualification': ['Msc', 'MA', 'MCA', 'Phd']}
     Name  Age    Address Qualification
0     Jai   27      Delhi           Msc
1  Princi   24     Kanpur            MA
2  Gaurav   22  Allahabad           MCA
3    Anuj   32    Kannauj           Phd
     Name Qualification
0     Jai           Msc
1  Princi            MA
2  Gaurav           MCA
3    Anuj           Phd


In [41]:
print ("Delete the first column:")  
del df['Address']  
print (df) 

Delete the first column:
     Name  Age Qualification
0     Jai   27           Msc
1  Princi   24            MA
2  Gaurav   22           MCA
3    Anuj   32           Phd


In [47]:
df["adress"] = ["delhi" , "pune" , "goa",1 ]
print(df)

     Name  Age Qualification adress
0     Jai   27           Msc  delhi
1  Princi  700            MA   pune
2  Gaurav   22           MCA    goa
3    Anuj   32           Phd      1


In [42]:
#dataframe[col][row]=value  ----------- MODIFY VALUE
df['Age'][1]=700
print(df)

     Name  Age Qualification
0     Jai   27           Msc
1  Princi  700            MA
2  Gaurav   22           MCA
3    Anuj   32           Phd


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [57]:
df_stockprice=pd.DataFrame([[100,200,300,400,500],[10,20,30,40,50]],index=['SBI','HDFC'],columns=['12-may','13-may','14-may','15-may','16-may'])
print(df_stockprice)

      12-may  13-may  14-may  15-may  16-may
SBI      100     200     300     400     500
HDFC      10      20      30      40      50


In [2]:
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20210518'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"],dtype='category'),
                    'F': 'foo', })
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-05-18,1.0,3,test,foo
1,1.0,2021-05-18,1.0,3,train,foo
2,1.0,2021-05-18,1.0,3,test,foo
3,1.0,2021-05-18,1.0,3,train,foo


In [3]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [3]:
dates = pd.date_range('20211206', periods=6 ,freq = "D")
dates

DatetimeIndex(['2021-12-06', '2021-12-07', '2021-12-08', '2021-12-09',
               '2021-12-10', '2021-12-11'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2021-12-06,-0.478379,-0.820828,-1.593462,0.130491
2021-12-07,1.173384,-0.346476,-0.649514,-0.547317
2021-12-08,-0.396688,0.646133,0.028811,-1.578188
2021-12-09,-0.943732,-0.020006,-0.211147,-0.134739
2021-12-10,0.065391,1.031294,2.350166,-0.603419
2021-12-11,-0.936939,1.414082,1.020258,-0.748943


In [23]:
df1 = pd.DataFrame(np.random.randn(6, 4), columns=list('ABCD'))
df1

Unnamed: 0,A,B,C,D
0,-2.333722,0.964992,-0.256514,1.139047
1,1.658904,-0.322326,0.054671,-1.710104
2,-0.578645,0.242718,0.717806,1.485781
3,0.226544,-1.32964,2.097801,-0.586092
4,0.008865,0.652949,0.942479,-0.730933
5,0.109681,1.55936,-1.349067,2.02103


In [28]:
df1.head(2) #by defalut 5

Unnamed: 0,A,B,C,D
0,-2.333722,0.964992,-0.256514,1.139047
1,1.658904,-0.322326,0.054671,-1.710104


In [29]:
df1.tail(4)

Unnamed: 0,A,B,C,D
2,-0.578645,0.242718,0.717806,1.485781
3,0.226544,-1.32964,2.097801,-0.586092
4,0.008865,0.652949,0.942479,-0.730933
5,0.109681,1.55936,-1.349067,2.02103


In [30]:
df1.to_numpy() #does not include the index or column labels in the output.

array([[-2.33372165,  0.96499222, -0.25651419,  1.13904707],
       [ 1.65890382, -0.3223259 ,  0.05467065, -1.71010352],
       [-0.57864524,  0.24271814,  0.71780571,  1.48578104],
       [ 0.22654442, -1.32963999,  2.09780078, -0.58609159],
       [ 0.00886541,  0.65294949,  0.9424788 , -0.73093349],
       [ 0.10968137,  1.55935961, -1.34906717,  2.02103032]])

In [15]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.92092,0.084915,-0.064632,0.13418
std,0.407138,1.491234,1.26819,0.703688
min,0.531892,-2.686462,-1.823247,-1.142609
25%,0.656675,-0.137308,-0.83165,0.07832
50%,0.745978,0.49344,-0.083229,0.162281
75%,1.198194,1.106588,0.811905,0.595247
max,1.525834,1.245662,1.572756,0.833226


In [31]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 320.0 bytes


In [32]:
df1.T #Transposing your data:

Unnamed: 0,0,1,2,3,4,5
A,-2.333722,1.658904,-0.578645,0.226544,0.008865,0.109681
B,0.964992,-0.322326,0.242718,-1.32964,0.652949,1.55936
C,-0.256514,0.054671,0.717806,2.097801,0.942479,-1.349067
D,1.139047,-1.710104,1.485781,-0.586092,-0.730933,2.02103


In [34]:
df1.sort_index(axis=0, ascending=True)  #0 index , 1 coloum

Unnamed: 0,A,B,C,D
0,-2.333722,0.964992,-0.256514,1.139047
1,1.658904,-0.322326,0.054671,-1.710104
2,-0.578645,0.242718,0.717806,1.485781
3,0.226544,-1.32964,2.097801,-0.586092
4,0.008865,0.652949,0.942479,-0.730933
5,0.109681,1.55936,-1.349067,2.02103


In [66]:
df['A'] #Selecting a single column

2021-04-19    0.068784
2021-04-20    0.352483
2021-04-21   -0.253618
2021-04-22    0.002245
2021-04-23   -2.069750
2021-04-24   -0.302213
Freq: D, Name: A, dtype: float64

In [16]:
df['2021-04-24']

KeyError: '2021-04-24'

In [17]:
df1[1]

KeyError: 1

### Operation       Syntax      Result

#### Select column   df[col]     Series

#### Select row by label    df.loc[label]    Series

#### Select row by integer location    df.iloc[loc]    Series

#### Slice rows    df[5:10]     DataFrame

Unnamed: 0,A,B,C,D
2021-12-06,2.473275,1.055694,-0.171882,-0.008818
2021-12-07,0.552059,-0.768153,0.8439,-0.137757
2021-12-08,-0.259331,-0.677748,-1.064981,-1.244535
2021-12-09,-0.307154,-0.002134,-0.283371,0.535061
2021-12-10,0.652361,0.078731,1.006721,-0.68018
2021-12-11,0.042524,0.128895,-0.245139,-0.73286


In [63]:
df['A']

2021-12-06    2.473275
2021-12-07    0.552059
2021-12-08   -0.259331
2021-12-09   -0.307154
2021-12-10    0.652361
2021-12-11    0.042524
Freq: D, Name: A, dtype: float64

In [57]:
a = dates[1]
print(a)
df.loc[a] #Selection by label

2021-12-07 00:00:00


A    0.552059
B   -0.768153
C    0.843900
D   -0.137757
Name: 2021-12-07 00:00:00, dtype: float64

In [58]:
df.loc['2021-12-07', ['A', 'B']]

A    0.552059
B   -0.768153
Name: 2021-12-07 00:00:00, dtype: float64

In [71]:
df.iloc[3]

A    0.002245
B   -1.368907
C    0.278998
D    0.434464
Name: 2021-04-22 00:00:00, dtype: float64

In [64]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2021-12-09,-0.307154,-0.002134
2021-12-10,0.652361,0.078731


In [18]:
df['E'] = [1,2,np.nan,np.nan,5,6]
df

Unnamed: 0,A,B,C,D,E
2021-12-06,-0.478379,-0.820828,-1.593462,0.130491,1.0
2021-12-07,1.173384,-0.346476,-0.649514,-0.547317,2.0
2021-12-08,-0.396688,0.646133,0.028811,-1.578188,
2021-12-09,-0.943732,-0.020006,-0.211147,-0.134739,
2021-12-10,0.065391,1.031294,2.350166,-0.603419,5.0
2021-12-11,-0.936939,1.414082,1.020258,-0.748943,6.0


In [20]:
df.dropna(axis = 1)

Unnamed: 0,A,B,C,D
2021-12-06,-0.478379,-0.820828,-1.593462,0.130491
2021-12-07,1.173384,-0.346476,-0.649514,-0.547317
2021-12-08,-0.396688,0.646133,0.028811,-1.578188
2021-12-09,-0.943732,-0.020006,-0.211147,-0.134739
2021-12-10,0.065391,1.031294,2.350166,-0.603419
2021-12-11,-0.936939,1.414082,1.020258,-0.748943


In [22]:
a= df.dropna(how='any' ,axis = 0) #axis=0,1
a

Unnamed: 0,A,B,C,D,E
2021-12-06,-0.478379,-0.820828,-1.593462,0.130491,1.0
2021-12-07,1.173384,-0.346476,-0.649514,-0.547317,2.0
2021-12-10,0.065391,1.031294,2.350166,-0.603419,5.0
2021-12-11,-0.936939,1.414082,1.020258,-0.748943,6.0


In [57]:
df/

Unnamed: 0,A,B,C,D,E
2021-04-19,0.379904,0.703379,1.773696,-0.94118,1.0
2021-04-20,0.002938,0.117171,-2.466678,-1.035229,2.0
2021-04-21,0.443926,-0.48455,-0.564965,-0.502286,10000.0
2021-04-22,-0.813801,0.983425,0.680818,-1.586598,10000.0
2021-04-23,1.192702,0.430739,0.686013,-0.051905,5.0
2021-04-24,-0.573396,-1.61669,-0.745421,-1.437847,6.0


In [133]:
dir(df)

['A',
 'B',
 'C',
 'D',
 'E',
 'T',
 '_AXIS_ALIASES',
 '_AXIS_IALIASES',
 '_AXIS_LEN',
 '_AXIS_NAMES',
 '_AXIS_NUMBERS',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdiv__