# Pandas

### It is an open-source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.
### Pandas is a Python library used for working with data sets.
### It has functions for analyzing, cleaning, exploring, and manipulating data.

## Why use Pandas:
### Pandas allows us to analyze big data and draw conclusions based on statistical theories.
### Pandas can clean messy data sets, and make them readable and relevant.


In [8]:
import pandas as pd
import numpy as np

# Report the installed library versions: pandas first, then NumPy.
for library in (pd, np):
    print(library.__version__)

1.3.4
1.20.3





## DataFrames - two-dimensional labelled tables (rows and columns); a DataFrame may also hold just a single row or a single column

In [37]:
# Build a 6x5 frame holding 0..29, labelled Row1..Row6 and Col1..Col5.
row_labels = [f"Row{i}" for i in range(1, 7)]
col_labels = [f"Col{j}" for j in range(1, 6)]
values = np.arange(30).reshape(len(row_labels), len(col_labels))
df = pd.DataFrame(values, index=row_labels, columns=col_labels)
print(df)

# head() without an argument returns the first 5 rows; as the last
# expression of the cell it is what the notebook displays.
df.head()


      Col1  Col2  Col3  Col4  Col5
Row1     0     1     2     3     4
Row2     5     6     7     8     9
Row3    10    11    12    13    14
Row4    15    16    17    18    19
Row5    20    21    22    23    24
Row6    25    26    27    28    29


Unnamed: 0,Col1,Col2,Col3,Col4,Col5
Row1,0,1,2,3,4
Row2,5,6,7,8,9
Row3,10,11,12,13,14
Row4,15,16,17,18,19
Row5,20,21,22,23,24


In [21]:
# Build a 5x10 frame holding 10..59, labelled Row1..Row5 and Col1..Col10.
row_labels = [f"Row{i}" for i in range(1, 6)]
col_labels = [f"Col{j}" for j in range(1, 11)]
values = np.arange(10, 60).reshape(len(row_labels), len(col_labels))
df = pd.DataFrame(values, index=row_labels, columns=col_labels)
# Last expression of the cell: shows the first 5 rows (here, all of them).
df.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10
Row1,10,11,12,13,14,15,16,17,18,19
Row2,20,21,22,23,24,25,26,27,28,29
Row3,30,31,32,33,34,35,36,37,38,39
Row4,40,41,42,43,44,45,46,47,48,49
Row5,50,51,52,53,54,55,56,57,58,59


In [71]:
# Plain [] indexing selects columns: a single label gives a Series,
# a list of labels gives a DataFrame.
single_col = df['Col3']
print(single_col)
print(type(single_col))

col_pair = df[['Col3','Col4']]
print(col_pair)
print(type(col_pair))


# df['Row3']   # kept disabled: 'Row3' is a row label, and [] looks up column labels

SyntaxError: invalid syntax (Temp/ipykernel_13740/3816128637.py, line 7)

In [30]:
# Accessing the elements :- 1) .loc (by label)  2) .iloc (by integer position)

# Select the row labelled 'Row2'; the displayed result is a Series whose
# index is the column names and whose name is the row label.
df.loc['Row2']



Col1     20
Col2     21
Col3     22
Col4     23
Col5     24
Col6     25
Col7     26
Col8     27
Col9     28
Col10    29
Name: Row2, dtype: int32

In [31]:
# A single row label passed to .loc yields a Series rather than a one-row DataFrame.
type(df.loc['Row2'])     # -> pandas.core.series.Series

pandas.core.series.Series

In [32]:
# iloc is indexed as [row positions, column positions]; a bare ':' on both
# axes selects the whole frame.
df.iloc[:,:]    # [Rows,Cols]

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10
Row1,10,11,12,13,14,15,16,17,18,19
Row2,20,21,22,23,24,25,26,27,28,29
Row3,30,31,32,33,34,35,36,37,38,39
Row4,40,41,42,43,44,45,46,47,48,49
Row5,50,51,52,53,54,55,56,57,58,59


In [43]:
# The return type of iloc follows the kind of indexer used on each axis
# (slice vs. single integer), not how many rows/columns come back.

tail_block = df.iloc[2:,4:9]          # slice on both axes -> DataFrame
print(tail_block)
print(type(tail_block))

print("\n")

tail_first_col = df.iloc[2:,0:1]      # still slices on both axes -> DataFrame, even with one column
print(tail_first_col)
print(type(tail_first_col))

print("\n")

head_of_first_col = df.iloc[:1,0]     # integer column position -> Series
print(head_of_first_col)
print(type(head_of_first_col))


      Col5  Col6  Col7  Col8  Col9
Row3    34    35    36    37    38
Row4    44    45    46    47    48
Row5    54    55    56    57    58
<class 'pandas.core.frame.DataFrame'>


      Col1
Row3    30
Row4    40
Row5    50
<class 'pandas.core.frame.DataFrame'>


Row1    10
Name: Col1, dtype: int32
<class 'pandas.core.series.Series'>


### Converting DataFrames into arrays

In [44]:
# .values returns the selected cells as a plain NumPy array (row and column labels are dropped).
df.iloc[1:4,2:6].values

array([[22, 23, 24, 25],
       [32, 33, 34, 35],
       [42, 43, 44, 45]])

In [45]:
# Shape of that array as (rows, columns): 3 rows by 4 columns here.
df.iloc[1:4,2:6].values.shape

(3, 4)

In [46]:
# The same 12 values laid out row by row as a 2x6 array.
df.iloc[1:4,2:6].values.reshape(2,6)

array([[22, 23, 24, 25, 32, 33],
       [34, 35, 42, 43, 44, 45]])

In [47]:
# Element-wise missing-value mask: True where a cell is missing (every cell is False for this frame).
df.isnull()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10
Row1,False,False,False,False,False,False,False,False,False,False
Row2,False,False,False,False,False,False,False,False,False,False
Row3,False,False,False,False,False,False,False,False,False,False
Row4,False,False,False,False,False,False,False,False,False,False
Row5,False,False,False,False,False,False,False,False,False,False


In [48]:
# Summing the boolean mask per column gives the number of missing values in each column.
df.isnull().sum()

Col1     0
Col2     0
Col3     0
Col4     0
Col5     0
Col6     0
Col7     0
Col8     0
Col9     0
Col10    0
dtype: int64

In [60]:
# value_counts on a DataFrame counts how often each distinct row occurs.
window = df.iloc[1:4,2:6]
print(window.value_counts())

print("\n")

# On a single column (Series) it counts each distinct value instead.
first_col = df['Col1']
print(first_col.value_counts())

print("\n")

# unique() lists the distinct values of the column as an array.
print(first_col.unique())

Col3  Col4  Col5
7     8     9       1
12    13    14      1
17    18    19      1
dtype: int64


0     1
5     1
10    1
15    1
20    1
25    1
Name: Col1, dtype: int64


[ 0  5 10 15 20 25]


In [1]:
import pandas as pd
# Column-oriented input: every key becomes a column name, every list that column's values.
mydataset = dict(
    cars=["BMW", "Volvo", "Ford"],
    passings=[3, 7, 2],
)

myvar = pd.DataFrame(mydataset)

print(myvar)

    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2


In [5]:
import numpy as np
# Row-oriented input: one inner list per record, column names passed separately.
makes = ['BMW', 'Volvo', 'Ford']
counts = [3, 7, 2]
data = [list(record) for record in zip(makes, counts)]
df = pd.DataFrame(data, columns=['cars', 'passings'])
print(df)

    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2


In [8]:
# Same row-oriented construction, this time with explicit row labels marks1..marks4.
names = ['Tom', 'Nick', 'Krish', 'Jack']
ages = [20, 21, 19, 18]
data = [list(record) for record in zip(names, ages)]
row_labels = [f"marks{i}" for i in range(1, len(data) + 1)]
df = pd.DataFrame(data, index=row_labels, columns=['Name', 'Age'])
print(df)

         Name  Age
marks1    Tom   20
marks2   Nick   21
marks3  Krish   19
marks4   Jack   18


In [14]:
# The first record deliberately has no age, so its 'Age' cell comes out as NaN.
data = [
    ['Tom'],
    ['Nick', 21],
    ['Krish', 19],
    ['Jack', 18],
]
row_labels = ['marks1', 'marks2', 'marks3', 'marks4']
df = pd.DataFrame(data, index=row_labels, columns=['Name', 'Age'])
print(df)

print("\n")

# Boolean mask of missing cells, then the per-column count of missing values.
missing = df.isnull()
print(missing)

print("\n")
print(missing.sum())

         Name   Age
marks1    Tom   NaN
marks2   Nick  21.0
marks3  Krish  19.0
marks4   Jack  18.0


         Name    Age
marks1  False   True
marks2  False  False
marks3  False  False
marks4  False  False


Name    0
Age     1
dtype: int64


# Series
### Series - a one-dimensional labelled array (for example, a single row or a single column of a DataFrame)

In [17]:
# A Series built from a plain list gets the default integer labels 0..4.
a = list(range(1, 6))
ds = pd.Series(a)
print(ds)

print("\n")

# With the default index, 3 is an actual label, so this prints the fourth value.
fourth = ds[3]
print(fourth)

0    1
1    2
2    3
3    4
4    5
dtype: int64


4


In [22]:
# Series with explicit string labels A..E instead of the default 0..n-1 index.
a=[1,2,3,4,5]
ds=pd.Series(a,index=['A','B','C','D','E'])
print(ds)

print("\n")

print(ds['B'])     # label-based lookup: prints the value stored under 'B'

print("\n")

# Positional lookup goes through .iloc. Plain ds[3] on a string-labelled index
# only worked through an integer-position fallback that newer pandas deprecates.
print(ds.iloc[3])       # prints the value at position 3 (the fourth element)

A    1
B    2
C    3
D    4
E    5
dtype: int64


2


4


In [28]:
# Series from a dictionary: each key becomes the label of its value.

my_dict = {
    'Ball': 34,
    "Bat": 32,
    "Tennis": 45,
}
all_items = pd.Series(my_dict)
print(all_items)

print("\n")
# Passing index= keeps only the listed keys, in the given order.
wanted_labels = ['Ball', 'Tennis']
print(pd.Series(my_dict, index=wanted_labels))

Ball      34
Bat       32
Tennis    45
dtype: int64


Ball      34
Tennis    45
dtype: int64


In [30]:
# Dict of equal-length lists -> DataFrame with one column per key.
dic = {
    'Calories': [100, 200, 300],
    "Duration": [20, 30, 40],
}
calories_frame = pd.DataFrame(dic)
print(calories_frame)


   Calories  Duration
0       100        20
1       200        30
2       300        40


In [None]:
# Write the frame, index included, to a comma separated file named Test1.csv
# in the working directory (file->open->you'll find a Test1.csv file).
df.to_csv('Test1.csv')

print("\n")

# to_csv stored the index as the first, unnamed column. index_col=0 reads it
# back as the index instead of as an extra "Unnamed: 0" data column, so the
# reloaded frame has the same layout as the one that was saved.
df=pd.read_csv('Test1.csv', index_col=0)
print(df.to_string())

In [12]:
import pandas as pd
# Load the dataset from the working directory and print it in full.
csv_path = 'mercedesbenz.csv'
df = pd.read_csv(csv_path)
print(df)

print("\n")

# Last expression of the cell: the first 5 rows, shown as the cell's result.
df.head()

        ID       y  X0 X1  X2 X3 X4  X5 X6 X8  ...  X375  X376  X377  X378  \
0        0  130.81   k  v  at  a  d   u  j  o  ...     0     0     1     0   
1        6   88.53   k  t  av  e  d   y  l  o  ...     1     0     0     0   
2        7   76.26  az  w   n  c  d   x  j  x  ...     0     0     0     0   
3        9   80.62  az  t   n  f  d   x  l  e  ...     0     0     0     0   
4       13   78.02  az  v   n  f  d   h  d  n  ...     0     0     0     0   
...    ...     ...  .. ..  .. .. ..  .. .. ..  ...   ...   ...   ...   ...   
4204  8405  107.39  ak  s  as  c  d  aa  d  q  ...     1     0     0     0   
4205  8406  108.77   j  o   t  d  d  aa  h  h  ...     0     1     0     0   
4206  8412  109.22  ak  v   r  a  d  aa  g  e  ...     0     0     1     0   
4207  8415   87.48  al  r   e  f  d  aa  l  u  ...     0     0     0     0   
4208  8417  110.85   z  r  ae  c  d  aa  g  w  ...     1     0     0     0   

      X379  X380  X382  X383  X384  X385  
0        0     0    

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# info() prints a summary of the frame: index range, number of columns,
# how many columns there are of each dtype (float, integer, object) and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


In [15]:
# describe() reports count, mean, std, min, quartiles and max for the numeric
# columns (integer as well as float); object columns are left out of this call's result.
df.describe()

# The 25% / 50% / 75% rows are percentiles (50% is the median).

Unnamed: 0,ID,y,X10,X11,X12,X13,X14,X15,X16,X17,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
count,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,...,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0
mean,4205.960798,100.669318,0.013305,0.0,0.075077,0.057971,0.42813,0.000475,0.002613,0.007603,...,0.318841,0.057258,0.314802,0.02067,0.009503,0.008078,0.007603,0.001663,0.000475,0.001426
std,2437.608688,12.679381,0.11459,0.0,0.263547,0.233716,0.494867,0.021796,0.051061,0.086872,...,0.466082,0.232363,0.464492,0.142294,0.097033,0.089524,0.086872,0.040752,0.021796,0.037734
min,0.0,72.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2095.0,90.82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4220.0,99.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6314.0,109.01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8417.0,265.32,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
