## Pandas Basics

- Pandas is an open-source data analysis library written in Python.
- It leverages the power and speed of NumPy to make data analysis and preprocessing easy for data scientists.
- It provides rich and highly robust data operations.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column-oriented sample records: every keyword becomes one column and the
# i-th entries of the three lists together describe the i-th student.
dict1 = dict(
    name=["Rohit", "Rohan", "Shivam", "Rahul"],
    marks=[90, 80, 70, 60],
    city=["Delhi", "Mumbai", "Chennai", "Kolkata"],
)

In [3]:
# Turn the dict of equal-length lists into a table: keys become the column
# labels and rows receive the default 0..n-1 index.
df = pd.DataFrame(data=dict1)
df

Unnamed: 0,name,marks,city
0,Rohit,90,Delhi
1,Rohan,80,Mumbai
2,Shivam,70,Chennai
3,Rahul,60,Kolkata


In [4]:
# Export the frame to CSV twice: first with the row index written as the
# leading column (the to_csv default, spelled out here), then without it.
df.to_csv("test_export_data.csv", index=True)
df.to_csv("test_export_data_index_false.csv", index=False)

In [5]:
# Preview the first two rows of the frame.
df.head(n=2)

Unnamed: 0,name,marks,city
0,Rohit,90,Delhi
1,Rohan,80,Mumbai


In [6]:
# Preview the last two rows of the frame.
df.tail(n=2)

Unnamed: 0,name,marks,city
2,Shivam,70,Chennai
3,Rahul,60,Kolkata


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the quartiles listed are the ones describe() reports by default.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,marks
count,4.0
mean,75.0
std,12.909944
min,60.0
25%,67.5
50%,75.0
75%,82.5
max,90.0


## Data Structures
#### There are two primary data structures in pandas.
- ### Series
    - A Series is a one-dimensional data structure.
- ### DataFrame
    - A DataFrame is a two-dimensional data structure.

### Series

In [8]:
# Draw 34 uniform [0, 1) floats from a seeded generator so the values shown
# below are identical after every Restart Kernel -> Run All.
rng = np.random.default_rng(42)
ser = pd.Series(rng.random(34))
# A bare `type(ser)` in the middle of a cell is evaluated and thrown away
# (only the last expression is displayed), so the type is checked explicitly.
assert isinstance(ser, pd.Series)  # pandas.core.series.Series
ser

0     0.554235
1     0.151746
2     0.951121
3     0.971143
4     0.982592
5     0.238297
6     0.001768
7     0.213459
8     0.964155
9     0.989071
10    0.482846
11    0.828251
12    0.937719
13    0.537490
14    0.716414
15    0.803144
16    0.456667
17    0.880145
18    0.553513
19    0.975251
20    0.964290
21    0.347442
22    0.241811
23    0.502474
24    0.991426
25    0.438290
26    0.984663
27    0.789856
28    0.188762
29    0.915214
30    0.606336
31    0.475068
32    0.422394
33    0.033884
dtype: float64

### DataFrame

In [9]:
# Table size kept in one place so the data shape and the index length cannot
# drift apart.
N_ROWS, N_COLS = 334, 5

# Seeded generator: the random table is the same on every fresh run, so the
# rows displayed and selected in the following cells are reproducible.
rng = np.random.default_rng(42)

# NOTE: this rebinds `df`; the small name/marks/city frame created earlier is
# no longer reachable under that name after this cell runs.
df = pd.DataFrame(rng.random((N_ROWS, N_COLS)), index=np.arange(N_ROWS))
df

Unnamed: 0,0,1,2,3,4
0,0.542409,0.739883,0.645308,0.866934,0.138324
1,0.058592,0.788724,0.448219,0.780150,0.694355
2,0.677126,0.886942,0.745726,0.303671,0.383139
3,0.748780,0.660499,0.574413,0.149298,0.088586
4,0.300162,0.014201,0.713316,0.916894,0.264554
...,...,...,...,...,...
329,0.677180,0.099890,0.058242,0.120966,0.185845
330,0.962762,0.102247,0.505363,0.156890,0.366747
331,0.275290,0.358608,0.900146,0.445781,0.659336
332,0.786493,0.147515,0.211548,0.685722,0.366693


In [10]:
# Relabel the five columns as A-E (assigned on the existing frame), then
# display the result.
df.columns = list("ABCDE")
df

Unnamed: 0,A,B,C,D,E
0,0.542409,0.739883,0.645308,0.866934,0.138324
1,0.058592,0.788724,0.448219,0.780150,0.694355
2,0.677126,0.886942,0.745726,0.303671,0.383139
3,0.748780,0.660499,0.574413,0.149298,0.088586
4,0.300162,0.014201,0.713316,0.916894,0.264554
...,...,...,...,...,...
329,0.677180,0.099890,0.058242,0.120966,0.185845
330,0.962762,0.102247,0.505363,0.156890,0.366747
331,0.275290,0.358608,0.900146,0.445781,0.659336
332,0.786493,0.147515,0.211548,0.685722,0.366693


In [11]:
# Select one row by its index label; the result is a Series keyed by the
# column names.
df.loc[3, :]

A    0.748780
B    0.660499
C    0.574413
D    0.149298
E    0.088586
Name: 3, dtype: float64

In [12]:
# Select a single column by label (same Series as df["A"]).
df.loc[:, "A"]

0      0.542409
1      0.058592
2      0.677126
3      0.748780
4      0.300162
         ...   
329    0.677180
330    0.962762
331    0.275290
332    0.786493
333    0.561040
Name: A, Length: 334, dtype: float64

In [13]:
# Select several rows and columns at once by passing a list of index labels
# and a list of column labels to .loc.
row_labels = [1, 2, 3]
column_labels = ["A", "B"]
df.loc[row_labels, column_labels]

Unnamed: 0,A,B
1,0.058592,0.788724
2,0.677126,0.886942
3,0.74878,0.660499


In [14]:

# Rows labelled 1 and 2 with every column (omitting the column selector
# keeps all columns).
df.loc[[1, 2]]

Unnamed: 0,A,B,C,D,E
1,0.058592,0.788724,0.448219,0.78015,0.694355
2,0.677126,0.886942,0.745726,0.303671,0.383139


In [15]:
# Boolean filtering: keep only the rows whose value in column A exceeds 0.5.
a_over_half = df["A"] > 0.5
df.loc[a_over_half]

Unnamed: 0,A,B,C,D,E
0,0.542409,0.739883,0.645308,0.866934,0.138324
2,0.677126,0.886942,0.745726,0.303671,0.383139
3,0.748780,0.660499,0.574413,0.149298,0.088586
6,0.920416,0.290432,0.735729,0.307730,0.905701
8,0.513918,0.124224,0.341602,0.529404,0.314646
...,...,...,...,...,...
328,0.773441,0.055141,0.113252,0.305231,0.562487
329,0.677180,0.099890,0.058242,0.120966,0.185845
330,0.962762,0.102247,0.505363,0.156890,0.366747
332,0.786493,0.147515,0.211548,0.685722,0.366693


In [16]:
# Same row filter on A > 0.5, restricted to column A; passing the column as
# a one-element list keeps the result a DataFrame rather than a Series.
keep_rows = df["A"] > 0.5
keep_columns = ["A"]
df.loc[keep_rows, keep_columns]

Unnamed: 0,A
0,0.542409
2,0.677126
3,0.748780
6,0.920416
8,0.513918
...,...
328,0.773441
329,0.677180
330,0.962762
332,0.786493


In [26]:
# `.loc` is an indexer, so labels and boolean masks go in square brackets,
# e.g. df.loc[df["A"] > 0.5]. Calling it with parentheses and a Series raises
# TypeError (unhashable type: 'Series'). The error is caught and printed so
# the mistake stays visible without aborting a top-to-bottom run.
try:
    df.loc(df["A"])
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: unhashable type: 'Series'