# Pandas 

* Pandas is an open-source Python library providing high-performance data manipulation and analysis tools built on its powerful data structures.

* Using Pandas, we can accomplish five typical steps in the processing and analysis of data, regardless of the origin of data — load, prepare, manipulate, model, and analyze.



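* Pandas has historically dealt with the following three data structures (Panel was deprecated in pandas 0.20 and removed in 0.25, so current versions provide only Series and DataFrame)

    1) Series

    2) DataFrame

    3) Panel (removed in pandas 0.25)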
    
## 1) Series
A Series is a one-dimensional, array-like structure with homogeneous data.
    
### Creating Series
A Series can be created from various inputs, such as:

1) Array.

2) Dict.

3) Scalar value or constant

##### Installing pandas

In [1]:
!pip install pandas



#### 1) Array

In [2]:
#import the pandas library and aliasing as pd
import pandas as pd
import numpy as np

In [3]:
# Without passing an index
# build a numpy array of single-character strings
arr = np.array(list('abcd'))
print(arr)

['a' 'b' 'c' 'd']


In [4]:
#creating the series using numpy array
s = pd.Series(arr)
print(s)

0    a
1    b
2    c
3    d
dtype: object


In [5]:
#check the type
type(s)

pandas.core.series.Series

In [6]:
# Passing an explicit index
# numpy array holding the values
arr = np.array(list('abcd'))

# build the series, labelling the values 100..103 instead of 0..3
s = pd.Series(data=arr, index=list(range(100, 104)))
s

100    a
101    b
102    c
103    d
dtype: object

#### 2) Dictionary
Here the dictionary keys become the index labels of the Series.

In [7]:
# Ex1) build a Series from a dictionary (keys -> index, values -> data)
dict1 = dict(a=0, b=1, c=2)
s = pd.Series(data=dict1)
s

a    0
b    1
c    2
dtype: int64

In [8]:
type(s)

pandas.core.series.Series

#### 3) Scalar

In [9]:
# Create a Series from a scalar: the value is repeated for every index label
s = pd.Series(data='A', index=list(range(4)))
s

0    A
1    A
2    A
3    A
dtype: object

### Accessing Data from Series

#### 1) By using the position

In [10]:
# A Series can be accessed in much the same way as an ndarray.
s = pd.Series(data=list(range(1, 6)))
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [11]:
# retrieve the first element by position
# (.iloc is always positional; plain s[0] is a label lookup when the index holds integers)
s.iloc[0]

1

In [12]:
# first 3 elements
s.iloc[:3]   # slice syntax is [start:stop:step]; remember stop is exclusive

0    1
1    2
2    3
dtype: int64

In [13]:
# last 3 elements (negative positions count from the end)
s.iloc[-3:]

2    3
3    4
4    5
dtype: int64

#### 2) By using the keys 

In [14]:
# Series whose index labels are the strings 'a'..'e'
s = pd.Series(data=[1, 2, 3, 4, 5], index=list('abcde'))
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [15]:
#retrieve a single element
s['a']

1

In [16]:
# retrieve multiple elements by passing a list of labels
s[['a','c','d']]

a    1
c    3
d    4
dtype: int64

In [17]:
# if that key is not present in the index, a KeyError is raised
# s['f']

#### to get the indexes

In [18]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

#### to get the values

In [19]:
s.values

array([1, 2, 3, 4, 5], dtype=int64)

## 2) DataFrame
A DataFrame is a two-dimensional data structure, i.e., data is aligned in a tabular fashion in rows and columns.

### Create an Empty DataFrame

In [20]:
# A basic DataFrame, which can be created is an Empty Dataframe.
# import pandas as pd
df = pd.DataFrame()
print(df)

Empty DataFrame
Columns: []
Index: []


### Create a DataFrame from Dict of lists/ndArray
* All the ndarrays must be of same length. 
* If index is passed, then the length of the index should be equal to the length of the arrays.
* If no index is passed, then by default, index will be range(n), where n is the array length.

In [21]:
# Ex1) without an explicit index (rows are labelled 0..n-1)
name = ['Tom', 'Jack', 'Steve', 'Ricky']
age = list(range(22, 26))

# column label -> list of values; both lists have the same length
dict1 = dict(Name=name, Age=age)

# build the DataFrame from the dictionary
df = pd.DataFrame(data=dict1)
df

Unnamed: 0,Name,Age
0,Tom,22
1,Jack,23
2,Steve,24
3,Ricky,25


In [22]:
# Ex2) with an explicit index
name = ['Tom', 'Jack', 'Steve', 'Ricky']
age = [28, 34, 29, 42]

# row labels to use instead of the default 0..n-1
indexes = [f'rank{i}' for i in range(1, 5)]

# column label -> list of values
dict1 = dict(Name=name, Age=age)

# build the DataFrame, attaching the custom row labels
df = pd.DataFrame(data=dict1, index=indexes)
df

Unnamed: 0,Name,Age
rank1,Tom,28
rank2,Jack,34
rank3,Steve,29
rank4,Ricky,42


### Column Selection

In [23]:
df['Name']

rank1      Tom
rank2     Jack
rank3    Steve
rank4    Ricky
Name: Name, dtype: object

In [24]:
df.Age

rank1    28
rank2    34
rank3    29
rank4    42
Name: Age, dtype: int64

### Column Addition

In [25]:
df

Unnamed: 0,Name,Age
rank1,Tom,28
rank2,Jack,34
rank3,Steve,29
rank4,Ricky,42


In [26]:
# say suppose you want to add city column 
city = ['Banglore','Pune','Hyderabad','Mumbai']

#adding new column named 'City'
df['City']=city
df

Unnamed: 0,Name,Age,City
rank1,Tom,28,Banglore
rank2,Jack,34,Pune
rank3,Steve,29,Hyderabad
rank4,Ricky,42,Mumbai


### Column Deletion

In [27]:
# 1) Using del keyword
del df['City']
df

Unnamed: 0,Name,Age
rank1,Tom,28
rank2,Jack,34
rank3,Steve,29
rank4,Ricky,42


In [28]:
# 2) Using pop function
df.pop('Age')
df

Unnamed: 0,Name
rank1,Tom
rank2,Jack
rank3,Steve
rank4,Ricky


In [29]:
# 3) drop() -- returns a new DataFrame without the column; df itself is not modified
df.drop(columns=['Name'])

rank1
rank2
rank3
rank4


### Some Important functions and properties of pandas

In [30]:
# Example
# (numpy and pandas were already imported above as np and pd)

# three random numpy arrays of 20 elements each
random = np.random.randint(0, 100, size=20)
name = np.random.choice(list('ABCDE'), size=20)
choice = np.random.choice([10, 11, 13, 12, 14], size=20)

In [31]:
#creating dictionary
dict1 = {'Random':random, 'Name':name, 'Choice':choice}

In [32]:
dict1

{'Random': array([53, 80, 75, 97, 75, 17, 84, 26, 87, 19, 13,  8, 85, 58,  6,  3, 69,
        36, 91, 87]),
 'Name': array(['B', 'C', 'E', 'A', 'B', 'D', 'A', 'E', 'B', 'D', 'D', 'E', 'A',
        'D', 'C', 'D', 'C', 'B', 'E', 'E'], dtype='<U1'),
 'Choice': array([10, 12, 13, 11, 12, 13, 11, 14, 12, 10, 12, 13, 12, 11, 11, 14, 12,
        14, 14, 11])}

In [33]:
#creating dataframe
df = pd.DataFrame(dict1)
df

Unnamed: 0,Random,Name,Choice
0,53,B,10
1,80,C,12
2,75,E,13
3,97,A,11
4,75,B,12
5,17,D,13
6,84,A,11
7,26,E,14
8,87,B,12
9,19,D,10


In [34]:
# to check the type
type(df)

pandas.core.frame.DataFrame

In [35]:
#to get the dimension or shape
df.shape

(20, 3)

In [36]:
# to get the column names
df.columns

Index(['Random', 'Name', 'Choice'], dtype='object')

In [37]:
#to get the values
df.values

array([[53, 'B', 10],
       [80, 'C', 12],
       [75, 'E', 13],
       [97, 'A', 11],
       [75, 'B', 12],
       [17, 'D', 13],
       [84, 'A', 11],
       [26, 'E', 14],
       [87, 'B', 12],
       [19, 'D', 10],
       [13, 'D', 12],
       [8, 'E', 13],
       [85, 'A', 12],
       [58, 'D', 11],
       [6, 'C', 11],
       [3, 'D', 14],
       [69, 'C', 12],
       [36, 'B', 14],
       [91, 'E', 14],
       [87, 'E', 11]], dtype=object)

In [38]:
#to get the first 5 rows
df.head()

Unnamed: 0,Random,Name,Choice
0,53,B,10
1,80,C,12
2,75,E,13
3,97,A,11
4,75,B,12


In [39]:
# to get first 10 rows 
df.head(10)

Unnamed: 0,Random,Name,Choice
0,53,B,10
1,80,C,12
2,75,E,13
3,97,A,11
4,75,B,12
5,17,D,13
6,84,A,11
7,26,E,14
8,87,B,12
9,19,D,10


In [40]:
# to get the last 5 rows
df.tail()

Unnamed: 0,Random,Name,Choice
15,3,D,14
16,69,C,12
17,36,B,14
18,91,E,14
19,87,E,11


In [41]:
# to get the last 10 rows
df.tail(10)

Unnamed: 0,Random,Name,Choice
10,13,D,12
11,8,E,13
12,85,A,12
13,58,D,11
14,6,C,11
15,3,D,14
16,69,C,12
17,36,B,14
18,91,E,14
19,87,E,11


In [42]:
#to get the information about the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Random  20 non-null     int32 
 1   Name    20 non-null     object
 2   Choice  20 non-null     int32 
dtypes: int32(2), object(1)
memory usage: 448.0+ bytes


In [43]:
# by default describe() summarises only the columns that contain numerical values
df.describe()

Unnamed: 0,Random,Choice
count,20.0,20.0
mean,53.45,12.1
std,33.603689,1.293709
min,3.0,10.0
25%,18.5,11.0
50%,63.5,12.0
75%,84.25,13.0
max,97.0,14.0


In [44]:
# similarly you can check for individual columns
df['Random'].describe()

count    20.000000
mean     53.450000
std      33.603689
min       3.000000
25%      18.500000
50%      63.500000
75%      84.250000
max      97.000000
Name: Random, dtype: float64

### Dealing with csv files
#### Reading CSV files
* syntax ----> pd.read_csv(filename)
* returns a data frame

In [45]:
# Ex1)
marks_df = pd.read_csv('marks.csv')

In [46]:
marks_df

Unnamed: 0,Name,Physics,Chemistry,Maths,Biology
0,A,65,69,87,90
1,B,44,76,57,80
2,C,57,85,79,75
3,D,54,66,71,70
4,E,74,86,75,88
5,F,64,67,77,81


In [47]:
type(marks_df)

pandas.core.frame.DataFrame

In [48]:
#Ex2) Your actual datasets will look something like this
train_data = pd.read_csv('train.csv')

In [49]:
train_data.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [50]:
train_data.shape

(4209, 378)

#### writing to csv files
* syntax -------->  df.to_csv('filename.csv')

In [51]:
df

Unnamed: 0,Random,Name,Choice
0,53,B,10
1,80,C,12
2,75,E,13
3,97,A,11
4,75,B,12
5,17,D,13
6,84,A,11
7,26,E,14
8,87,B,12
9,19,D,10


In [52]:
df.to_csv('file.csv')

In [53]:
df.head()

Unnamed: 0,Random,Name,Choice
0,53,B,10
1,80,C,12
2,75,E,13
3,97,A,11
4,75,B,12


#### Setting a column as index
* syntax ---> df.set_index('ColumnName')
* returns a new dataframe by default; with inplace=True it modifies df in place and returns None

In [54]:
df.set_index('Random',inplace = True)

#df = df.set_index('Random')

In [55]:
df

Unnamed: 0_level_0,Name,Choice
Random,Unnamed: 1_level_1,Unnamed: 2_level_1
53,B,10
80,C,12
75,E,13
97,A,11
75,B,12
17,D,13
84,A,11
26,E,14
87,B,12
19,D,10


#### Sorting by index
* syntax --> df.sort_index()
* returns new dataframe

In [56]:
df.sort_index(ascending=False)

Unnamed: 0_level_0,Name,Choice
Random,Unnamed: 1_level_1,Unnamed: 2_level_1
97,A,11
91,E,14
87,E,11
87,B,12
85,A,12
84,A,11
80,C,12
75,E,13
75,B,12
69,C,12


#### Sorting by columns
* syntax ----> df.sort_values(by = 'ColumnName')
* returns new dataframe

In [57]:
df.head()

Unnamed: 0_level_0,Name,Choice
Random,Unnamed: 1_level_1,Unnamed: 2_level_1
53,B,10
80,C,12
75,E,13
97,A,11
75,B,12


In [58]:
df.sort_values(by='Choice', ascending = False)

Unnamed: 0_level_0,Name,Choice
Random,Unnamed: 1_level_1,Unnamed: 2_level_1
91,E,14
36,B,14
26,E,14
3,D,14
75,E,13
17,D,13
8,E,13
13,D,12
75,B,12
87,B,12
