Pandas DataFrame and Series

Pandas is a powerful data manipulation library, widely used for data analysis and data cleaning. It provides two primary data structures: Series and DataFrame. A Series is a one-dimensional, array-like object, while a DataFrame is a two-dimensional, size-mutable, and potentially heterogeneous tabular data structure with labeled axes (rows and columns).

In [37]:
import pandas as pd


In [38]:
!pip install pandas



In [39]:
## A Series is a one-dimensional labelled array that can hold any data type,
## much like a single column of a table.
## Build one from a plain Python list; the index defaults to 0..n-1.
data = [1, 2, 3, 4, 5]
series = pd.Series(data=data)
print(series)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [40]:
## Build a Series from a dictionary: the keys become the index labels
series_dict = {'a': 1, 'b': 2, 'c': 3}

data = pd.Series(data=series_dict)
print(data)

a    1
b    2
c    3
dtype: int64


In [41]:
## A Series can be given custom index labels instead of the default 0..n-1.
## (pandas is already imported as `pd` in the first cell, so it is not re-imported here.)
data = [1, 2, 3]
index = ['b', 'c', 'd']

pd.Series(data, index=index)

b    1
c    2
d    3
dtype: int64

In [42]:
## A DataFrame is a two-dimensional table with multiple rows and columns.
## Build one from a dictionary of lists: each key becomes a column name and
## each list supplies that column's values.
dic = {
    'Name': ['Mubtasim', 'Tousif', 'Nuhan'],
    'Age': [23, 45, 56],
    'City': ['Dhaka', 'Sylhet', 'Rangpur'],
}

df = pd.DataFrame(data=dic)
print(df)

       Name  Age     City
0  Mubtasim   23    Dhaka
1    Tousif   45   Sylhet
2     Nuhan   56  Rangpur


In [43]:
import numpy as np  # binding kept as in the original cell; later cells may rely on `np`

## Convert the DataFrame to a NumPy array.
## The columns mix strings and integers, so the resulting array has dtype=object.
df.to_numpy()

array([['Mubtasim', 23, 'Dhaka'],
       ['Tousif', 45, 'Sylhet'],
       ['Nuhan', 56, 'Rangpur']], dtype=object)

In [44]:
## Build a DataFrame from a list of dictionaries:
## each dictionary is one row, and its keys are the column names.
lst = [
    {'Name': 'Nuhan', 'Age': 25, 'City': 'Dhaka'},
    {'Name': 'Tousif', 'Age': 23, 'City': 'Sylhet'},
    {'Name': 'Gazi', 'Age': 22, 'City': 'Dhaka'},
]

df = pd.DataFrame(data=lst)
print(df)

     Name  Age    City
0   Nuhan   25   Dhaka
1  Tousif   23  Sylhet
2    Gazi   22   Dhaka


In [45]:
## Read the sample sales CSV (encoding passed explicitly as ISO-8859-1)
## and preview the first rows.
dfs = pd.read_csv(
    'sales_data_sample.csv',
    encoding='ISO-8859-1',
)
dfs.head()  # head() returns the first 5 rows by default


Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
0,10107,30,95.7,2,2871.0,2/24/2003 0:00,Shipped,1,2,2003,...,897 Long Airport Avenue,,NYC,NY,10022.0,USA,,Yu,Kwai,Small
1,10121,34,81.35,5,2765.9,5/7/2003 0:00,Shipped,2,5,2003,...,59 rue de l'Abbaye,,Reims,,51100.0,France,EMEA,Henriot,Paul,Small
2,10134,41,94.74,2,3884.34,7/1/2003 0:00,Shipped,3,7,2003,...,27 rue du Colonel Pierre Avia,,Paris,,75508.0,France,EMEA,Da Cunha,Daniel,Medium
3,10145,45,83.26,6,3746.7,8/25/2003 0:00,Shipped,3,8,2003,...,78934 Hillside Dr.,,Pasadena,CA,90003.0,USA,,Young,Julie,Medium
4,10159,49,100.0,14,5205.27,10/10/2003 0:00,Shipped,4,10,2003,...,7734 Strong St.,,San Francisco,CA,,USA,,Brown,Julie,Medium


In [46]:
## Selecting a single column by its label returns a pandas Series
print(df['Name'])
type(df['Name'])

0     Nuhan
1    Tousif
2      Gazi
Name: Name, dtype: object


pandas.core.series.Series

In [47]:
## .loc selects by index label: the row labelled 0, returned as a Series
df.loc[0]

Name    Nuhan
Age        25
City    Dhaka
Name: 0, dtype: object

In [48]:
## Label-based slice with .loc: both endpoints (labels 0 and 1) are included
df.loc[0:1]

Unnamed: 0,Name,Age,City
0,Nuhan,25,Dhaka
1,Tousif,23,Sylhet


In [49]:
## .iloc selects by integer position: the first row, returned as a Series
df.iloc[0]

Name    Nuhan
Age        25
City    Dhaka
Name: 0, dtype: object

In [50]:
## Scalar at row position 0, column position 0.
## Both positions go in a single .iloc call: the chained form df.iloc[0][0]
## applies an integer key to a label-indexed Series, which pandas has deprecated.
df.iloc[0, 0]

  df.iloc[0][0]


'Nuhan'

In [51]:
## Scalar at row position 1, column position 0.
## Both positions go in a single .iloc call: the chained form df.iloc[1][0]
## applies an integer key to a label-indexed Series, which pandas has deprecated.
df.iloc[1, 0]

  df.iloc[1][0]


'Tousif'

In [52]:

## Show the current DataFrame
df

Unnamed: 0,Name,Age,City
0,Nuhan,25,Dhaka
1,Tousif,23,Sylhet
2,Gazi,22,Dhaka


In [53]:
## Access a single value by row label and column label using .at
df.at[1,'City']

'Sylhet'

In [54]:
## Access a single value by integer position using .iat

df.iat[2,2]  ## both arguments are positions: (row position, column position)

'Dhaka'

In [55]:
## Show the DataFrame again before modifying it
df

Unnamed: 0,Name,Age,City
0,Nuhan,25,Dhaka
1,Tousif,23,Sylhet
2,Gazi,22,Dhaka


In [56]:
## Data manipulation with a DataFrame
## Add a column by assigning a list that holds one value per row
df['Salary'] = [5_000, 6_000, 7_000]
df

Unnamed: 0,Name,Age,City,Salary
0,Nuhan,25,Dhaka,5000
1,Tousif,23,Sylhet,6000
2,Gazi,22,Dhaka,7000


In [57]:
## Remove a column

## drop() uses axis=0 by default, i.e. it looks the label up in the row index.
## 'Salary' is a column label, not a row label, so the lookup fails with a KeyError.
## The error is the point of this cell, so it is caught and printed; that way the
## notebook still runs top to bottom.
try:
    df.drop('Salary', axis=0)
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: "['Salary'] not found in axis"

In [None]:
## Drop the 'Salary' column by naming it as a column label.
## This returns a new frame; df itself is not modified.
df.drop(columns='Salary')

Unnamed: 0,Name,Age,City
0,Nuhan,25,Dhaka
1,Tousif,23,Sylhet
2,Gazi,22,Dhaka


In [None]:
## Salary is still present: drop() returned a new frame, and that result was not saved back to df.
df

Unnamed: 0,Name,Age,City,Salary
0,Nuhan,25,Dhaka,5000
1,Tousif,23,Sylhet,6000
2,Gazi,22,Dhaka,7000


In [None]:
## To make the removal stick, bind the result of drop() back to df.
## Reassignment is used instead of inplace=True so the cell reads as an
## explicit df -> df transformation.

df = df.drop(columns='Salary')

In [None]:
## Now Salary is permanently removed
df

Unnamed: 0,Name,Age,City
0,Nuhan,25,Dhaka
1,Tousif,23,Sylhet
2,Gazi,22,Dhaka


In [None]:
## Add 1 to every value in the Age column (vectorised; no loop needed).
## Note: re-running this cell increments the ages again.

df['Age'] = df['Age'].add(1)

In [None]:
## Show the DataFrame after the age increment
df


Unnamed: 0,Name,Age,City
0,Nuhan,26,Dhaka
1,Tousif,24,Sylhet
2,Gazi,23,Dhaka


In [None]:
## Drop the row whose index label is 0.
## This returns a new frame; df itself is not modified.
df.drop(index=0)

Unnamed: 0,Name,Age,City
1,Tousif,24,Sylhet
2,Gazi,23,Dhaka


In [None]:
## The row is not removed permanently: df still contains row 0
df


Unnamed: 0,Name,Age,City
0,Nuhan,26,Dhaka
1,Tousif,24,Sylhet
2,Gazi,23,Dhaka


In [58]:
## Now the row is removed for good: the result of drop() is bound back to df.
## Reassignment is used instead of inplace=True so the cell reads as an
## explicit df -> df transformation.

df = df.drop(index=0)

In [59]:
## Show the DataFrame after dropping row 0
df

Unnamed: 0,Name,Age,City,Salary
1,Tousif,23,Sylhet,6000
2,Gazi,22,Dhaka,7000


In [60]:
## Read the sample sales CSV again (same file and encoding as the earlier load)
## and preview the first rows.
dfs = pd.read_csv(
    'sales_data_sample.csv',
    encoding='ISO-8859-1',
)
dfs.head()  # head() returns the first 5 rows by default

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
0,10107,30,95.7,2,2871.0,2/24/2003 0:00,Shipped,1,2,2003,...,897 Long Airport Avenue,,NYC,NY,10022.0,USA,,Yu,Kwai,Small
1,10121,34,81.35,5,2765.9,5/7/2003 0:00,Shipped,2,5,2003,...,59 rue de l'Abbaye,,Reims,,51100.0,France,EMEA,Henriot,Paul,Small
2,10134,41,94.74,2,3884.34,7/1/2003 0:00,Shipped,3,7,2003,...,27 rue du Colonel Pierre Avia,,Paris,,75508.0,France,EMEA,Da Cunha,Daniel,Medium
3,10145,45,83.26,6,3746.7,8/25/2003 0:00,Shipped,3,8,2003,...,78934 Hillside Dr.,,Pasadena,CA,90003.0,USA,,Young,Julie,Medium
4,10159,49,100.0,14,5205.27,10/10/2003 0:00,Shipped,4,10,2003,...,7734 Strong St.,,San Francisco,CA,,USA,,Brown,Julie,Medium


In [61]:
## Display the data type of each column.
## sep="\n" puts the label on its own line; otherwise the first column's dtype
## is printed on the same line as the label.
print("Data types:", df.dtypes, sep="\n")

Data types Name      object
Age        int64
City      object
Salary     int64
dtype: object


In [65]:
## Same dtypes as a bare expression, shown via the notebook's own display instead of print()
df.dtypes

Name      object
Age        int64
City      object
Salary     int64
dtype: object

In [63]:
## Describe the DataFrame: summary statistics for the numeric columns.
## sep="\n" is used instead of embedding "\n" in the label, because print()
## would otherwise insert a space after the newline and shift the header row
## out of alignment with the rows below it.

print("Statistical Summary:", df.describe(), sep="\n")

Statistical Summary:
              Age       Salary
count   2.000000     2.000000
mean   22.500000  6500.000000
std     0.707107   707.106781
min    22.000000  6000.000000
25%    22.250000  6250.000000
50%    22.500000  6500.000000
75%    22.750000  6750.000000
max    23.000000  7000.000000


In [64]:
## Same summary as a bare expression, rendered as a table by the notebook
df.describe()

Unnamed: 0,Age,Salary
count,2.0,2.0
mean,22.5,6500.0
std,0.707107,707.106781
min,22.0,6000.0
25%,22.25,6250.0
50%,22.5,6500.0
75%,22.75,6750.0
max,23.0,7000.0
