## Pandas --- 
### Pandas is an open-source data manipulation and analysis library for Python. It provides data structures and functions that make it easy to work with structured data, such as tables and time series, through its main data structure called a DataFrame

In [121]:
# Importing library --- Pandas
import pandas as pd
import numpy as np

### Dataframe --- 

In [122]:
arr1 = np.arange(0,20).reshape(5,4)

In [123]:
arr1

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

In [124]:
pd.DataFrame(data=arr1,index=['Row1','Row2','Row3','Row4','Row5'],columns=['Column1','Column2','Column3','Column4'])

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [125]:
arr2 = pd.DataFrame(data=np.random.randint(1,101,(5,4)),index=['Row1','Row2','Row3','Row4','Row5'],columns=['A','B','C','D'])

In [126]:
arr2

Unnamed: 0,A,B,C,D
Row1,5,68,35,83
Row2,76,54,98,14
Row3,88,64,6,54
Row4,94,62,19,7
Row5,77,99,48,22


In [127]:
# To get top 5 reords of a dataframe
arr2.head()

Unnamed: 0,A,B,C,D
Row1,5,68,35,83
Row2,76,54,98,14
Row3,88,64,6,54
Row4,94,62,19,7
Row5,77,99,48,22


In [128]:
# To get bottom 5 records of a dataframe
arr2.tail()

Unnamed: 0,A,B,C,D
Row1,5,68,35,83
Row2,76,54,98,14
Row3,88,64,6,54
Row4,94,62,19,7
Row5,77,99,48,22


In [129]:
# numbers of records can be given to alter the top or bottom values
print(arr2.head(2))
print("-------------------")
print(arr2.tail(2))

       A   B   C   D
Row1   5  68  35  83
Row2  76  54  98  14
-------------------
       A   B   C   D
Row4  94  62  19   7
Row5  77  99  48  22


### To get the insights of the dataframe use ---

In [130]:
arr2.describe()

Unnamed: 0,A,B,C,D
count,5.0,5.0,5.0,5.0
mean,68.0,69.4,41.2,36.0
std,36.020827,17.314734,35.506337,31.835515
min,5.0,54.0,6.0,7.0
25%,76.0,62.0,19.0,14.0
50%,77.0,64.0,35.0,22.0
75%,88.0,68.0,48.0,54.0
max,94.0,99.0,98.0,83.0


In [131]:
# To read selected columns
arr2[['A','B','C']]

Unnamed: 0,A,B,C
Row1,5,68,35
Row2,76,54,98
Row3,88,64,6
Row4,94,62,19
Row5,77,99,48


In [132]:
# To read selected rows --- for example --- if one need to read row 1 and 3 
arr2.loc[['Row1','Row2']]

Unnamed: 0,A,B,C,D
Row1,5,68,35,83
Row2,76,54,98,14


In [133]:
# To read selected rows and columns both for example --- if one need to read row 1 to 3 and column 1 and 2
arr2.iloc[0:3,0:2]

Unnamed: 0,A,B
Row1,5,68
Row2,76,54
Row3,88,64


In [134]:
# NaN is null value in data set --- NaN --- Not a Number
np.nan

nan

In [135]:
# Create data frame with nan value
arr = pd.DataFrame(data=[[1,2,np.nan,4],[5,6,7,8],[5,4,3,2]],index=['Row1,','Row2','Row3'],columns=['Column1','Column2','Column3','Column4'])
arr

Unnamed: 0,Column1,Column2,Column3,Column4
"Row1,",1,2,,4
Row2,5,6,7.0,8
Row3,5,4,3.0,2


In [136]:
# To check null values in data set
arr.isnull()

Unnamed: 0,Column1,Column2,Column3,Column4
"Row1,",False,False,True,False
Row2,False,False,False,False
Row3,False,False,False,False


In [137]:
# To check null values by count
arr.isnull().sum()

Column1    0
Column2    0
Column3    1
Column4    0
dtype: int64

In [138]:
# Alternate way to chek missing cells in the data set
arr.isna().sum()

Column1    0
Column2    0
Column3    1
Column4    0
dtype: int64

In [139]:
# To check information about dataset
arr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, Row1, to Row3
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Column1  3 non-null      int64  
 1   Column2  3 non-null      int64  
 2   Column3  2 non-null      float64
 3   Column4  3 non-null      int64  
dtypes: float64(1), int64(3)
memory usage: 120.0+ bytes


In [140]:
# to get the value count in data set --- IMP --- This applies only for Series not dataframe
print(arr)
print("\n\n----------------------------------------------")
arr['Column2'].value_counts()

       Column1  Column2  Column3  Column4
Row1,        1        2      NaN        4
Row2         5        6      7.0        8
Row3         5        4      3.0        2


----------------------------------------------


Column2
2    1
6    1
4    1
Name: count, dtype: int64

In [141]:
# To check unique value in data series
print(arr,"\n------------------------------------------")
arr['Column1'].unique()

       Column1  Column2  Column3  Column4
Row1,        1        2      NaN        4
Row2         5        6      7.0        8
Row3         5        4      3.0        2 
------------------------------------------


array([1, 5], dtype=int64)

In [142]:
# To check number of unique value
arr['Column1'].nunique()

2

In [143]:
# One more example of nunique
arr.nunique()

Column1    2
Column2    3
Column3    2
Column4    3
dtype: int64

In [144]:
# axis=0 is for columns and asix=1 is for rows in nunique
arr.nunique(axis=0)

Column1    2
Column2    3
Column3    2
Column4    3
dtype: int64

In [145]:
arr.nunique(axis=1)

Row1,    3
Row2     4
Row3     4
dtype: int64

In [146]:
# To apply indexing condition in data set
print(arr.head())
arr['Column2']>5

       Column1  Column2  Column3  Column4
Row1,        1        2      NaN        4
Row2         5        6      7.0        8
Row3         5        4      3.0        2


Row1,    False
Row2      True
Row3     False
Name: Column2, dtype: bool

In [147]:
# To print the contioned index values
print(arr)
arr[arr['Column2']>5]

       Column1  Column2  Column3  Column4
Row1,        1        2      NaN        4
Row2         5        6      7.0        8
Row3         5        4      3.0        2


Unnamed: 0,Column1,Column2,Column3,Column4
Row2,5,6,7.0,8


In [148]:
# To convert dataframe into other format use arr.to then you will get the drop down to select the option
arr.to_csv('test.csv')

In [149]:
arr.to_csv('test1.csv',index=False)

In [150]:
# To Read csv file
pd.read_csv('test1.csv')

Unnamed: 0,Column1,Column2,Column3,Column4
0,1,2,,4
1,5,6,7.0,8
2,5,4,3.0,2


In [151]:
# To convert in excel file
data = pd.DataFrame(data=np.random.randint(1,101,(5,4)),index=['Row1','Row2','Row3','Row4','Row5'],columns=['Column1','Column2','Column3','Column4'])

In [154]:
file_name='my_data.xlsx'
data.to_excel(file_name,index=False)

In [155]:
# To read excel file 
pd.read_excel('my_data.xlsx')

Unnamed: 0,Column1,Column2,Column3,Column4
0,38,21,41,26
1,87,56,7,8
2,24,68,84,35
3,54,87,23,61
4,92,66,73,45


In [162]:
# To change the seperator in csv file use sep after the file name