## Pandas DataFrame Basics
#### DataFrame is a tabular spreadsheet like structure representing rows each of which contains one or more multpile columns

In [1]:
import numpy as np
import pandas as pd

In [2]:
dict1 = {
    "Name":['Pranab','Debdoot','Amit','Atul'],
    "Location":['Habra','Jadavpur','Bilaspur','Patna'],
    "Age":[23,24,25,25]
}

In [3]:
# DataFrame is a function of pandas library to make tabular representation of data
df = pd.DataFrame(dict1)

In [4]:
df

Unnamed: 0,Name,Location,Age
0,Pranab,Habra,23
1,Debdoot,Jadavpur,24
2,Amit,Bilaspur,25
3,Atul,Patna,25


In [5]:
# convert the current dataframe to a csv file
df.to_csv('Friends.csv')

In [6]:
# make the index false for the csv file
df.to_csv('FriendsIndexFalse.csv', index = False)

In [7]:
# show only the first two rows 
df.head(2)

Unnamed: 0,Name,Location,Age
0,Pranab,Habra,23
1,Debdoot,Jadavpur,24


In [8]:
# show only the last two rows
df.tail(2)

Unnamed: 0,Name,Location,Age
2,Amit,Bilaspur,25
3,Atul,Patna,25


In [9]:
# show the statistics of the numeric datatype column
df.describe()

Unnamed: 0,Age
count,4.0
mean,24.25
std,0.957427
min,23.0
25%,23.75
50%,24.5
75%,25.0
max,25.0


In [10]:
# read the csv file using the pandas library function read_csv()
friends = pd.read_csv('FriendsIndexFalse.csv')

In [11]:
friends

Unnamed: 0,Name,Location,Age
0,Pranab,Habra,23
1,Debdoot,Jadavpur,24
2,Amit,Bilaspur,25
3,Atul,Patna,25


In [12]:
# showing only that specified column values
friends['Location']

0       Habra
1    Jadavpur
2    Bilaspur
3       Patna
Name: Location, dtype: object

In [13]:
friends['Location'][1]

'Jadavpur'

In [14]:
# changing the specified value
friends['Location'][2] = 'Kolkata'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  friends['Location'][2] = 'Kolkata'


In [15]:
friends

Unnamed: 0,Name,Location,Age
0,Pranab,Habra,23
1,Debdoot,Jadavpur,24
2,Amit,Kolkata,25
3,Atul,Patna,25


In [16]:
# change the index naming
friends.index = ['First','Second','Third','Fourth']

In [17]:
friends

Unnamed: 0,Name,Location,Age
First,Pranab,Habra,23
Second,Debdoot,Jadavpur,24
Third,Amit,Kolkata,25
Fourth,Atul,Patna,25


## Pandas Series 
#### Series is an one dimensional array with indexes, it stores a single column or row of data in a dataFrame

In [18]:
# generates a series of values from 0 to 1 randomly
ser = pd.Series(np.random.rand(34))

In [19]:
ser

0     0.023423
1     0.980643
2     0.974188
3     0.452164
4     0.505837
5     0.377788
6     0.440599
7     0.301555
8     0.669206
9     0.601121
10    0.307853
11    0.950797
12    0.420925
13    0.523680
14    0.092075
15    0.467503
16    0.523091
17    0.541393
18    0.347914
19    0.831032
20    0.493493
21    0.709083
22    0.526841
23    0.919452
24    0.506687
25    0.180153
26    0.526110
27    0.730552
28    0.875159
29    0.147209
30    0.646107
31    0.665312
32    0.821477
33    0.087425
dtype: float64

In [20]:
type(ser)

pandas.core.series.Series

In [21]:
# creates a DF with 334 rows and 5 columns and index is from 0 to 333
newdf = pd.DataFrame(np.random.rand(334,5), index = np.arange(334))

In [22]:
newdf.head()

Unnamed: 0,0,1,2,3,4
0,0.195118,0.941653,0.985709,0.72073,0.553241
1,0.187938,0.815946,0.703573,0.659829,0.044706
2,0.740013,0.226902,0.513519,0.805808,0.353463
3,0.653254,0.620766,0.896162,0.64462,0.577401
4,0.429975,0.929095,0.243075,0.393499,0.504341


In [23]:
newdf.describe()

Unnamed: 0,0,1,2,3,4
count,334.0,334.0,334.0,334.0,334.0
mean,0.47971,0.497786,0.512489,0.526594,0.482754
std,0.284205,0.30405,0.297927,0.296189,0.283766
min,0.000316,0.000976,0.005502,0.000361,0.000414
25%,0.238093,0.215546,0.234856,0.267001,0.233716
50%,0.475131,0.532904,0.521598,0.569783,0.47492
75%,0.703701,0.780779,0.7677,0.778828,0.726815
max,0.999463,0.99942,0.994818,0.999639,0.999477


In [24]:
# show the data types of the columns
newdf.dtypes

0    float64
1    float64
2    float64
3    float64
4    float64
dtype: object

In [25]:
# if you want to add multiple datatypes in single column pandas will consider the type to be 'object'
# newdf[0][0] = 'Pranab'

In [26]:
# accessing individual elements
newdf[0][0]

0.19511817977551216

In [27]:
# show datatypes of each column
newdf.dtypes

0    float64
1    float64
2    float64
3    float64
4    float64
dtype: object

In [28]:
# show the indexes of the dataFrame
newdf.index

Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       324, 325, 326, 327, 328, 329, 330, 331, 332, 333],
      dtype='int64', length=334)

In [29]:
# describe the columns starts with 0 and total 5 columns are there with step size 1.
newdf.columns

RangeIndex(start=0, stop=5, step=1)

In [30]:
# create a numpy array
numpy_array = newdf.to_numpy()
print(type(numpy_array))
numpy_array

<class 'numpy.ndarray'>


array([[1.95118180e-01, 9.41652527e-01, 9.85709020e-01, 7.20730139e-01,
        5.53240611e-01],
       [1.87938379e-01, 8.15945895e-01, 7.03572660e-01, 6.59828637e-01,
        4.47056230e-02],
       [7.40013251e-01, 2.26902346e-01, 5.13519307e-01, 8.05807935e-01,
        3.53462712e-01],
       ...,
       [1.13372559e-02, 9.76438009e-04, 7.14248114e-01, 2.89011177e-01,
        7.85065193e-01],
       [1.34169709e-01, 4.71839926e-01, 2.76959777e-01, 2.19912186e-01,
        5.84104287e-01],
       [9.93839281e-01, 8.30668746e-01, 9.58827600e-01, 9.82874981e-01,
        8.75437652e-01]])

In [31]:
type(newdf)

pandas.core.frame.DataFrame

In [32]:
# Transpose the dataFrame
newdf.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,324,325,326,327,328,329,330,331,332,333
0,0.195118,0.187938,0.740013,0.653254,0.429975,0.688303,0.634887,0.705154,0.540657,0.380436,...,0.356418,0.251621,0.666277,0.414037,0.356087,0.399247,0.497839,0.011337,0.13417,0.993839
1,0.941653,0.815946,0.226902,0.620766,0.929095,0.152913,0.654511,0.371905,0.332694,0.22254,...,0.58238,0.440103,0.78259,0.881088,0.542534,0.552305,0.875757,0.000976,0.47184,0.830669
2,0.985709,0.703573,0.513519,0.896162,0.243075,0.968249,0.738077,0.804559,0.728041,0.607563,...,0.04798,0.489559,0.625971,0.355907,0.693858,0.040351,0.154169,0.714248,0.27696,0.958828
3,0.72073,0.659829,0.805808,0.64462,0.393499,0.683286,0.168251,0.81446,0.040871,0.365779,...,0.792783,0.624288,0.098856,0.693621,0.916243,0.403411,0.403356,0.289011,0.219912,0.982875
4,0.553241,0.044706,0.353463,0.577401,0.504341,0.833706,0.911802,0.875693,0.270008,0.880836,...,0.702334,0.848295,0.130054,0.21558,0.25868,0.463399,0.060317,0.785065,0.584104,0.875438


In [33]:
newdf.tail()

Unnamed: 0,0,1,2,3,4
329,0.399247,0.552305,0.040351,0.403411,0.463399
330,0.497839,0.875757,0.154169,0.403356,0.060317
331,0.011337,0.000976,0.714248,0.289011,0.785065
332,0.13417,0.47184,0.27696,0.219912,0.584104
333,0.993839,0.830669,0.958828,0.982875,0.875438


In [34]:
# Sort the array index row wise Descending
newdf.sort_index(axis = 0, ascending = False).head()

Unnamed: 0,0,1,2,3,4
333,0.993839,0.830669,0.958828,0.982875,0.875438
332,0.13417,0.47184,0.27696,0.219912,0.584104
331,0.011337,0.000976,0.714248,0.289011,0.785065
330,0.497839,0.875757,0.154169,0.403356,0.060317
329,0.399247,0.552305,0.040351,0.403411,0.463399


In [35]:
newdf.head()

Unnamed: 0,0,1,2,3,4
0,0.195118,0.941653,0.985709,0.72073,0.553241
1,0.187938,0.815946,0.703573,0.659829,0.044706
2,0.740013,0.226902,0.513519,0.805808,0.353463
3,0.653254,0.620766,0.896162,0.64462,0.577401
4,0.429975,0.929095,0.243075,0.393499,0.504341


In [36]:
# Sort the array index column wise Descending
newdf.sort_index(axis = 1, ascending = False).head()

Unnamed: 0,4,3,2,1,0
0,0.553241,0.72073,0.985709,0.941653,0.195118
1,0.044706,0.659829,0.703573,0.815946,0.187938
2,0.353463,0.805808,0.513519,0.226902,0.740013
3,0.577401,0.64462,0.896162,0.620766,0.653254
4,0.504341,0.393499,0.243075,0.929095,0.429975


In [37]:
# print the 0th column of the dataFrame
newdf[0]

0      0.195118
1      0.187938
2      0.740013
3      0.653254
4      0.429975
         ...   
329    0.399247
330    0.497839
331    0.011337
332    0.134170
333    0.993839
Name: 0, Length: 334, dtype: float64

In [38]:
# create a view of the dataFrame
# it is a copy of the dataFrame if i change values with newdf2 then this change will also occur at newdf
# newdf2[0][0] = 765 will similarly change the newdf[0][0] value.
newdf2 = newdf

In [39]:
# copy one dataframe to another it will be a separate dataFrame not like view
newdf3 = newdf.copy()

In [40]:
newdf3.head()

Unnamed: 0,0,1,2,3,4
0,0.195118,0.941653,0.985709,0.72073,0.553241
1,0.187938,0.815946,0.703573,0.659829,0.044706
2,0.740013,0.226902,0.513519,0.805808,0.353463
3,0.653254,0.620766,0.896162,0.64462,0.577401
4,0.429975,0.929095,0.243075,0.393499,0.504341


In [41]:
# whenever we modify some value in DataFrame it shows a warning because when we modify the
# DataFrame the pandas internal library gives us view or copy of the dataframe based on internal memory mgt.
# loc modify the original dataFrame
newdf.loc[0,0] = 0.209124

In [42]:
newdf.head()

Unnamed: 0,0,1,2,3,4
0,0.209124,0.941653,0.985709,0.72073,0.553241
1,0.187938,0.815946,0.703573,0.659829,0.044706
2,0.740013,0.226902,0.513519,0.805808,0.353463
3,0.653254,0.620766,0.896162,0.64462,0.577401
4,0.429975,0.929095,0.243075,0.393499,0.504341


In [43]:
# if loc not exists it will make that location and add the value
newdf.loc[0,5] = 0.209124

In [44]:
newdf.head()

Unnamed: 0,0,1,2,3,4,5
0,0.209124,0.941653,0.985709,0.72073,0.553241,0.209124
1,0.187938,0.815946,0.703573,0.659829,0.044706,
2,0.740013,0.226902,0.513519,0.805808,0.353463,
3,0.653254,0.620766,0.896162,0.64462,0.577401,
4,0.429975,0.929095,0.243075,0.393499,0.504341,


In [45]:
# Drop the 5th column(axis = 1) if axis=0 is given then it will remove the first row
newdf = newdf.drop(5, axis = 1)
newdf

Unnamed: 0,0,1,2,3,4
0,0.209124,0.941653,0.985709,0.720730,0.553241
1,0.187938,0.815946,0.703573,0.659829,0.044706
2,0.740013,0.226902,0.513519,0.805808,0.353463
3,0.653254,0.620766,0.896162,0.644620,0.577401
4,0.429975,0.929095,0.243075,0.393499,0.504341
...,...,...,...,...,...
329,0.399247,0.552305,0.040351,0.403411,0.463399
330,0.497839,0.875757,0.154169,0.403356,0.060317
331,0.011337,0.000976,0.714248,0.289011,0.785065
332,0.134170,0.471840,0.276960,0.219912,0.584104


In [46]:
# access certain rows,column pair values
newdf.loc[[0,1],[3,4]]

Unnamed: 0,3,4
0,0.72073,0.553241
1,0.659829,0.044706


In [47]:
# Normal python indexing can be done in accessing
# take the values from the 1st column to the 3rd column
newdf.loc[:,1:3].head()

Unnamed: 0,1,2,3
0,0.941653,0.985709,0.72073
1,0.815946,0.703573,0.659829
2,0.226902,0.513519,0.805808
3,0.620766,0.896162,0.64462
4,0.929095,0.243075,0.393499


In [48]:
# applying conditions on DataFrames
# select the rows which have 1st column value less than specified threshold
newdf.loc[(newdf[1] < 0.04)]

Unnamed: 0,0,1,2,3,4
12,0.59416,0.014572,0.723188,0.045576,0.639447
51,0.789486,0.008211,0.730933,0.12516,0.948833
59,0.933533,0.010939,0.321632,0.361597,0.08809
77,0.562861,0.039462,0.732106,0.246884,0.845538
91,0.71436,0.015664,0.265168,0.878676,0.66517
94,0.024025,0.033508,0.903203,0.675019,0.508986
107,0.25635,0.037123,0.01224,0.272079,0.860571
113,0.105502,0.005072,0.237442,0.79741,0.444133
119,0.065602,0.016657,0.515011,0.408513,0.000414
139,0.683682,0.005786,0.526489,0.000361,0.47612


In [49]:
# allowing multiple conditions
newdf.loc[(newdf[1] < 0.04) & (newdf[3] > 0.6)]

Unnamed: 0,0,1,2,3,4
91,0.71436,0.015664,0.265168,0.878676,0.66517
94,0.024025,0.033508,0.903203,0.675019,0.508986
113,0.105502,0.005072,0.237442,0.79741,0.444133
184,0.367397,0.01381,0.041137,0.840385,0.635158
254,0.065815,0.015801,0.918293,0.759699,0.143368
295,0.568125,0.020397,0.910518,0.775257,0.519


In [50]:
# whatever be the column names it will give the 0th rows 3rd Col value.
newdf.iloc[0,3]

0.7207301391355404

In [51]:
# similar slices can be done with iloc
newdf.iloc[[0,1],[3,4]]

Unnamed: 0,3,4
0,0.72073,0.553241
1,0.659829,0.044706


In [52]:
# recent view
newdf.head()

Unnamed: 0,0,1,2,3,4
0,0.209124,0.941653,0.985709,0.72073,0.553241
1,0.187938,0.815946,0.703573,0.659829,0.044706
2,0.740013,0.226902,0.513519,0.805808,0.353463
3,0.653254,0.620766,0.896162,0.64462,0.577401
4,0.429975,0.929095,0.243075,0.393499,0.504341


In [53]:
# If no axis is defined default axis=0
newdf.drop([0]).head()

Unnamed: 0,0,1,2,3,4
1,0.187938,0.815946,0.703573,0.659829,0.044706
2,0.740013,0.226902,0.513519,0.805808,0.353463
3,0.653254,0.620766,0.896162,0.64462,0.577401
4,0.429975,0.929095,0.243075,0.393499,0.504341
5,0.688303,0.152913,0.968249,0.683286,0.833706


In [54]:
# Drop multiple rows
# it returns the copy of the dataFrame doesn't change the original dataFrame
newdf.drop([0,1], axis=1).head()

Unnamed: 0,2,3,4
0,0.985709,0.72073,0.553241
1,0.703573,0.659829,0.044706
2,0.513519,0.805808,0.353463
3,0.896162,0.64462,0.577401
4,0.243075,0.393499,0.504341


In [55]:
# To change the original dataFrame use inplace
newdf.drop([0,1], axis=1, inplace=True)
newdf.head()

Unnamed: 0,2,3,4
0,0.985709,0.72073,0.553241
1,0.703573,0.659829,0.044706
2,0.513519,0.805808,0.353463
3,0.896162,0.64462,0.577401
4,0.243075,0.393499,0.504341


In [56]:
# newdf.reset_index(inplace=True)
# newdf.head()

In [57]:
# if row number if not in proper order some of them are deleted and if you want to reset the index consecutively then
newdf.reset_index(drop=True, inplace=True)
newdf.head()

Unnamed: 0,2,3,4
0,0.985709,0.72073,0.553241
1,0.703573,0.659829,0.044706
2,0.513519,0.805808,0.353463
3,0.896162,0.64462,0.577401
4,0.243075,0.393499,0.504341


In [59]:
# To remove the index column explicitly
# newdf.drop(['index'], axis=1, inplace=True)
# newdf.head()

In [60]:
# reset the column values
newdf.columns = [0, 1, 2]
newdf.head()

Unnamed: 0,0,1,2
0,0.985709,0.72073,0.553241
1,0.703573,0.659829,0.044706
2,0.513519,0.805808,0.353463
3,0.896162,0.64462,0.577401
4,0.243075,0.393499,0.504341


In [61]:
# make the 0th column value to 0 for all rows
newdf.loc[:, [0]] = 0

In [63]:
newdf.head()

Unnamed: 0,0,1,2
0,0.0,0.72073,0.553241
1,0.0,0.659829,0.044706
2,0.0,0.805808,0.353463
3,0.0,0.64462,0.577401
4,0.0,0.393499,0.504341


In [64]:
newdf.loc[:, [0]] = None
newdf.head()

Unnamed: 0,0,1,2
0,,0.72073,0.553241
1,,0.659829,0.044706
2,,0.805808,0.353463
3,,0.64462,0.577401
4,,0.393499,0.504341


In [68]:
# if null is present then show true otherwise false
newdf[0].isnull()

0      True
1      True
2      True
3      True
4      True
       ... 
329    True
330    True
331    True
332    True
333    True
Name: 0, Length: 334, dtype: bool