In [1]:
import numpy as np
import pandas as pd

In [3]:
# Fix the seed of NumPy's global random state so every run draws the same numbers.
np.random.seed(8)
# Draw 25 integers from the half-open range [0, 100).
arr = np.random.randint(low=0, high=100, size=25)

In [4]:
# Inspect the 25 generated values.
arr

array([67, 84,  5, 90,  8, 83, 63, 48, 85, 60, 49, 74, 27, 13,  9, 61, 15,
       93, 98, 59, 18, 14, 93, 56,  9])

In [5]:
# A pandas Series is the labelled counterpart of a 1-D NumPy array:
# think of it as a vector or a single table column.
pd.Series(data=arr)

0     67
1     84
2      5
3     90
4      8
5     83
6     63
7     48
8     85
9     60
10    49
11    74
12    27
13    13
14     9
15    61
16    15
17    93
18    98
19    59
20    18
21    14
22    93
23    56
24     9
dtype: int64

In [6]:
# A DataFrame is a 2-D table whose columns are each a pandas Series.
# Reshape the 25 values of arr into a 5 x 5 table; rows and columns get default 0..4 labels.
df = pd.DataFrame(data=arr.reshape((5, 5)))
df

Unnamed: 0,0,1,2,3,4
0,67,84,5,90,8
1,83,63,48,85,60
2,49,74,27,13,9
3,61,15,93,98,59
4,18,14,93,56,9


In [10]:
# Rebuild the same 5 x 5 table, this time with explicit labels:
# rows are numbered 1..5 and columns are named A..E.
df = pd.DataFrame(
    arr.reshape((5, 5)),
    index=np.arange(1, 6),
    columns=list('ABCDE'),
)
df

Unnamed: 0,A,B,C,D,E
1,67,84,5,90,8
2,83,63,48,85,60
3,49,74,27,13,9
4,61,15,93,98,59
5,18,14,93,56,9


In [11]:
# Summarise the frame's structure: index, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 1 to 5
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       5 non-null      int64
 1   B       5 non-null      int64
 2   C       5 non-null      int64
 3   D       5 non-null      int64
 4   E       5 non-null      int64
dtypes: int64(5)
memory usage: 240.0 bytes


In [12]:
# Descriptive statistics for each column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D,E
count,5.0,5.0,5.0,5.0,5.0
mean,55.6,50.0,53.2,68.4,29.0
std,24.32694,33.24906,39.385276,34.789366,27.847801
min,18.0,14.0,5.0,13.0,8.0
25%,49.0,15.0,27.0,56.0,9.0
50%,61.0,63.0,48.0,85.0,9.0
75%,67.0,74.0,93.0,90.0,59.0
max,83.0,84.0,93.0,98.0,60.0


In [14]:
# DataFrame indexing looks similar to NumPy indexing, but [] selects by column label:
# a single label returns that column as a pandas Series.
type(df['A'])

pandas.core.series.Series

In [15]:
# The key inside [] must be the name of a column.
df['A']

1    67
2    83
3    49
4    61
5    18
Name: A, dtype: int64

In [17]:
# Same lookup for column 'B'.
df['B']

1    84
2    63
3    74
4    15
5    14
Name: B, dtype: int64

In [20]:
# A column can also be read with attribute (dot) access, much like table.column in SQL.
# This shorthand only works when the name is a valid Python identifier
# (a column called 'A+B', for instance, still needs df['A+B']).
df.A

1    67
2    83
3    49
4    61
5    18
Name: A, dtype: int64

In [21]:
# To select several columns at once, pass a list of names inside [] (hence the double brackets).
# The result is a DataFrame rather than a Series.
df[['A','C']]

Unnamed: 0,A,C
1,67,5
2,83,48
3,49,27
4,61,93
5,18,93


In [22]:
# Row labels of the frame.
df.index

Int64Index([1, 2, 3, 4, 5], dtype='int64')

In [23]:
# Column labels of the frame.
df.columns

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [27]:
# [] selects columns; to select a row by its integer position, use iloc.
# Positions start at 0, so iloc[1] is the second row (the one labelled 2).
df.iloc[1]

A    83
B    63
C    48
D    85
E    60
Name: 2, dtype: int64

In [26]:
# iloc is position-based (Python-style, starting at 0).
# To select a row by its index label instead, use loc: loc[1] is the row labelled 1.
df.loc[1]

A    67
B    84
C     5
D    90
E     8
Name: 1, dtype: int64

In [28]:
# Same data again, but with string row labels so that label-based (loc) and
# position-based (iloc) row selection are easy to tell apart.
df2 = pd.DataFrame(
    arr.reshape((5, 5)),
    index=['one', 'two', 'three', 'four', 'five'],
    columns=list('ABCDE'),
)
df2

Unnamed: 0,A,B,C,D,E
one,67,84,5,90,8
two,83,63,48,85,60
three,49,74,27,13,9
four,61,15,93,98,59
five,18,14,93,56,9


In [29]:
# First row, selected by position.
df2.iloc[0]

A    67
B    84
C     5
D    90
E     8
Name: one, dtype: int64

In [30]:
# The same row, selected by its label.
df2.loc['one']

A    67
B    84
C     5
D    90
E     8
Name: one, dtype: int64

In [31]:
# Add a new column by assigning to a label that does not exist yet;
# its values are the element-wise sum of columns A and B.
df['A+B'] = df['A'].add(df['B'])
df

Unnamed: 0,A,B,C,D,E,A+B
1,67,84,5,90,8,151
2,83,63,48,85,60,146
3,49,74,27,13,9,123
4,61,15,93,98,59,76
5,18,14,93,56,9,32


In [32]:
# df2 was built separately from df, so it has no 'A+B' column.
df2

Unnamed: 0,A,B,C,D,E
one,67,84,5,90,8
two,83,63,48,85,60
three,49,74,27,13,9
four,61,15,93,98,59
five,18,14,93,56,9


In [33]:
# Work on an independent copy so that changes made to df3 leave df2 untouched.
df3 = df2.copy(deep=True)

In [34]:
# Add the element-wise sum of A and B as a new column on the copy only.
df3['A+B'] = df3['A'].add(df3['B'])

In [35]:
# The copy now carries the extra 'A+B' column.
df3

Unnamed: 0,A,B,C,D,E,A+B
one,67,84,5,90,8,151
two,83,63,48,85,60,146
three,49,74,27,13,9,123
four,61,15,93,98,59,76
five,18,14,93,56,9,32


In [36]:
# The original df2 is unchanged: the new column exists only on the copy df3.
df2

Unnamed: 0,A,B,C,D,E
one,67,84,5,90,8
two,83,63,48,85,60
three,49,74,27,13,9
four,61,15,93,98,59
five,18,14,93,56,9


In [42]:
# Drop a column. drop() returns a new DataFrame without 'A+B'; df3 itself is not modified.
# Note: run this before the permanent drop further down. Once 'A+B' has been removed
# from df3, this call raises KeyError.
df3.drop(columns='A+B')

KeyError: "['A+B'] not found in axis"

In [39]:
# drop() returned a new DataFrame and did not modify df3,
# so the 'A+B' column is still present here.
df3

Unnamed: 0,A,B,C,D,E,A+B
one,67,84,5,90,8,151
two,83,63,48,85,60,146
three,49,74,27,13,9,123
four,61,15,93,98,59,76
five,18,14,93,56,9,32


In [40]:
# To remove the column for good, rebind df3 to the frame returned by drop().
# Reassignment is preferred over inplace=True, and errors='ignore' keeps the cell
# safe to re-run once 'A+B' is already gone.
df3 = df3.drop(columns='A+B', errors='ignore')

In [41]:
# 'A+B' is now gone from df3.
df3

Unnamed: 0,A,B,C,D,E
one,67,84,5,90,8
two,83,63,48,85,60
three,49,74,27,13,9
four,61,15,93,98,59
five,18,14,93,56,9
