In [1]:
import numpy as np
import pandas as pd

In [2]:
# A pandas Series wraps a NumPy array; these strings serve as index labels.
labels = list('abc')

In [3]:
# Plain Python list of sample values: 10, 20, 30.
mylist = list(range(10, 40, 10))

In [4]:
# Convert the Python list into a NumPy array.
arr = np.asarray(mylist)

In [5]:
# Display the array built from mylist.
arr

array([10, 20, 30])

In [6]:
# Mapping from label to value: a -> 10, b -> 20, c -> 30.
d = dict(zip('abc', (10, 20, 30)))

In [7]:
# Series from a plain list; with no index given, pandas assigns 0..n-1.
pd.Series(mylist)

0    10
1    20
2    30
dtype: int64

In [9]:
# Series from the NumPy array, labelled with the custom string index.
pd.Series(arr, labels)

a    10
b    20
c    30
dtype: int32

In [10]:
# Series of four integers keyed by string labels.
ser1 = pd.Series([1, 2, 3, 4], index='USA AUS IND GER'.split())

In [11]:
# Display the Series together with its string index labels.
ser1

USA    1
AUS    2
IND    3
GER    4
dtype: int64

In [13]:
# Label-based lookup of a single element.
ser1.loc['USA']

1

In [14]:
# Arithmetic between pandas objects can upcast integer data to float behind the scenes (e.g. when index alignment introduces NaN).

In [15]:
from numpy.random import randn

In [16]:
# Seed NumPy's global random generator so the random matrix drawn below is reproducible.
np.random.seed(101)

In [17]:
# 5x4 matrix of samples from the standard normal distribution.
rand_mat = np.random.randn(5, 4)

In [18]:
# Display the 5x4 matrix of random values.
rand_mat

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

In [24]:
# Wrap the random matrix in a DataFrame with rows A-E and columns W-Z.
df = pd.DataFrame(rand_mat, index=list('ABCDE'), columns=list('WXYZ'))

In [25]:
# Select columns W and X for every row.
df.loc[:, ['W', 'X']]

Unnamed: 0,W,X
A,2.70685,0.628133
B,0.651118,-0.319318
C,-2.018168,0.740122
D,0.188695,-0.758872
E,0.190794,1.978757


In [26]:
# Derived column: element-wise sum of W and X.
df['NEW'] = df['W'].add(df['X'])

In [27]:
# Display the frame, which now includes the NEW column.
df

Unnamed: 0,W,X,Y,Z,NEW
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


In [29]:
 df.drop('NEW',axis=1,inplace =True)

In [30]:
# Remove the row labelled 'C'. The explicit index= keyword replaces the
# positional label + axis=0, and the frame is rebound instead of mutated in place.
df = df.drop(index='C')

In [31]:
# Display the frame after column NEW and row C have been dropped.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [33]:
# Label-based row lookup: returns row 'A' as a Series indexed by column name.
df.loc['A']

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [36]:
# First three rows by position.
df.head(3)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057


In [38]:
# Select rows A, B and columns X, Y in a single .loc call rather than chaining
# two indexing operations (which builds an intermediate frame and is unsafe for assignment).
df.loc[['A', 'B'], ['X', 'Y']]

Unnamed: 0,X,Y
A,0.628133,0.907969
B,-0.319318,-0.848077


In [39]:
# Element-wise boolean mask: True where the value is strictly positive.
df.gt(0)

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
D,True,False,False,True
E,True,True,True,True


In [41]:
# Keep only the rows whose W value is strictly positive.
df.loc[df['W'].gt(0)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [42]:
# Row masks: W strictly positive, and Y strictly positive.
cond1 = df['W'].gt(0)
cond2 = df['Y'].gt(0)

In [43]:
# Rows that satisfy both masks (element-wise AND).
df.loc[cond1 & cond2]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
E,0.190794,1.978757,2.605967,0.683509


In [45]:
# Move the row labels into a regular 'index' column and number the rows 0..n-1.
# The result is only displayed, not assigned, so df keeps its A-E labels.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,D,0.188695,-0.758872,-0.933237,0.955057
3,E,0.190794,1.978757,2.605967,0.683509


In [46]:
# Print a structural summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, A to E
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   W       4 non-null      float64
 1   X       4 non-null      float64
 2   Y       4 non-null      float64
 3   Z       4 non-null      float64
dtypes: float64(4)
memory usage: 332.0+ bytes


In [49]:
# Show the dtype of each column.
df.dtypes

W    float64
X    float64
Y    float64
Z    float64
dtype: object

In [50]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,W,X,Y,Z
count,4.0,4.0,4.0,4.0
mean,0.934364,0.382175,0.433156,0.687089
std,1.201506,1.211563,1.678804,0.193206
min,0.188695,-0.758872,-0.933237,0.503826
25%,0.19027,-0.429207,-0.869367,0.58043
50%,0.420956,0.154407,0.029946,0.644737
75%,1.165051,0.965789,1.332469,0.751396
max,2.70685,1.978757,2.605967,0.955057


In [51]:
# Drop rows that have fewer than 2 non-NA values (dropna works on rows, axis=0, by default).
# NOTE(review): the earlier wording said "columns"; pass axis=1 if columns were intended.
df.dropna(thresh = 2)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [52]:
# Replace any missing entries with a placeholder string; returns a new frame.
df.fillna('Missing Value')

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [53]:
# Fill missing numerical data with the column mean and missing categorical data with the column mode.