In [1]:
import pandas as pd 
import numpy as np
# 6x4 DataFrame of standard-normal draws: rows labelled 0-5, columns 'A'-'D'.
# No seed is set in this cell, so the values differ from run to run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=list(range(6)),
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
0,-0.856463,0.908486,0.638407,0.163357
1,0.788577,-0.033347,2.247374,-1.767236
2,-0.191802,-0.898904,0.827881,0.28425
3,0.905868,0.927352,1.900433,-0.110209
4,-0.811969,-0.722878,-0.779944,0.511035
5,2.016022,0.988759,0.167592,-0.515689


In [2]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.308372,0.194911,0.833624,-0.239082
std,1.128348,0.867954,1.115537,0.827875
min,-0.856463,-0.898904,-0.779944,-1.767236
25%,-0.656927,-0.550495,0.285296,-0.414319
50%,0.298387,0.437569,0.733144,0.026574
75%,0.876545,0.922636,1.632295,0.254027
max,2.016022,0.988759,2.247374,0.511035


In [3]:
# Print the version of the installed pandas package.
print(pd.__version__)       

1.0.1


# Creating series from a list

In [4]:
# Build a Series from a plain Python list; no index is given,
# so pandas labels the elements 0..4.
a = [0, 2, 4, 6, 8]
s1 = pd.Series(data=a)
s1

0    0
1    2
2    4
3    6
4    8
dtype: int64

In [5]:
# Same values as the list `a`, but with explicit index labels taken from `b`.
b = [1, 2, 3, 4, 5]
s2 = pd.Series(data=a, index=b)
s2

1    0
2    2
3    4
4    6
5    8
dtype: int64

# Creating series from a random NumPy array

In [6]:
# Five unseeded standard-normal draws labelled 'a'..'e'.
# Note: this rebinds s2, replacing the list-based Series built in the previous cell.
n = np.random.randn(5)
index = ['a', 'b', 'c', 'd', 'e']
s2 = pd.Series(data=n, index=index)
s2

a   -0.074850
b    1.531803
c    0.008623
d   -0.890674
e    0.416961
dtype: float64

# Creating series from a dictionary

In [7]:
# Dictionary keys become the index labels, dictionary values become the data.
d = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
s3 = pd.Series(data=d)
s3

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [8]:
# Modifying the index of the series: print s1 with its current labels,
# then replace the labels in place with 'A'..'E' and display the result.
print(s1)
s1.index = ['A', 'B', 'C', 'D', 'E']
s1

0    0
1    2
2    4
3    6
4    8
dtype: int64


A    0
B    2
C    4
D    6
E    8
dtype: int64

# Slicing the data

In [9]:
# First three elements by position. The slice is only displayed; s1 is not reassigned.
s1[:3]

A    0
B    2
C    4
dtype: int64

In [10]:
# Everything except the last two elements.
s1[:-2] 

A    0
B    2
C    4
dtype: int64

In [11]:
# From the third element (position 2) to the end.
s1[2:]

C    4
D    6
E    8
dtype: int64

In [12]:
# The last two elements.
s1[-2:]

D    6
E    8
dtype: int64

In [13]:
# Redisplay s1: it still holds all five elements after the slicing cells.
s1

A    0
B    2
C    4
D    6
E    8
dtype: int64

In [14]:
# Redisplay the dictionary-based Series s3.
s3

a    1
b    2
c    3
d    4
e    5
dtype: int64

 # Appending series

In [15]:
# Stack s3 underneath s1 into a new Series (s1 and s3 themselves are left as they are).
# pd.concat is used because Series.append was deprecated in pandas 1.4 and removed
# in pandas 2.0; for two Series it produces the same result as s1.append(s3).
s4 = pd.concat([s1, s3])
s4

A    0
B    2
C    4
D    6
E    8
a    1
b    2
c    3
d    4
e    5
dtype: int64

# Deleting an element from a series

In [16]:
# Remove the element labelled 'c'. The result is displayed but not assigned back to s4.
s4.drop('c')

A    0
B    2
C    4
D    6
E    8
a    1
b    2
d    4
e    5
dtype: int64

# Series operations

In [17]:
# Two lists of different length (7 vs 5 items); s5 is built from the shorter one.
arr1 = [1, 2, 3, 4, 5, 7, 12]
arr2 = [6, 7, 8, 9, 5]
s5 = pd.Series(data=arr2)
s5

0    6
1    7
2    8
3    9
4    5
dtype: int64

In [18]:
# s6 is built from the longer list arr1 (7 items, default index 0..6).
s6 = pd.Series(data=arr1)
s6

0     1
1     2
2     3
3     4
4     5
5     7
6    12
dtype: int64

In [19]:
# Element-wise addition aligned on index labels. Labels 5 and 6 exist only in s6,
# so those positions come out as NaN and the result is float64.
s5.add(s6)

0     7.0
1     9.0
2    11.0
3    13.0
4    10.0
5     NaN
6     NaN
dtype: float64

In [20]:
# Element-wise subtraction (s5 - s6), aligned on index labels; unmatched labels give NaN.
s5.sub(s6)

0    5.0
1    5.0
2    5.0
3    5.0
4    0.0
5    NaN
6    NaN
dtype: float64

In [21]:
# Element-wise multiplication, aligned on index labels; unmatched labels give NaN.
s5.mul(s6)

0     6.0
1    14.0
2    24.0
3    36.0
4    25.0
5     NaN
6     NaN
dtype: float64

In [22]:
# Element-wise division (s5 / s6), aligned on index labels; unmatched labels give NaN.
s5.div(s6)

0    6.000000
1    3.500000
2    2.666667
3    2.250000
4    1.000000
5         NaN
6         NaN
dtype: float64

In [23]:
# Scalar summary statistics of s6, printed one per line.
print("Median:", s6.median())
print("Max:", s6.max())
print("Min:", s6.min())

Median: 4.0
Max: 12
Min: 1


# Create Dataframe

In [24]:
# Six consecutive daily timestamps starting at the current date and time;
# used below as the row index of df1. The values depend on when the cell is run.
dates = pd.date_range(start='today', periods=6)
dates

DatetimeIndex(['2020-06-25 23:01:23.224302', '2020-06-26 23:01:23.224302',
               '2020-06-27 23:01:23.224302', '2020-06-28 23:01:23.224302',
               '2020-06-29 23:01:23.224302', '2020-06-30 23:01:23.224302'],
              dtype='datetime64[ns]', freq='D')

In [25]:
# Generate a 6x4 NumPy array of random floats (unseeded, so values change between runs).
num_arr = np.random.rand(6,4)
num_arr

array([[0.14375237, 0.13642054, 0.21382019, 0.75343855],
       [0.6713646 , 0.36439845, 0.2355784 , 0.62368094],
       [0.79728163, 0.03650173, 0.97071942, 0.58754371],
       [0.60753144, 0.54824507, 0.62692353, 0.16423287],
       [0.25825348, 0.10877201, 0.34865351, 0.12316325],
       [0.58249654, 0.74091297, 0.60375263, 0.13627478]])

In [26]:
# List of labels to use as the DataFrame column names.
columns = ['A', 'B', 'C', 'D']
columns

['A', 'B', 'C', 'D']

In [27]:
# Assemble the DataFrame from its three parts: the random values,
# the date index and the column labels.
df1 = pd.DataFrame(
    data=num_arr,
    index=dates,
    columns=columns,
)
df1

Unnamed: 0,A,B,C,D
2020-06-25 23:01:23.224302,0.143752,0.136421,0.21382,0.753439
2020-06-26 23:01:23.224302,0.671365,0.364398,0.235578,0.623681
2020-06-27 23:01:23.224302,0.797282,0.036502,0.970719,0.587544
2020-06-28 23:01:23.224302,0.607531,0.548245,0.626924,0.164233
2020-06-29 23:01:23.224302,0.258253,0.108772,0.348654,0.123163
2020-06-30 23:01:23.224302,0.582497,0.740913,0.603753,0.136275


In [28]:
# Create a DataFrame from a dictionary of equal-length lists:
# each key becomes a column, and `labels` supplies the row index.
# 'age' contains one missing value (np.nan) for the missing-data examples.
data = {
    'animal': ['cat', 'dog', 'cat', 'rat', 'fish'],
    'age': [5.4, 3, 4, np.nan, 2],
    'visits': [1, 4, 6, 2, 8],
    'priority': ['yes', 'yes', 'yes', 'no', 'no'],
}
labels = ['a', 'b', 'c', 'd', 'e']

df2 = pd.DataFrame(data=data, index=labels)
df2

Unnamed: 0,animal,age,visits,priority
a,cat,5.4,1,yes
b,dog,3.0,4,yes
c,cat,4.0,6,yes
d,rat,,2,no
e,fish,2.0,8,no


In [29]:
# Summary statistics for df2; only the numeric columns ('age', 'visits') are reported,
# and the count for 'age' is 4 because one value is missing.
df2.describe()

Unnamed: 0,age,visits
count,4.0,5.0
mean,3.6,4.2
std,1.451436,2.863564
min,2.0,1.0
25%,2.75,2.0
50%,3.5,4.0
75%,4.35,6.0
max,5.4,8.0


In [30]:
# Data type of each column of df2.
df2.dtypes

animal       object
age         float64
visits        int64
priority     object
dtype: object

In [31]:
# Store the first three rows of df2 in df3 (this cell displays nothing).
df3 = df2.head(3)

In [32]:
# Display the three-row frame taken from the head of df2.
df3

Unnamed: 0,animal,age,visits,priority
a,cat,5.4,1,yes
b,dog,3.0,4,yes
c,cat,4.0,6,yes


In [33]:
# Last three rows of df2.
df2.tail(3)

Unnamed: 0,animal,age,visits,priority
c,cat,4.0,6,yes
d,rat,,2,no
e,fish,2.0,8,no


In [34]:
# Row labels of df2.
df2.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [35]:
# Column labels of df2.
df2.columns

Index(['animal', 'age', 'visits', 'priority'], dtype='object')

In [36]:
# Underlying data as a NumPy array; the mixed column types give it dtype=object.
df2.values

array([['cat', 5.4, 1, 'yes'],
       ['dog', 3.0, 4, 'yes'],
       ['cat', 4.0, 6, 'yes'],
       ['rat', nan, 2, 'no'],
       ['fish', 2.0, 8, 'no']], dtype=object)

In [37]:
# Redisplay the random 6x4 frame created in the first cell.
df

Unnamed: 0,A,B,C,D
0,-0.856463,0.908486,0.638407,0.163357
1,0.788577,-0.033347,2.247374,-1.767236
2,-0.191802,-0.898904,0.827881,0.28425
3,0.905868,0.927352,1.900433,-0.110209
4,-0.811969,-0.722878,-0.779944,0.511035
5,2.016022,0.988759,0.167592,-0.515689


In [38]:
# Transpose: row labels become column names and column names become row labels.
df2.T

Unnamed: 0,a,b,c,d,e
animal,cat,dog,cat,rat,fish
age,5.4,3,4,,2
visits,1,4,6,2,8
priority,yes,yes,yes,no,no


In [39]:
# Rows ordered by 'age', ascending; the row with a missing age is placed last.
df2.sort_values(by = 'age')

Unnamed: 0,animal,age,visits,priority
e,fish,2.0,8,no
b,dog,3.0,4,yes
c,cat,4.0,6,yes
a,cat,5.4,1,yes
d,rat,,2,no


In [40]:
# Slice rows by position: rows at positions 2, 3 and 4.
df2[2:5]

Unnamed: 0,animal,age,visits,priority
c,cat,4.0,6,yes
d,rat,,2,no
e,fish,2.0,8,no


In [41]:
# Sort by 'age' first, then take positions 2-4 of the sorted result.
df2.sort_values(by = 'age')[2:5]

Unnamed: 0,animal,age,visits,priority
c,cat,4.0,6,yes
a,cat,5.4,1,yes
d,rat,,2,no


In [42]:
# Position-based row selection: positions 1 and 2, i.e. the second and third rows.
df2.iloc[1:3]

Unnamed: 0,animal,age,visits,priority
b,dog,3.0,4,yes
c,cat,4.0,6,yes


In [43]:
# Select a subset of columns by name.
df2[['age', 'visits']]

Unnamed: 0,age,visits
a,5.4,1
b,3.0,4
c,4.0,6
d,,2
e,2.0,8


In [44]:
# Rebind df3 to a full copy of df2 (it previously held only the first three rows).
# Presumably copied so that the edits in later cells do not touch df2.
df3 = df2.copy()
df3

Unnamed: 0,animal,age,visits,priority
a,cat,5.4,1,yes
b,dog,3.0,4,yes
c,cat,4.0,6,yes
d,rat,,2,no
e,fish,2.0,8,no


In [45]:
# Boolean mask of the same shape as df3: True where a value is missing.
df3.isnull()

Unnamed: 0,animal,age,visits,priority
a,False,False,False,False
b,False,False,False,False
c,False,False,False,False
d,False,True,False,False
e,False,False,False,False


In [46]:
# Label-based assignment: set 'age' of row 'a' to 9.6. This modifies df3 itself in place.
df3.loc['a', 'age'] = 9.6
df3

Unnamed: 0,animal,age,visits,priority
a,cat,9.6,1,yes
b,dog,3.0,4,yes
c,cat,4.0,6,yes
d,rat,,2,no
e,fish,2.0,8,no


In [47]:
# Column-wise mean of the numeric columns ('age', 'visits').
# numeric_only=True is passed explicitly because df3 also contains string columns
# ('animal', 'priority'): pandas >= 2.0 no longer drops them silently and raises
# TypeError instead. On older versions the result is unchanged.
df3.mean(numeric_only=True)

age       4.65
visits    4.20
dtype: float64

In [48]:
# Select the single column 'visits' as a Series.
df3['visits']

a    1
b    4
c    6
d    2
e    8
Name: visits, dtype: int64

In [49]:
# Smallest value in the 'visits' column.
df3['visits'].min()

1

In [50]:
# Largest value in the 'visits' column.
df3['visits'].max()

8

In [51]:
# Total of the 'visits' column.
df3['visits'].sum()

21

# Creating string series

In [52]:
# Series of mixed-case strings with one missing entry (np.nan),
# used to demonstrate the .str accessor below.
string = pd.Series(
    data=['A', 'C', 'D', 'Aaa', 'BaCa', np.nan, 'CBA', 'cow', 'owl'],
)
string

0       A
1       C
2       D
3     Aaa
4    BaCa
5     NaN
6     CBA
7     cow
8     owl
dtype: object

In [53]:
# Lower-case every string; the missing entry stays NaN.
string.str.lower()

0       a
1       c
2       d
3     aaa
4    baca
5     NaN
6     cba
7     cow
8     owl
dtype: object

In [54]:
# Upper-case every string; the missing entry stays NaN.
string.str.upper()

0       A
1       C
2       D
3     AAA
4    BACA
5     NaN
6     CBA
7     COW
8     OWL
dtype: object

# Operations for DataFrame missing values

In [55]:
# Operations for DataFrame missing values: work on a copy of df3.
df4 = df3.copy()
df4

Unnamed: 0,animal,age,visits,priority
a,cat,9.6,1,yes
b,dog,3.0,4,yes
c,cat,4.0,6,yes
d,rat,,2,no
e,fish,2.0,8,no


In [56]:
# Show df4 with every NaN replaced by 829. The result is not assigned, so df4 keeps its NaN.
df4.fillna(829)

Unnamed: 0,animal,age,visits,priority
a,cat,9.6,1,yes
b,dog,3.0,4,yes
c,cat,4.0,6,yes
d,rat,829.0,2,no
e,fish,2.0,8,no


In [57]:
# Mean of the non-missing ages in df4.
# NOTE(review): meanAge is computed but never used; the fill below uses the constant 25.
# Presumably df4['age'].fillna(meanAge) was intended; confirm before changing.
meanAge = df4['age'].mean()
# Show the 'age' column with NaN replaced by 25 (result not assigned; df4 is unchanged).
df4['age'].fillna(25)

a     9.6
b     3.0
c     4.0
d    25.0
e     2.0
Name: age, dtype: float64

In [58]:
# Copy df3, then show the copy without the rows that contain any NaN.
# The dropna result is not assigned, so df5 itself keeps all rows.
df5 = df3.copy()
df5.dropna(how = 'any')

Unnamed: 0,animal,age,visits,priority
a,cat,9.6,1,yes
b,dog,3.0,4,yes
c,cat,4.0,6,yes
e,fish,2.0,8,no


# Dataframe File Operation

In [59]:
# Write df3 to 'animal.csv' in the current working directory.
df3.to_csv('animal.csv')

In [60]:
# Read the CSV back. The row labels written by to_csv come back as an ordinary column
# named 'Unnamed: 0' and a fresh 0..n-1 index is created.
# NOTE(review): passing index_col=0 would presumably restore the original index; confirm if wanted.
df_animal = pd.read_csv('animal.csv')
# Show the first three rows (or use print(df_animal) to print everything).
df_animal.head(3)

Unnamed: 0.1,Unnamed: 0,animal,age,visits,priority
0,a,cat,9.6,1,yes
1,b,dog,3.0,4,yes
2,c,cat,4.0,6,yes


In [61]:
# Round-trip df3 through an Excel workbook: write it to sheet 'Sheet1',
# then load that sheet back, treating the text 'NA' as a missing value.
df3.to_excel('animal.xlsx', sheet_name='Sheet1')
df_animal2 = pd.read_excel(
    'animal.xlsx',
    sheet_name='Sheet1',
    index_col=None,
    na_values=['NA'],
)
df_animal2

Unnamed: 0.1,Unnamed: 0,animal,age,visits,priority
0,a,cat,9.6,1,yes
1,b,dog,3.0,4,yes
2,c,cat,4.0,6,yes
3,d,rat,,2,no
4,e,fish,2.0,8,no
