# Pandas


In [4]:
import pandas as pd
import numpy as np

### Playing with DataFrames

In [8]:
# Build a 5x4 frame holding the integers 0..19, with explicit row and column labels.
df = pd.DataFrame(
    data=np.arange(20).reshape(5, 4),
    index=['Row1', 'Row2', 'Row3', 'Row4', 'Row5'],
    columns=['col1', 'col2', 'col3', 'col4'],
)
df

Unnamed: 0,col1,col2,col3,col4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [9]:
# Accessing elements: label-based lookup of the row named 'Row1'.
# A single row label returns that row as a Series keyed by column name.
df.loc['Row1']

col1    0
col2    1
col3    2
col4    3
Name: Row1, dtype: int32

In [10]:
# Confirm what kind of object a single-label .loc lookup hands back.
type(df.loc['Row1'])

pandas.core.series.Series

In [11]:
# Position-based slicing: a bare ':' on both axes keeps every row and every column.
df.iloc[:,:]

Unnamed: 0,col1,col2,col3,col4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [12]:
# Position-based slicing: all rows, columns from position 1 onward (drops the first column).
df.iloc[:,1:]

Unnamed: 0,col2,col3,col4
Row1,1,2,3
Row2,5,6,7
Row3,9,10,11
Row4,13,14,15
Row5,17,18,19


In [13]:
# Converting the DataFrame into a NumPy array (row/column labels are not part of the array).
df.values

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

In [14]:
# Nested-list input; np.nan marks the cells that have no value.
lst = [
    [1, 2, 3],
    [3, 4, np.nan],
    [4, 5, np.nan],
    [np.nan, np.nan, np.nan],
]
df = pd.DataFrame(data=lst)

In [15]:
# Display the frame built from the nested list; rows and columns get default integer labels.
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,3.0,4.0,
2,4.0,5.0,
3,,,


### Handling missing values

In [18]:
# Five labelled rows of standard-normal draws.
# A locally seeded generator is used so that Restart & Run All yields the same
# numbers every time; the global NumPy random state is left untouched.
rng = np.random.default_rng(42)
df = pd.DataFrame(
    rng.standard_normal((5, 3)),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)

In [19]:
# Display the randomly filled frame before any missing rows are introduced.
df

Unnamed: 0,one,two,three
a,-0.870054,1.06463,0.037491
c,1.309883,-0.32649,-0.802223
e,0.386593,1.900354,-0.811596
f,-0.007582,-0.446503,-2.123514
h,1.120785,-2.541725,-0.477228


In [20]:
# Re-align onto a wider label set: labels the frame did not have
# ('b', 'd' and 'g') come back as rows filled with NaN.
df = df.reindex(index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
df

Unnamed: 0,one,two,three
a,-0.870054,1.06463,0.037491
b,,,
c,1.309883,-0.32649,-0.802223
d,,,
e,0.386593,1.900354,-0.811596
f,-0.007582,-0.446503,-2.123514
g,,,
h,1.120785,-2.541725,-0.477228


In [21]:
# Drop the rows that contain missing values. The result is only displayed;
# df itself is not reassigned.
df.dropna(axis='index')

Unnamed: 0,one,two,three
a,-0.870054,1.06463,0.037491
c,1.309883,-0.32649,-0.802223
e,0.386593,1.900354,-0.811596
f,-0.007582,-0.446503,-2.123514
h,1.120785,-2.541725,-0.477228


In [22]:
# Boolean mask over column 'one': True where the value is missing.
pd.isna(df['one'])

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [23]:
# The complementary mask: True where column 'one' holds a real value.
df['one'].notna()

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool

In [24]:
# Replace every missing value with the placeholder string 'Missing'.
# The result is only displayed; df itself is not reassigned.
# NOTE(review): putting a string into float columns presumably turns them into
# object dtype — confirm before using this pattern on data meant for numeric work.
df.fillna('Missing')

Unnamed: 0,one,two,three
a,-0.870054,1.06463,0.037491
b,Missing,Missing,Missing
c,1.30988,-0.32649,-0.802223
d,Missing,Missing,Missing
e,0.386593,1.90035,-0.811596
f,-0.00758204,-0.446503,-2.12351
g,Missing,Missing,Missing
h,1.12078,-2.54172,-0.477228


### CSV

In [26]:
from io import StringIO, BytesIO

In [27]:
# In-memory CSV text: a header line followed by three records.
data = '\n'.join([
    'col1,col2,col3',
    'x,y,1',
    'a,b,2',
    'c,d,3',
])

In [28]:
# The CSV payload is just a plain Python string at this point.
type(data)

str

In [29]:
# Wrap the string in StringIO so read_csv can consume it like a file.
pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,x,y,1
1,a,b,2
2,c,d,3


In [30]:
# Read only specific columns.
# When usecols is a callable, pandas calls it with each column name and keeps
# the names for which it returns True. The name is upper-cased before the
# membership test, so the allow-list has to be upper-case as well; a
# lower-case list would never match and no column would be kept.
df = pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL2'])

In [31]:
# Write the frame to 'Test.csv' (relative path, so it lands in the current working directory).
# NOTE(review): to_csv writes the row index as the first column by default;
# pass index=False if that column is not wanted in the file.
df.to_csv('Test.csv')

In [36]:
# Specifying column data types.
# The last record is deliberately one field short, so column 'd' has no value there.
data = """\
a,b,c,d
1,2,3,4
5,6,7,8
9,10,11"""
print(data)

a,b,c,d
1,2,3,4
5,6,7,8
9,10,11


In [38]:
# dtype=object asks pandas not to convert the fields to numeric types,
# so the parsed values stay as Python objects (strings for the filled cells).
df = pd.read_csv(StringIO(data),dtype=object)
df

Unnamed: 0,a,b,c,d
0,1,2,3,4.0
1,5,6,7,8.0
2,9,10,11,


In [40]:
# Inspect the Python type of one cell (row label 1, column 'a') with a single
# .loc lookup rather than chained [] indexing.
type(df.loc[1, 'a'])

str

In [41]:
# Per-column dtypes: 'a' uses the nullable 'Int64' extension type, 'b' a plain
# int and 'c' a float. Column 'd' is not listed, so pandas infers its type.
df = pd.read_csv(
    StringIO(data),
    dtype={
        'a': 'Int64',
        'b': int,
        'c': float,
    },
)

In [44]:
# Verify the dtype each column ended up with after the explicit mapping.
df.dtypes

a      Int64
b      int32
c    float64
d    float64
dtype: object

In [46]:
# The first CSV field is named 'index'; index_col=0 promotes that leading
# column to the row index instead of keeping it as a data column.
data = '\n'.join((
    'index,a,b,c',
    '4,apple,bat,5.7',
    '8,orange,cow,10',
))
pd.read_csv(StringIO(data), index_col=0)

Unnamed: 0_level_0,a,b,c
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,apple,bat,5.7
8,orange,cow,10.0


In [47]:
# Every record ends with a trailing delimiter, so each row has one more field
# than the three-name header. With default settings pandas then treats the
# first field (4, 8) as the row index, and column 'c' ends up empty.
data = ('a,b,c\n'
           '4,apple,bat,\n'
            '8,orange,cow,')
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
4,apple,bat,
8,orange,cow,


In [48]:
# index_col=False stops pandas from using the first column as the index:
# the values line up under a, b, c and the rows get a default integer index.
pd.read_csv(StringIO(data),index_col=False)

Unnamed: 0,a,b,c
0,4,apple,bat
1,8,orange,cow


In [50]:
## Combining usecols and index_col
# Same trailing-delimiter payload as before, redefined so this section can be read on its own.
data = ('a,b,c\n'
           '4,apple,bat,\n'
            '8,orange,cow,')

In [54]:
# Keep only columns 'b' and 'c', and keep index_col=False so the first field
# is not taken as the row index despite the trailing delimiters.
pd.read_csv(
    StringIO(data),
    index_col=False,
    usecols=['b', 'c'],
)

Unnamed: 0,b,c
0,apple,bat
1,orange,cow


In [55]:
# Quoting and escape characters. Very useful in NLP.
# The first field is wrapped in double quotes and contains both commas and
# backslash-escaped inner quotes.

data = 'a,b\n"hello, \\"Bob\\", nice to see you",5'

In [56]:
# Declare the backslash as the escape character so the escaped inner quotes
# are read as literal quote characters inside the field.
pd.read_csv(StringIO(data),escapechar='\\')

Unnamed: 0,a,b
0,"hello, ""Bob"", nice to see you",5


In [59]:
# URL to CSV: read_csv accepts a URL directly; sep='\t' because the file is tab-separated.
# NOTE(review): this cell needs network access and depends on the remote
# server still serving the file — confirm it runs from a fresh kernel.

df = pd.read_csv('https://download.bls.gov/pub/time.series/cu/cu.item',sep='\t')

In [60]:
# Preview only the first rows rather than dumping the whole downloaded table.
df.head()

Unnamed: 0,item_code,item_name,display_level,selectable,sort_sequence
0,AA0,All items - old base,0,T,2
1,AA0R,Purchasing power of the consumer dollar - old ...,0,T,399
2,SA0,All items,0,T,1
3,SA0E,Energy,1,T,374
4,SA0L1,All items less food,1,T,358


In [61]:
# Read JSON into a DataFrame.
# read_json expects a path or a file-like object. Handing it a raw JSON string
# is deprecated in recent pandas releases, so the text is wrapped in StringIO
# (imported from io earlier in this notebook).

Data = '{"employee_name": "James", "email": "james@gmail.com", "job_profile": [{"title1":"Team Lead", "title2":"Sr. Developer"}]}'
employee_df = pd.read_json(StringIO(Data))
employee_df

Unnamed: 0,employee_name,email,job_profile
0,James,james@gmail.com,"{'title1': 'Team Lead', 'title2': 'Sr. Develop..."
