In [1]:
# pandas

# Series
# A Series is a one-dimensional array-like object containing a sequence of values (of similar types to NumPy types) and an associated array of data labels, called its index.
# The simplest Series is formed from only an array of data:

import pandas as pd
import numpy as np

# Separator appended after each printed result: a newline, a rule of 100 dashes, a newline.
end = "\n{}\n".format("-" * 100)

# Build a Series from a plain list of values; no index is supplied.
pd_series = pd.Series(data=[1, 2, 3, 4, -5])
print(pd_series, end=end)

# The printed form of a Series shows the index on the left and the values on the right.
# Because no index was given, a default one made of the integers 0 through N - 1
# (N being the number of values) is created.

# .values exposes the underlying data, .index the labels.
print(pd_series.values, pd_series.index, sep=end, end=end)

0    1
1    2
2    3
3    4
4   -5
dtype: int64
----------------------------------------------------------------------------------------------------
[ 1  2  3  4 -5]
----------------------------------------------------------------------------------------------------
RangeIndex(start=0, stop=5, step=1)
----------------------------------------------------------------------------------------------------


In [2]:
# A Series can be given an explicit index so that every data point is identified by a label.
pd_series = pd.Series([1, 2, 3, 4, -5], index=['a', 'b', 'c', 'd', 'e'])
print(pd_series, pd_series.index, sep=end, end=end)

# Unlike a NumPy array, a Series lets you select a single value (or a set of values) by label.
print(pd_series['c'], end=end)

# Overwrite the value stored under the label 'c'.
pd_series['c'] = 12

# A list of labels selects several values at once, returned in the order requested.
print(pd_series[['c', 'd', 'a']])

a    1
b    2
c    3
d    4
e   -5
dtype: int64
----------------------------------------------------------------------------------------------------
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
----------------------------------------------------------------------------------------------------
3
----------------------------------------------------------------------------------------------------
c    12
d     4
a     1
dtype: int64


In [3]:
# Boolean-mask filtering, scalar arithmetic and NumPy math functions all keep every value
# attached to its index label.
print(
    pd_series[pd_series >= 4],  # keep only the entries whose value is at least 4
    pd_series * 10,             # scalar multiplication
    pd_series + 10,             # scalar addition
    np.exp(pd_series),          # element-wise NumPy function
    sep=end,
)

c    12
d     4
dtype: int64
----------------------------------------------------------------------------------------------------
a     10
b     20
c    120
d     40
e    -50
dtype: int64
----------------------------------------------------------------------------------------------------
a    11
b    12
c    22
d    14
e     5
dtype: int64
----------------------------------------------------------------------------------------------------
a         2.718282
b         7.389056
c    162754.791419
d        54.598150
e         0.006738
dtype: float64


In [4]:
# A Series can also be created from a dict: the keys become the index, the values the data.

index_value = ['a', 'c', 'd', 'b']
values = [1, 2, 3, 4]

dict_value = {label: value for label, value in zip(index_value, values)}
print(dict_value, end=end)

pd_series = pd.Series(dict_value)
print(pd_series, end=end)

# When an explicit index is passed together with the dict, any label that has no entry
# in the dict ('z' here) is filled with NaN.
pd_series = pd.Series(dict_value, index=[*index_value, 'z'])
print(pd_series, end=end)

{'a': 1, 'c': 2, 'd': 3, 'b': 4}
----------------------------------------------------------------------------------------------------
a    1
c    2
d    3
b    4
dtype: int64
----------------------------------------------------------------------------------------------------
a    1.0
c    2.0
d    3.0
b    4.0
z    NaN
dtype: float64
----------------------------------------------------------------------------------------------------


In [5]:
# Missing data is detected with pd.isna / pd.notna (isnull / notnull are aliases of these).
# isna is True where a value is missing; notna is True where a value is present.
print(pd.isna(pd_series), pd.notna(pd_series), sep=end, end=end)

# The same checks are also available as Series methods.
print(pd_series.isna(), end=end)
print(pd_series.notna())

a    False
c    False
d    False
b    False
z     True
dtype: bool
----------------------------------------------------------------------------------------------------
a     True
c     True
d     True
b     True
z    False
dtype: bool
----------------------------------------------------------------------------------------------------
a    False
c    False
d    False
b    False
z     True
dtype: bool
----------------------------------------------------------------------------------------------------
a     True
c     True
d     True
b     True
z    False
dtype: bool


In [6]:
# Arithmetic between two Series aligns on index label automatically; a label that is present
# in only one of the operands produces NaN in the result.

index_value_1 = ['a', 'b', 'c', 'd']
index_value_2 = ['b', 'c', 'd', 'e']

values = [1, 2, 3, 4]

dict_value_1 = {label: value for label, value in zip(index_value_1, values)}
dict_value_2 = {label: value for label, value in zip(index_value_2, values)}

print(dict_value_1, end=end)
print(dict_value_2, end=end)

pd_series_1 = pd.Series(dict_value_1)
pd_series_2 = pd.Series(dict_value_2)

print(pd_series_1, end=end)
print(pd_series_2, end=end)

# Sum and difference: only 'b', 'c' and 'd' exist on both sides.
print(pd_series_1 + pd_series_2, pd_series_1 - pd_series_2, sep=end, end=end)

{'a': 1, 'b': 2, 'c': 3, 'd': 4}
----------------------------------------------------------------------------------------------------
{'b': 1, 'c': 2, 'd': 3, 'e': 4}
----------------------------------------------------------------------------------------------------
a    1
b    2
c    3
d    4
dtype: int64
----------------------------------------------------------------------------------------------------
b    1
c    2
d    3
e    4
dtype: int64
----------------------------------------------------------------------------------------------------
a    NaN
b    3.0
c    5.0
d    7.0
e    NaN
dtype: float64
----------------------------------------------------------------------------------------------------
a    NaN
b    1.0
c    1.0
d    1.0
e    NaN
dtype: float64
----------------------------------------------------------------------------------------------------


In [7]:
# Both the Series object itself and its index have a name attribute, which integrates with
# other key areas of pandas functionality.
# Note: pd_series here is still the dict-based Series with the extra 'z' label built earlier;
# the alignment example above used pd_series_1 / pd_series_2 and did not reassign it.
pd_series.name = 'values'        # shown as "Name: values" in the footer of the printout
pd_series.index.name = 'index'   # shown as a header line above the index labels
print(pd_series)


index
a    1.0
c    2.0
d    3.0
b    4.0
z    NaN
Name: values, dtype: float64


In [8]:
# Assigning to .index relabels an existing Series in place.
pd_series = pd.Series([1, 2, 3, 4])
print(pd_series, end=end)

pd_series.index = ['a', 'b', 'c', 'd']
print(pd_series, end=end)

# The new index must supply exactly one label per value. The assignment below offers
# five labels for four values and therefore fails, which is why it is left commented out.
# pd_series.index = list('abcde') # error
# print(pd_series,end=end)

0    1
1    2
2    3
3    4
dtype: int64
----------------------------------------------------------------------------------------------------
a    1
b    2
c    3
d    4
dtype: int64
----------------------------------------------------------------------------------------------------


In [9]:
# DataFrame

# A DataFrame represents a rectangular table of data and contains an ordered collection of
# columns, each of which can be a different value type (numeric, string, boolean, etc.).
# The DataFrame has both a row and a column index; it can be thought of as a dict of Series
# all sharing the same index. Under the hood, the data is stored as one or more
# two-dimensional blocks rather than a list, dict, or some other collection of
# one-dimensional arrays.

# There are many ways to construct a DataFrame; one of the most common is from a dict of
# equal-length lists or NumPy arrays.

state = ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada']
year = [2000, 2001, 2002, 2001, 2002, 2003]
pop = [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
column_names = ['state', 'year', 'pop']

# Pair every column name with its list of values.
dict_value = {name: column for name, column in zip(column_names, (state, year, pop))}
print(dict_value, end=end)

pd_df = pd.DataFrame(data=dict_value)
pd_df

{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002, 2003], 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
----------------------------------------------------------------------------------------------------


Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [10]:
# head() returns only the first five rows. It is printed explicitly because a notebook cell
# renders just its final expression, so a bare `pd_df.head()` in the middle of the cell
# would be evaluated and then silently discarded.
print(pd_df.head(), end=end)

# To change the column order, pass the desired order through the `columns` argument.
pd.DataFrame(dict_value, columns=['state', 'pop', 'year'])

Unnamed: 0,state,pop,year
0,Ohio,1.5,2000
1,Ohio,1.7,2001
2,Ohio,3.6,2002
3,Nevada,2.4,2001
4,Nevada,2.9,2002
5,Nevada,3.2,2003


In [11]:
# If `columns` names a column that is not a key of the dict, that column appears in the
# result filled with missing values.
# The column list is derived from dict_value rather than from pd_df itself, so re-running
# this cell cannot append a second 'dept' entry to an already-extended frame.
pd_df = pd.DataFrame(dict_value, columns=[*dict_value, 'dept'])
print(pd_df, end=end)
print(pd_df.columns)

    state  year  pop dept
0    Ohio  2000  1.5  NaN
1    Ohio  2001  1.7  NaN
2    Ohio  2002  3.6  NaN
3  Nevada  2001  2.4  NaN
4  Nevada  2002  2.9  NaN
5  Nevada  2003  3.2  NaN
----------------------------------------------------------------------------------------------------
Index(['state', 'year', 'pop', 'dept'], dtype='object')


In [12]:
# A single column is returned as a pd.Series, either through dict-style indexing
# or through attribute access; both forms give the same result.
print(pd_df['year'], pd_df.year, sep=end, end=end)

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64
----------------------------------------------------------------------------------------------------
0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64
----------------------------------------------------------------------------------------------------


In [13]:
# Columns can be modified by assignment.
# A scalar is broadcast to every row of the column.
pd_df['dept'] = 100
print(pd_df, end=end)

# When a list or array is assigned, its length must match the length of the DataFrame,
# so the array is sized from the frame itself instead of a hard-coded row count.
pd_df['dept'] = np.arange(len(pd_df), dtype=float)
print(pd_df, end=end)

    state  year  pop  dept
0    Ohio  2000  1.5   100
1    Ohio  2001  1.7   100
2    Ohio  2002  3.6   100
3  Nevada  2001  2.4   100
4  Nevada  2002  2.9   100
5  Nevada  2003  3.2   100
----------------------------------------------------------------------------------------------------
    state  year  pop  dept
0    Ohio  2000  1.5   0.0
1    Ohio  2001  1.7   1.0
2    Ohio  2002  3.6   2.0
3  Nevada  2001  2.4   3.0
4  Nevada  2002  2.9   4.0
5  Nevada  2003  3.2   5.0
----------------------------------------------------------------------------------------------------


In [14]:
# When a Series is assigned to a column, its labels are realigned to the DataFrame's index;
# rows whose label is absent from the Series receive a missing value.
pd_df.index = ['one', 'two', 'three', 'four', 'five', 'six']

# Only three of the six row labels are supplied.
val = pd.Series({'two': -1.2, 'four': -1.5, 'five': -1.7})

pd_df['dept'] = val
pd_df

Unnamed: 0,state,year,pop,dept
one,Ohio,2000,1.5,
two,Ohio,2001,1.7,-1.2
three,Ohio,2002,3.6,
four,Nevada,2001,2.4,-1.5
five,Nevada,2002,2.9,-1.7
six,Nevada,2003,3.2,


In [15]:
# Assigning to a column name that does not exist yet creates that column.
# Here the new column flags the rows whose state is 'Ohio'.
pd_df['eval'] = pd_df['state'].eq('Ohio')
print(pd_df, end=end)

# The del keyword removes a column, just as it removes a key from a dict.
del pd_df['eval']
print(pd_df, end=end)

        state  year  pop  dept   eval
one      Ohio  2000  1.5   NaN   True
two      Ohio  2001  1.7  -1.2   True
three    Ohio  2002  3.6   NaN   True
four   Nevada  2001  2.4  -1.5  False
five   Nevada  2002  2.9  -1.7  False
six    Nevada  2003  3.2   NaN  False
----------------------------------------------------------------------------------------------------
        state  year  pop  dept
one      Ohio  2000  1.5   NaN
two      Ohio  2001  1.7  -1.2
three    Ohio  2002  3.6   NaN
four   Nevada  2001  2.4  -1.5
five   Nevada  2002  2.9  -1.7
six    Nevada  2003  3.2   NaN
----------------------------------------------------------------------------------------------------


In [16]:
# Another common form of data is a nested dict of dicts. If the nested dict is passed to the
# DataFrame, pandas interprets the outer dict keys as the columns and the inner keys as the
# row indices.

state = ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada']
year = [2000, 2001, 2002, 2001, 2002, 2003]
pop = [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
column_names = ['state','year','pop']

# Build {state: {year: pop}} in a single pass over the rows. Each row is filed under its
# state, so outer keys keep first-appearance order and inner keys keep row order.
dict_value = {}
for state_name, yr, population in zip(state, year, pop):
    dict_value.setdefault(state_name, {})[yr] = population
pd_df = pd.DataFrame(dict_value)
pd_df

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9
2003,,3.2


In [17]:
# You can transpose the DataFrame (swap rows and columns) with syntax similar to a NumPy array.
# .T yields the transposed frame as the cell's displayed result; pd_df itself is not reassigned here.
pd_df.T

Unnamed: 0,2000,2001,2002,2003
Ohio,1.5,1.7,3.6,
Nevada,,2.4,2.9,3.2


In [18]:
# If a DataFrame's index and columns have their name attributes set, these names are
# displayed along with the frame.
pd_df.index.name = 'index_name'      # label for the row index
pd_df.columns.name = "column_name"   # label for the column index
pd_df

column_name,Ohio,Nevada
index_name,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9
2003,,3.2


In [19]:
# As with Series, the values attribute returns the data contained in the DataFrame as a
# two-dimensional ndarray. Missing entries show up as nan in the array.
pd_df.values

array([[1.5, nan],
       [1.7, 2.4],
       [3.6, 2.9],
       [nan, 3.2]])

In [20]:
# Different ways to create a DataFrame
# Creating an empty DataFrame
# (pandas is already imported as pd in the first cell of the notebook.)

# Calling the constructor with no arguments yields a frame with no rows and no columns.
df = pd.DataFrame()

# Printing it shows the empty column and index lists.
print(df)

Empty DataFrame
Columns: []
Index: []


In [21]:
# Creating a DataFrame from a list
# (pandas is already imported as pd in the first cell of the notebook.)

# A flat list of values becomes a single column.
data = [10, 20, 30, 40, 50, 60]

# The column name is provided explicitly; the row index defaults to 0..N-1.
df = pd.DataFrame(data, columns=['Numbers'])

# Display the frame.
df

Unnamed: 0,Numbers
0,10
1,20
2,30
3,40
4,50
5,60


In [22]:
# Creating a DataFrame from a list of lists
# (pandas is already imported as pd in the first cell of the notebook.)

# Each inner list is one row: [name, age].
data = [['tom', 10], ['nick', 15], ['juli', 14]]

# The column names are supplied in the same order as the values inside each row.
df = pd.DataFrame(data, columns=['Name', 'Age'])

# Display the frame.
df

Unnamed: 0,Name,Age
0,tom,10
1,nick,15
2,juli,14


In [23]:
# Creating a DataFrame from a dict of arrays/lists
# (pandas is already imported as pd in the first cell of the notebook.)

# Each key is a column name and each value is that column's data; the lists have equal length.
data = {'Name': ['Tom', 'nick', 'krish', 'jack'],
        'Age': [20, 21, 19, 18]}

# Create the DataFrame; column order follows the dict's key order.
df = pd.DataFrame(data)

# Display the frame.
df

Unnamed: 0,Name,Age
0,Tom,20
1,nick,21
2,krish,19
3,jack,18


In [24]:
# Creating a DataFrame from a list of dicts
# (pandas is already imported as pd in the first cell of the notebook.)

# Each dict is one row; its keys name the columns the values belong to.
data = [{'a': 1, 'b': 2, 'c': 3},
        {'a': 10, 'b': 20, 'c': 30}]

# Create the DataFrame: one row per dict, one column per key.
df = pd.DataFrame(data)

# Display the frame.
df

Unnamed: 0,a,b,c
0,1,2,3
1,10,20,30


In [25]:
# Index Objects

# pandas’s Index objects are responsible for holding the axis labels and other metadata (like the axis name or names). Any array or other sequence of labels you use when constructing a Series or DataFrame is internally converted to an Index:

