In [1]:
import numpy as np
import pandas as pd


----


## The Pandas DataFrame Object

A `DataFrame` represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.). The `DataFrame` has both a row and column index; it can be thought of as a dict of Series all sharing the same index. Under the hood, the data is stored as one or more two-dimensional blocks rather than a list, dict, or some other collection of one-dimensional arrays.



### Creating DataFrame objects

A Pandas ``DataFrame`` can be constructed in a variety of ways.

```python
pandas.DataFrame(data=None, index=None, 
                 columns=None, dtype=None, copy=None)
```
Documentation Link For [pandas.DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html)

There are many ways to construct a DataFrame, though one of the most common is from a dict of equal-length lists or NumPy arrays:

In [2]:
# Create Dictionary with keys State, Population & Area
# Values are passed as list
# Each Key will be a column in DataFrame
# ! Important: Number of elements should be same for all the keys

population ={'State' : ['California','Texas','New York','Florida','Illinois'],
             'Population': [38332521,26448193,19651127,19552860,12882135],
             'Area' : [423967, 695662,141297,170312,149995]}

# Create DataFrame using the Dictionary Created
pd_population = pd.DataFrame(population)

# Print the DataFrame
print(pd_population)

        State  Population    Area
0  California    38332521  423967
1       Texas    26448193  695662
2    New York    19651127  141297
3     Florida    19552860  170312
4    Illinois    12882135  149995


In [3]:
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})

pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})

data = pd.DataFrame({'area':area, 'pop':pop})
print(data)

              area       pop
California  423967  38332521
Texas       695662  26448193
New York    141297  19651127
Florida     170312  19552860
Illinois    149995  12882135


Even if some keys in the dictionary are missing, Pandas will fill them in with ``NaN`` (i.e., "not a number") values:

In [4]:
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [5]:
data = {'State': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'Year': [2000, 2001, 2002, 2001, 2002, 2003],
        'Population': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

df_population = pd.DataFrame(data)

print(df_population)

    State  Year  Population
0    Ohio  2000         1.5
1    Ohio  2001         1.7
2    Ohio  2002         3.6
3  Nevada  2001         2.4
4  Nevada  2002         2.9
5  Nevada  2003         3.2


If need to order the columns of the DataFrame in a specific order, the order of the column is passed as parameter `columns`. DataFrame Columns in specific order

In [11]:
df_population2 = pd.DataFrame(data, columns=['Year', 'State', 'Population'])
print(df_population2)

   Year   State  Population
0  2000    Ohio         1.5
1  2001    Ohio         1.7
2  2002    Ohio         3.6
3  2001  Nevada         2.4
4  2002  Nevada         2.9
5  2003  Nevada         3.2


If additonal column is passed that is not in the dictnary, it will appear with missing values in the result.

In [7]:
indx = pd.Index(['a','b','c','d','e','f'])
df_popupation3=pd.DataFrame(data, columns=['Year', 'State', 'Population', 'Debt'], index=indx )
print(df_popupation3)

   Year   State  Population Debt
a  2000    Ohio         1.5  NaN
b  2001    Ohio         1.7  NaN
c  2002    Ohio         3.6  NaN
d  2001  Nevada         2.4  NaN
e  2002  Nevada         2.9  NaN
f  2003  Nevada         3.2  NaN


In [8]:
# Update the column with a specific value

df_popupation3['Debt']=9.2
df_popupation3

Unnamed: 0,Year,State,Population,Debt
a,2000,Ohio,1.5,9.2
b,2001,Ohio,1.7,9.2
c,2002,Ohio,3.6,9.2
d,2001,Nevada,2.4,9.2
e,2002,Nevada,2.9,9.2
f,2003,Nevada,3.2,9.2


Assigning a value to an non existent column will create the column in the dataframe. 

**Note:** New columns cannot be created with the frame2.eastern syntax

In [12]:
df_popupation3['eastern']=df_population2['State']=='Ohio'
print(df_popupation3)

   Year   State  Population  Debt eastern
a  2000    Ohio         1.5   9.2     NaN
b  2001    Ohio         1.7   9.2     NaN
c  2002    Ohio         3.6   9.2     NaN
d  2001  Nevada         2.4   9.2     NaN
e  2002  Nevada         2.9   9.2     NaN
f  2003  Nevada         3.2   9.2     NaN


When you are assigning lists or arrays to a column, the value’s length must match the
length of the DataFrame. If you assign a Series, its labels will be realigned exactly to
the `DataFrame` index, if there are any differences `NaN` is updated for those mismatching or missing indexes.

In [None]:
# We can observe the 'NaN' values are inserted for the missing index references
new_col = pd.Series([0.1, 0.2, 0.5], index=['a', 'c', 'e'])

df_popupation3['pop']=new_col
print(df_popupation3)

   Year   State  Population  Debt eastern  pop
a  2000    Ohio         1.5   9.2     NaN  0.1
b  2001    Ohio         1.7   9.2     NaN  NaN
c  2002    Ohio         3.6   9.2     NaN  0.2
d  2001  Nevada         2.4   9.2     NaN  NaN
e  2002  Nevada         2.9   9.2     NaN  0.5
f  2003  Nevada         3.2   9.2     NaN  NaN


Delete the column using `del` command

In [None]:
del df_popupation3['eastern']
print(df_popupation3)

   Year   State  Population  Debt  pop
a  2000    Ohio         1.5   9.2  0.1
b  2001    Ohio         1.7   9.2  NaN
c  2002    Ohio         3.6   9.2  0.2
d  2001  Nevada         2.4   9.2  NaN
e  2002  Nevada         2.9   9.2  0.5
f  2003  Nevada         3.2   9.2  NaN


A column in a DataFrame can be retrieved as a Series either by dict-like notation or by attribute

In [None]:
df_popupation3['State']

a      Ohio
b      Ohio
c      Ohio
d    Nevada
e    Nevada
f    Nevada
Name: State, dtype: object

The above command also can be used as Attribute like access and tab completion for column names. However, it is not recommended way to do it.

Though this is a useful shorthand, keep in mind that it does not work for all cases!.

In [None]:
print(df_popupation3.State is df_popupation3['State'])

True


In [None]:
# Observe the return result, False
df_popupation3.pop is df_popupation3['pop']

False

Do expect the output as above?. Sure No, The following is the explanation for this behavior,

* If the column names are not strings, or if the column names conflict with methods of the ``DataFrame``, this attribute-style access is not possible.

* The ``DataFrame`` has a ``pop()`` method, so ``df_popupation3.pop`` will point to this rather than the ``"pop"`` column:

`df[column]` works for any column name, but `df.column` only works when the column name is a valid Python variable name.


----


### Indexing and Slicing of DataFrames

Passing a single "index" to a ``DataFrame`` accesses a column

In [None]:
print(df_popupation3)

   Year   State  Population  Debt  pop
a  2000    Ohio         1.5   9.2  0.1
b  2001    Ohio         1.7   9.2  NaN
c  2002    Ohio         3.6   9.2  0.2
d  2001  Nevada         2.4   9.2  NaN
e  2002  Nevada         2.9   9.2  0.5
f  2003  Nevada         3.2   9.2  NaN


In [None]:
# Display the Index 0 as ndarray

print(df_popupation3.values[0])
print(type(df_popupation3.values[0]))

[2000 'Ohio' 1.5 9.2 0.1]
<class 'numpy.ndarray'>


In [None]:
# Slicing by explict Index
# Note: Result include the limiting range also.

print(df_popupation3['a':'c'])

   Year State  Population  Debt  pop
a  2000  Ohio         1.5   9.2  0.1
b  2001  Ohio         1.7   9.2  NaN
c  2002  Ohio         3.6   9.2  0.2


In [None]:
# Slicing by implicit Integer Index

print(df_popupation3[0:3])

   Year State  Population  Debt  pop
a  2000  Ohio         1.5   9.2  0.1
b  2001  Ohio         1.7   9.2  NaN
c  2002  Ohio         3.6   9.2  0.2


In [None]:
# Masking the results

df_popupation3[(df_popupation3['Population']>2.0) & (df_popupation3['Population']<3.0)]

Unnamed: 0,Year,State,Population,Debt,pop
d,2001,Nevada,2.4,9.2,
e,2002,Nevada,2.9,9.2,0.5


These slicing and indexing conventions can be a source of confusion. For example, if your `Series` or `DataFrame` has an explicit integer index, an indexing operation such as `data[1]` will use the explicit indices, while a slicing operation like `data[1:3]` will use the implicit
Python-style index.

Thus for array-style indexing, we need another convention. Here Pandas again uses the ``loc`` and ``iloc`` indexers mentioned earlier.

Using the ``iloc`` indexer, we can index the underlying array as if it is a simple NumPy array (using the implicit Python-style index), but the ``DataFrame`` index and column labels are maintained in the result.

Similarly, using the ``loc`` indexer we can index the underlying data in an array-like style but using the explicit index and column names.

In [None]:
df_popupation3.iloc[:4, :4]

Unnamed: 0,Year,State,Population,Debt
a,2000,Ohio,1.5,9.2
b,2001,Ohio,1.7,9.2
c,2002,Ohio,3.6,9.2
d,2001,Nevada,2.4,9.2


In [None]:
df_popupation3.loc[:'d', :'Debt']

Unnamed: 0,Year,State,Population,Debt
a,2000,Ohio,1.5,9.2
b,2001,Ohio,1.7,9.2
c,2002,Ohio,3.6,9.2
d,2001,Nevada,2.4,9.2


Any of the familiar NumPy-style data access patterns can be used within these indexers.

For example, in the ``loc`` indexer we can combine masking and fancy indexing as in the following:

In [None]:
df_popupation3.loc[df_popupation3['Population'] > 2, ['State','Year', 'Population']]

Unnamed: 0,State,Year,Population
c,Ohio,2002,3.6
d,Nevada,2001,2.4
e,Nevada,2002,2.9
f,Nevada,2003,3.2


Any of these indexing conventions may also be used to set or modify values; this is done in the standard way that you might be accustomed to from working with NumPy:

In [None]:
# Original State
print("\nBefore Updation")
print(df_popupation3)

# Update the records which has NaN
df_popupation3.loc[['b','d','f'], ['pop']] = 0.9

# After Updation
print("\nAfter Updation")
print(df_popupation3)
#data.iloc[0, 2] = 90
#data


Before Updation
   Year   State  Population  Debt  pop
a  2000    Ohio         1.5   9.2  0.1
b  2001    Ohio         1.7   9.2  0.9
c  2002    Ohio         3.6   9.2  0.2
d  2001  Nevada         2.4   9.2  0.9
e  2002  Nevada         2.9   9.2  0.5
f  2003  Nevada         3.2   9.2  0.9

After Updation
   Year   State  Population  Debt  pop
a  2000    Ohio         1.5   9.2  0.1
b  2001    Ohio         1.7   9.2  0.9
c  2002    Ohio         3.6   9.2  0.2
d  2001  Nevada         2.4   9.2  0.9
e  2002  Nevada         2.9   9.2  0.5
f  2003  Nevada         3.2   9.2  0.9


### Additional indexing conventions

There are a couple extra indexing conventions that might seem at odds with the preceding discussion, but nevertheless can be very useful in practice.
First, while *indexing* refers to columns, *slicing* refers to rows:

In [None]:
df_popupation3['b':'d']

Unnamed: 0,Year,State,Population,Debt,pop
b,2001,Ohio,1.7,9.2,0.9
c,2002,Ohio,3.6,9.2,0.2
d,2001,Nevada,2.4,9.2,0.9


### Selection with iloc and Loc

In [None]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [None]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [None]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [None]:
data[data<5]

Unnamed: 0,one,two,three,four
Ohio,0.0,1.0,2.0,3.0
Colorado,4.0,,,
Utah,,,,
New York,,,,


In [None]:
data.iloc[0]

one      0
two      1
three    2
four     3
Name: Ohio, dtype: int32

In [None]:
data.loc['Ohio']

one      0
two      1
three    2
four     3
Name: Ohio, dtype: int32

In [None]:
data.iloc[1, [1,3]]

two     5
four    7
Name: Colorado, dtype: int32

In [None]:
data.loc['Colorado', ['two','four']]

two     5
four    7
Name: Colorado, dtype: int32

In [None]:
data.loc[:'Utah', ['one','three']]

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10


In [None]:
data.iloc[:, :3:2]

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [None]:
data.iloc[:, :3:2][data['three']>5]

Unnamed: 0,one,three
Colorado,4,6
Utah,8,10
New York,12,14


### Arithmetic and Data Alignment

In [None]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
               index=['a', 'c', 'e', 'f', 'g'])
s1
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [None]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [None]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [None]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


### Arithmetic methods with fill values

In [None]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list('abcde'))
df2.loc[1, 'b'] = np.nan
df1
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [None]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [None]:
df1.radd(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [None]:
df1-df2

Unnamed: 0,a,b,c,d,e
0,0.0,0.0,0.0,0.0,
1,-1.0,,-1.0,-1.0,
2,-2.0,-2.0,-2.0,-2.0,
3,,,,,


In [None]:
df1.sub(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,0.0,0.0,0.0,-4.0
1,-1.0,5.0,-1.0,-1.0,-9.0
2,-2.0,-2.0,-2.0,-2.0,-14.0
3,-15.0,-16.0,-17.0,-18.0,-19.0


In [None]:
df1.rsub(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,0.0,0.0,0.0,4.0
1,1.0,-5.0,1.0,1.0,9.0
2,2.0,2.0,2.0,2.0,14.0
3,15.0,16.0,17.0,18.0,19.0



----



### Task 1 - Create a Function to Create a DataFrame

In [None]:
def gen_dataframe(ncol, nrow, maxrand=10):
    myrow = ["R"+str(x) for x in range(1, nrow+1)]
    mycol = ["C"+str(x) for x in range(1, ncol+1)]
    arr = np.random.randint(1, maxrand , (nrow, ncol))
    return pd.DataFrame(arr, index=myrow, columns=mycol)

In [None]:
df = gen_dataframe(10, 5)
df

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10
R1,7,8,8,9,2,6,9,5,4,1
R2,4,6,1,3,4,9,2,4,4,4
R3,8,1,2,1,5,8,4,3,8,3
R4,1,1,5,6,6,7,9,5,2,5
R5,9,2,2,8,4,7,8,3,1,4


### Append Mean as new row and col

In [None]:
# Demo to explain how to insert a new row and column
df.loc["Col_Mean"] = df.mean()
df["Row_Mean"] = df.mean(axis=1)
df

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,Row_Mean
R1,7.0,8.0,8.0,9.0,2.0,6.0,9.0,5.0,4.0,1.0,5.9
R2,4.0,6.0,1.0,3.0,4.0,9.0,2.0,4.0,4.0,4.0,4.1
R3,8.0,1.0,2.0,1.0,5.0,8.0,4.0,3.0,8.0,3.0,4.3
R4,1.0,1.0,5.0,6.0,6.0,7.0,9.0,5.0,2.0,5.0,4.7
R5,9.0,2.0,2.0,8.0,4.0,7.0,8.0,3.0,1.0,4.0,4.8
Col_Mean,5.8,3.6,3.6,5.4,4.2,7.4,6.4,4.0,3.8,3.4,4.76



----

