# pandas
* pandas is important in data science because it provides data structures and data manipulation tools that make data cleaning and analysis fast.
* pandas is mostly used alongside numerical computing tools such as NumPy and SciPy, data visualization tools such as matplotlib, and analytical libraries such as scikit-learn and statsmodels.
* pandas is designed to work with tabular and heterogeneous data (unlike NumPy, which is best suited to homogeneous numerical array data).

In [1]:
import pandas as pd
import numpy as np
# pandas has two important, commonly used data structures: "Series" and "DataFrame".


### Series

In [2]:
# A pandas Series is a one-dimensional, array-like object holding a sequence of values
# together with an associated array of data labels, called its index.

exm = pd.Series([1, 4, 6, 7])
print(exm)

# `<series>.values` gives the underlying array of values;
# `<series>.index` gives the index object (a default RangeIndex here, since none was passed).
print(exm.values)
print(exm.index)

# The index can be chosen explicitly when the Series is built:

obj = pd.Series([3, 6, 7, 2], index=['a', 'b', 'c', 'd'])
print(obj)
print(obj.index)

# A single element is selected through its index label
# (label 3 of the default integer index for `exm`, label 'd' for `obj`).

print(exm[3], obj['d'])

# Several elements can be selected at once by passing a list of labels;
# the labels form a sequence, hence the inner pair of brackets.

print(obj[['a', 'c', 'd']])

0    1
1    4
2    6
3    7
dtype: int64
[1 4 6 7]
RangeIndex(start=0, stop=4, step=1)
a    3
b    6
c    7
d    2
dtype: int64
Index(['a', 'b', 'c', 'd'], dtype='object')
7 2
a    3
c    7
d    2
dtype: int64


In [3]:
# NumPy-style arithmetic on a Series operates on the values only;
# the index labels are carried over unchanged to the result.
print(obj)
print(obj * 2)

a    3
b    6
c    7
d    2
dtype: int64
a     6
b    12
c    14
d     4
dtype: int64


In [4]:
# A Series can be thought of as a structured dictionary: it maps index labels to values
# (much like keys to values). In fact, a dictionary can be passed straight to pd.Series
# to build one.

sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
data1 = pd.Series(sdata)
print(data1)

# Dict-like operations also work on a Series, e.g. a membership test against its index
# labels. The test is made on the Series itself (not on the source dict), so that it is
# the Series behaviour that is being demonstrated.

print('California' in data1)

# When a dict is passed, the Series keeps the dict's key order. The order (and the set of
# labels) can be chosen explicitly by also passing the wanted labels through `index`.

skeys = ['California', 'Ohio', 'Oregon', 'Texas']
data2 = pd.Series(sdata, index=skeys)
print(data2)

# The dict has no entry for 'California', so its value shows up as NaN ('Not a Number'),
# which pandas uses to mark missing data; the values are printed as float64 as a result.
# 'Utah' is not in `skeys`, so it is left out of data2.

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
False
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64


In [5]:
# Missing data is detected with the pandas functions 'isnull' and 'notnull'.
# NOTE: only the value of the last expression in a notebook cell is displayed, so the
# results of the two isnull calls below are not shown in this cell's output.

pd.isnull(data2)

# A Series also has isnull/notnull as instance methods, so the same check can be written as:

data2.isnull()

# An important feature of pandas is that arithmetic aligns the operands by index label.
# Labels present in only one operand ('Utah'), or whose value is already missing
# ('California'), give NaN in the result:

data1 + data2

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [6]:
# Both the Series object and its index have a `name` attribute; the names are shown
# when the Series is displayed.

data2.name = 'Population'
data2.index.name = 'States:'
data2

# The index of a Series can also be replaced in place by plain assignment
# (e.g. `series.index = [...]`); the new index must have the same length as the Series.

States:
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: Population, dtype: float64

### DataFrame

In [7]:
# A Series has a single index labelling its elements, whereas every element of a
# DataFrame is addressed by two: a row index and a column index.

# Of the many ways to build a DataFrame, one of the most common is from a dictionary
# of equal-length lists or NumPy arrays.

data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
# As with a Series, a default integer index is assigned when none is given explicitly.
frame = pd.DataFrame(data)

frame.head()  # head() selects only the first five rows

# The `columns` argument fixes which columns appear and in what order.
frame = pd.DataFrame(data, columns=['year', 'state', 'pop'])
# The row index can be set in the same way as for a Series.

# A requested column that is absent from the data ('debt' here) is created and shown as NaN.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)


frame['state']  # a column can be retrieved dict-style ...
frame.state     # ... or as an attribute; this is a convenience (e.g. for tab completion in
                # Jupyter) and only works when the column name is a valid Python identifier
                # that does not clash with a DataFrame attribute ('pop' would clash)

frame.loc[3]  # a row is retrieved by its index label through `loc[row_label]`

year       2001
state    Nevada
pop         2.4
Name: 3, dtype: object

In [8]:
# Column values can be changed by assignment, using either a scalar or an array.
# Scalar form, for comparison:  frame2['debt'] = 16
frame2['debt'] = np.arange(6.0)
frame2

# A list or array assigned to a column must have the same length as the DataFrame.

# A Series assigned to a column is realigned, label by label, to the DataFrame's index;
# index labels that the Series does not contain are shown as NaN.

val = pd.Series([1.2, 4.6, 7.2], index=['two', 'four', 'six'])
frame2['debt'] = val
frame2


# Assigning to a column name that does not exist yet creates a new column.

frame2['eastern'] = (frame2['state'] == 'Ohio')
frame2.index.name = 'S.No'
frame2.columns.name = 'States'

# As with a dictionary, the del keyword deletes a column.

del frame2['eastern']
frame2.values  # as with a Series, `.values` returns the data - here a two-dimensional array
frame2



States,year,state,pop,debt
S.No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,4.6
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,7.2


In [10]:
# If a nested dict is passed to pd.DataFrame, pandas interprets the outer dict's keys as the
# column labels and the inner dicts' keys as the row index labels (the inner values are the data).

### INDEX OBJECTS

In [35]:
# Index objects hold the axis labels: any labels or sequence passed while constructing
# a Series or DataFrame are turned into an Index.
# Index objects are immutable, so the methods used below return new objects and leave
# `indexer` itself unchanged.

ind = pd.Index([1, 2, 3])
dfobj = pd.Series(['kelly', 'molly', 'sammy'], index=ind)
dfobj

# Unlike a set, a pandas Index may contain duplicate labels.
# Selecting a duplicated label returns every occurrence of that label.

indexer = pd.Index(np.arange(10))
extra_labels = [11, 21, 101, 1000]

indexer.append(pd.Index([10]))       # append() takes other Index objects, not bare labels
indexer.difference([2, 4, 6, 7, 4])  # set-like difference between the two collections of labels
indexer.intersection([2, 4, 6, 7])   # labels common to both
indexer.union(extra_labels)          # labels of both combined

indexer.union(extra_labels).delete(-1)  # delete(-1) drops the label at the last position
indexer.isin([9])                    # isin() only accepts list-like arguments
indexer.insert(0, 0.01)              # insert(position, label) adds a label at the given position

Float64Index([0.01, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0], dtype='float64')

In [44]:
# The index of a Series or DataFrame can be changed with the 'reindex' method.
# reindex does not modify the object it is called on: it returns a new object, which is
# why the result is bound to a new name below.

obj1 = pd.Series(['1', 4, 5, '6.6', 4.2], index=['a', 'c', 'e', 'b', 'd'])
obj1  # an index passed to pd.Series must have the same length as the values

new_labels = ['a', 'b', 'c', 'd', 'e', 'f', 'e']
obj2 = obj1.reindex(new_labels)
obj2
# While reindexing, the labels can be rearranged freely; a new label that has no
# associated value ('f' above) is shown as NaN.
# If the same existing label is requested more than once ('e' above), every occurrence
# receives the value stored under that label in the original.



a      1
b    6.6
c      4
d    4.2
e      5
f    NaN
e      5
dtype: object

### The `method='ffill'` argument of the reindex method

In [66]:
# reindex accepts a `method` argument: method='ffill' (forward fill) fills the labels
# that are new in the requested index with the value of the closest preceding label.

obj3 = pd.Series(['a', 'b', 'c'], index=[0, 4, 6])
obj3.reindex(range(10), method='ffill')  # labels 1-3 take 'a', 5 takes 'b', 7-9 take 'c'

# reindex can change the rows as well as the columns.
# If only a sequence is passed to reindex, it is applied to the rows by default.

dfobj = pd.DataFrame(np.arange(10).reshape((2, 5)),
                     index=['first', 'second'],
                     columns=['a', 'b', 'c', 'd', 'e'])
dfobj

# Row reindex: 'third' and 'fourth' do not exist in dfobj, so those rows are all NaN.
# The result gets its own name so that it is not overwritten by the column example below.
frame_rows = dfobj.reindex(['first', 'second', 'third', 'fourth'])

# Column reindex: the new column labels are passed through the `columns` keyword.
alpha = [33, 44, 66, 77, 88, 'a']
frame = dfobj.reindex(columns=alpha, fill_value=0)
frame

# fill_value=<int or string> supplies the value used for entries that would otherwise
# be missing (NaN) after reindexing.
# Keep in mind that only the columns passed to reindex are returned (here only 'a'
# survives from the original); the same holds for rows.

Unnamed: 0,33,44,66,77,88,a
first,0,0,0,0,0,0
second,0,0,0,0,0,5


### Deleting elements from a dataframe

In [77]:
# drop() returns a new object without the labels passed to it, removed along the given
# axis (axis=0 means rows, axis=1 means columns).

df = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['one', 'two', 'three', 'four'],
    columns=['une', 'deux', 'trois', 'quatre'],
)
# With no axis given, the labels are looked up among the rows (axis 0 is the default).
newdf = df.drop(['three', 'four'])

# To drop columns instead, pass axis=1 or, equivalently, axis='columns'.

newdfc = df.drop(['trois', 'quatre'], axis='columns')
newdfc

# With inplace=True, drop() removes the labels from the original DataFrame itself
# instead of returning a new object. Beware: the rows are then gone from `df` for good.

df.drop(['one', 'two'], inplace=True)
df

Unnamed: 0,une,deux,trois,quatre
three,8,9,10,11
four,12,13,14,15


### 