# Pandas:
Contains data structures and data manipulation tools designed to make data cleaning and analysis fast and easy.
Pandas is designed for working in tabular or heterogeneous data, whereas numpy is suited for working with homogenous numerical array data.


In [5]:
import pandas as pd
import numpy as np

#from now, wherever pd appears, it refers to pandas.  and whole module is being imported. 
from pandas import DataFrame, Series  #easier to import into local namespace. only series and data frames are being imported

In [6]:
# Pandas data structure
#1. series : one-dimensional array like object containing a sequence of values and an associated array of data labels called index. 
obj=pd.Series([4,7,-5,8])
print(obj)
#it gives index on left and values on the right. , from 0 to n -1 in default. 
# we can get array representation  and index object of the series via its values and index attributes respectively.

print(obj.values)
print(obj.index)

0    4
1    7
2   -5
3    8
dtype: int64
[ 4  7 -5  8]
RangeIndex(start=0, stop=4, step=1)


In [7]:
# It is good to create the series with an index identifying each data point with a label.
obj2=pd.Series([1,2,3,4],index=("a","b","c","d"))
print(obj2)

print(obj2.index)

#compared with numpy arrays, we can label in the index when selecting single values or set of values.
obj2[["a","b","c"]] 
# here is ["a","b","C"] is intepreted as list of indices, even though it contains strings instead of integers.

a    1
b    2
c    3
d    4
dtype: int64
Index(['a', 'b', 'c', 'd'], dtype='object')


a    1
b    2
c    3
dtype: int64

In [8]:
# Series is a fixed-length, ordered dict as it is a mapping of index values to the data values. 
# we can create series form by passing dictionary. 

data={"alexa":3000.,"siri":2000,"gemini":12000}
obj3=pd.Series(data)
print(obj3)
# dict keys are being index in resulting series. 

machines={"grok","alexa","claude"}
obj4=pd.Series(data,index=machines)
print(obj4)

# one value is found in data placed in appropriate location, returned NaN (not a number) which is considered in pandas for missing values.  


alexa      3000.0
siri       2000.0
gemini    12000.0
dtype: float64
alexa     3000.0
grok         NaN
claude       NaN
dtype: float64


In [9]:
# the isnull and not null function in pandas should be used to detect missing data.:
pd.isnull(obj4) #marks true for missing null values and false for valid for non-missing data.
pd.notnull(obj4) #marks true for non missing data and marks False for missing data

alexa      True
grok      False
claude    False
dtype: bool

In [10]:
#Series also have instance methods: functions that are defined inside class, They have access to instances attributes and can modify the state of that specific instance.
obj4.isnull()

alexa     False
grok       True
claude     True
dtype: bool

In [11]:
# Series features automatically aligns by index label in airthmetic operations.
obj3+obj4

alexa     6000.0
claude       NaN
gemini       NaN
grok         NaN
siri         NaN
dtype: float64

In [12]:
# Series object and index have objects(value) have name attribute, which integrates with other key areas of pandas functionality.
obj4.name="Users"
obj4.index.name="Device"
print(obj4)

# device is index name and Users is the series name or column name. 

Device
alexa     3000.0
grok         NaN
claude       NaN
Name: Users, dtype: float64


In [13]:
# series index can be altered in place by assignments
obj
obj.index=["gita","Ram","Yamada","galilei"]
obj

gita       4
Ram        7
Yamada    -5
galilei    8
dtype: int64

## Data Frame in Pandas: 
Represents a rectangular table of data and contains an ordered collection of columnns, each of which can be different value type (numeric,string, boolean,etc). It has both row and column index; it can be taken as a dict of series all sharing same index.  The data is stored as one or more two-dimensional blocks rather than a list, dict or some other collection of one dimensional array. 

In [14]:
data={
    "city":["osaka","osaka","osaka","kyoto","kyoto","kyoto"],
    "year":[2000,2005,2010,2000,2005,2010],
    "population":[2700000, 2750000, 2800000, 1500000, 1550000, 1600000]
}
frame=pd.DataFrame(data)
print(frame) #Dataframe will have its index assigned automatically as with Series, and the columns are placed in sorted order.
#for large dataframes, the head method selects only first five rows. 
frame.head()

    city  year  population
0  osaka  2000     2700000
1  osaka  2005     2750000
2  osaka  2010     2800000
3  kyoto  2000     1500000
4  kyoto  2005     1550000
5  kyoto  2010     1600000


Unnamed: 0,city,year,population
0,osaka,2000,2700000
1,osaka,2005,2750000
2,osaka,2010,2800000
3,kyoto,2000,1500000
4,kyoto,2005,1550000


In [15]:
# to specify the sequence of the columns, the dataframes columns can be arranged in that order.
pd.DataFrame(data,columns=["population","city","year"])

Unnamed: 0,population,city,year
0,2700000,osaka,2000
1,2750000,osaka,2005
2,2800000,osaka,2010
3,1500000,kyoto,2000
4,1550000,kyoto,2005
5,1600000,kyoto,2010


In [16]:
# if column which isnt in data is passed then NaN will appear.
frame2=pd.DataFrame(data,columns=["year","city","population","debt"],index=[1,2,3,4,5,6])
frame2

#in this way we can set index and pass columns as well in the dataframes. 
frame2.columns

Index(['year', 'city', 'population', 'debt'], dtype='object')

In [17]:
#column in dataframe can be retrieved as a series either by dict like notation or attributes.
# attributes stores info about the object.
# Methods are functions inside the object that makes it perform some action.

frame2["city"]
frame2.city

# attribute-like accesss(frame.city) only works when the column name is valid python variable name but frame2[column] works for any column name. 
# make sure to note the returned series have the same index as the Dataframe and their name attribute has beemn appropriately set.

1    osaka
2    osaka
3    osaka
4    kyoto
5    kyoto
6    kyoto
Name: city, dtype: object

In [18]:
# Rows can be retrieved by position or namme with the special loc attrribute:
frame2.loc[2]

year             2005
city            osaka
population    2750000
debt              NaN
Name: 2, dtype: object

In [19]:
# columns can be modified by assignment.
frame["debt"]=20.2 #it can also modify. 
frame2["debt"]=np.arange(6) #np is numpy func that creates an array of evebnly spaced values over a specified range. numpy.arange([start,] stop[, step], dtype=None)

frame2

Unnamed: 0,year,city,population,debt
1,2000,osaka,2700000,0
2,2005,osaka,2750000,1
3,2010,osaka,2800000,2
4,2000,kyoto,1500000,3
5,2005,kyoto,1550000,4
6,2010,kyoto,1600000,5


In [20]:
# when we are assigning list or arrays to a column, the values length must match the length of dataframe. If series is assigned, its lables will be realigned exactly to Dataframe index, inserting missing values.
val=pd.Series([-1.2,2.3,-5.7],index=(2,3,4))
frame2["debt"]=val
frame2

Unnamed: 0,year,city,population,debt
1,2000,osaka,2700000,
2,2005,osaka,2750000,-1.2
3,2010,osaka,2800000,2.3
4,2000,kyoto,1500000,-5.7
5,2005,kyoto,1550000,
6,2010,kyoto,1600000,


In [21]:
#new columns cannot be created with frame2.columns syntax.
# Assigning the column that doesnt exist will create a new column. The del keywordwill delete columns as with a dict.
frame2["foreigners"]=frame.city=="osaka"
# it returns boolean values, each row in foreigners is true if city equals osaka basically.
frame2
del frame2["foreigners"]
frame2.columns


Index(['year', 'city', 'population', 'debt'], dtype='object')

# Note
The column returned from indexing a DataFrame is a view on the underlying data, not a copy. thus, any in place modification to the series will be reflected in data frame. The column can be explicitly copied with the series's copy method. 

In [22]:
# another common form of data is nested dict of dicts:
pop={"Hiroshima": {2001:2.4,2002:2.9},
    "Nara":{2000:1.5,2001:1.8,2002:3.6}}

frame3=pd.DataFrame(pop)
frame3

#we can transpse the Dataframe (swaps rows and columns) with similar syntax to numpy array:
frame3.T

Unnamed: 0,2001,2002,2000
Hiroshima,2.4,2.9,
Nara,1.8,3.6,1.5


In [23]:
# Keys in the inner dicts are combined and soted to form the index in the results. This isnt true if an explicit index is specified.
pd.DataFrame(pop,index=[2001,2002,2003,2000])

Unnamed: 0,Hiroshima,Nara
2001,2.4,1.8
2002,2.9,3.6
2003,,
2000,,1.5


In [24]:
# Dict of series are treated in much the same way:
pdata={"Nara":frame3["Nara"][:-1],
        "Hiroshima":frame3["Hiroshima"][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Nara,Hiroshima
2001,1.8,2.4
2002,3.6,2.9


In [25]:
# if a dataframe index and column have their name attributes set,
frame3.index.name="year"; frame3.columns.name="state"
frame3

state,Hiroshima,Nara
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.8
2002,2.9,3.6
2000,,1.5


In [26]:
# as with series the values attributes returns the data contained in the Dataframe as tow-dimensional array.
frame3.values

# if the dataframes's column have differnt dtypes, the dtypes values array will be chosen to accomodate all of the columns:
frame2.values

array([[2000, 'osaka', 2700000, nan],
       [2005, 'osaka', 2750000, -1.2],
       [2010, 'osaka', 2800000, 2.3],
       [2000, 'kyoto', 1500000, -5.7],
       [2005, 'kyoto', 1550000, nan],
       [2010, 'kyoto', 1600000, nan]], dtype=object)

In [31]:
# Index objects: it is responsible for holding the axis labels and other metadata. Any array or other sequence of labels we use constructing a series and data frame is internally converted to index.

obj=pd.Series(range(3), index=["a","b","c"])
index=obj.index
index

# index objects are immutable and cannot be modified by the user, which makes safer to share index objects among data structures.
labels=pd.Index(np.arange(2))
labels

obj2=pd.Series([1.5,2.4],index=labels)
obj2.index is labels

True

In [None]:
# index also behaves like fixed-size set: 
frame3
frame3.columns
"Nara" in frame3.columns

# unlike python sets, a pandas index can contain duplicate labels.
dup_lab=pd.Index(["Nara","Nara","osaka"])
dup_lab
# selection with duplicate labels will select all occurances of that label. 

Index(['Nara', 'Nara', 'osaka'], dtype='object')