# Pandas:
Contains data structures and data manipulation tools designed to make data cleaning and analysis fast and easy.
Pandas is designed for working in tabular or heterogeneous data, whereas numpy is suited for working with homogenous numerical array data.


In [1]:
import pandas as pd
import numpy as np

#from now, wherever pd appears, it refers to pandas.  and whole module is being imported. 
from pandas import DataFrame, Series  #easier to import into local namespace. only series and data frames are being imported

ModuleNotFoundError: No module named 'pandas'

In [104]:
# Pandas data structure
#1. series : one-dimensional array like object containing a sequence of values and an associated array of data labels called index. 
obj=pd.Series([4,7,-5,8])
print(obj)
#it gives index on left and values on the right. , from 0 to n -1 in default. 
# we can get array representation  and index object of the series via its values and index attributes respectively.

print(obj.values)
print(obj.index)

0    4
1    7
2   -5
3    8
dtype: int64
[ 4  7 -5  8]
RangeIndex(start=0, stop=4, step=1)


In [105]:
# It is good to create the series with an index identifying each data point with a label.
obj2=pd.Series([1,2,3,4],index=("a","b","c","d"))
print(obj2)

print(obj2.index)

#compared with numpy arrays, we can label in the index when selecting single values or set of values.
obj2[["a","b","c"]] 
# here is ["a","b","C"] is intepreted as list of indices, even though it contains strings instead of integers.

a    1
b    2
c    3
d    4
dtype: int64
Index(['a', 'b', 'c', 'd'], dtype='object')


a    1
b    2
c    3
dtype: int64

In [106]:
# Series is a fixed-length, ordered dict as it is a mapping of index values to the data values. 
# we can create series form by passing dictionary. 

data={"alexa":3000.,"siri":2000,"gemini":12000}
obj3=pd.Series(data)
print(obj3)
# dict keys are being index in resulting series. 

machines={"grok","alexa","claude"}
obj4=pd.Series(data,index=machines)
print(obj4)

# one value is found in data placed in appropriate location, returned NaN (not a number) which is considered in pandas for missing values.  


alexa      3000.0
siri       2000.0
gemini    12000.0
dtype: float64
grok         NaN
alexa     3000.0
claude       NaN
dtype: float64


In [107]:
# the isnull and not null function in pandas should be used to detect missing data.:
pd.isnull(obj4) #marks true for missing null values and false for valid for non-missing data.
pd.notnull(obj4) #marks true for non missing data and marks False for missing data

grok      False
alexa      True
claude    False
dtype: bool

In [108]:
#Series also have instance methods: functions that are defined inside class, They have access to instances attributes and can modify the state of that specific instance.
obj4.isnull()

grok       True
alexa     False
claude     True
dtype: bool

In [109]:
# Series features automatically aligns by index label in airthmetic operations.
obj3+obj4

alexa     6000.0
claude       NaN
gemini       NaN
grok         NaN
siri         NaN
dtype: float64

In [110]:
# Series object and index have objects(value) have name attribute, which integrates with other key areas of pandas functionality.
obj4.name="Users"
obj4.index.name="Device"
print(obj4)

# device is index name and Users is the series name or column name. 

Device
grok         NaN
alexa     3000.0
claude       NaN
Name: Users, dtype: float64


In [111]:
# series index can be altered in place by assignments
obj
obj.index=["gita","Ram","Yamada","galilei"]
obj

gita       4
Ram        7
Yamada    -5
galilei    8
dtype: int64

## Data Frame in Pandas: 
Represents a rectangular table of data and contains an ordered collection of columnns, each of which can be different value type (numeric,string, boolean,etc). It has both row and column index; it can be taken as a dict of series all sharing same index.  The data is stored as one or more two-dimensional blocks rather than a list, dict or some other collection of one dimensional array. 

In [112]:
data={
    "city":["osaka","osaka","osaka","kyoto","kyoto","kyoto"],
    "year":[2000,2005,2010,2000,2005,2010],
    "population":[2700000, 2750000, 2800000, 1500000, 1550000, 1600000]
}
frame=pd.DataFrame(data)
print(frame) #Dataframe will have its index assigned automatically as with Series, and the columns are placed in sorted order.
#for large dataframes, the head method selects only first five rows. 
frame.head()

    city  year  population
0  osaka  2000     2700000
1  osaka  2005     2750000
2  osaka  2010     2800000
3  kyoto  2000     1500000
4  kyoto  2005     1550000
5  kyoto  2010     1600000


Unnamed: 0,city,year,population
0,osaka,2000,2700000
1,osaka,2005,2750000
2,osaka,2010,2800000
3,kyoto,2000,1500000
4,kyoto,2005,1550000


In [113]:
# to specify the sequence of the columns, the dataframes columns can be arranged in that order.
pd.DataFrame(data,columns=["population","city","year"])

Unnamed: 0,population,city,year
0,2700000,osaka,2000
1,2750000,osaka,2005
2,2800000,osaka,2010
3,1500000,kyoto,2000
4,1550000,kyoto,2005
5,1600000,kyoto,2010


In [114]:
# if column which isnt in data is passed then NaN will appear.
frame2=pd.DataFrame(data,columns=["year","city","population","debt"],index=[1,2,3,4,5,6])
frame2

#in this way we can set index and pass columns as well in the dataframes. 
frame2.columns

Index(['year', 'city', 'population', 'debt'], dtype='object')

In [115]:
#column in dataframe can be retrieved as a series either by dict like notation or attributes.
# attributes stores info about the object.
# Methods are functions inside the object that makes it perform some action.

frame2["city"]
frame2.city

# attribute-like accesss(frame.city) only works when the column name is valid python variable name but frame2[column] works for any column name. 
# make sure to note the returned series have the same index as the Dataframe and their name attribute has beemn appropriately set.

1    osaka
2    osaka
3    osaka
4    kyoto
5    kyoto
6    kyoto
Name: city, dtype: object

In [116]:
# Rows can be retrieved by position or namme with the special loc attrribute:
frame2.loc[2]

year             2005
city            osaka
population    2750000
debt              NaN
Name: 2, dtype: object

In [117]:
# columns can be modified by assignment.
frame["debt"]=20.2 #it can also modify. 
frame2["debt"]=np.arange(6) #np is numpy func that creates an array of evebnly spaced values over a specified range. numpy.arange([start,] stop[, step], dtype=None)

frame2

Unnamed: 0,year,city,population,debt
1,2000,osaka,2700000,0
2,2005,osaka,2750000,1
3,2010,osaka,2800000,2
4,2000,kyoto,1500000,3
5,2005,kyoto,1550000,4
6,2010,kyoto,1600000,5


In [118]:
# when we are assigning list or arrays to a column, the values length must match the length of dataframe. If series is assigned, its lables will be realigned exactly to Dataframe index, inserting missing values.
val=pd.Series([-1.2,2.3,-5.7],index=(2,3,4))
frame2["debt"]=val
frame2

Unnamed: 0,year,city,population,debt
1,2000,osaka,2700000,
2,2005,osaka,2750000,-1.2
3,2010,osaka,2800000,2.3
4,2000,kyoto,1500000,-5.7
5,2005,kyoto,1550000,
6,2010,kyoto,1600000,


In [119]:
#new columns cannot be created with frame2.columns syntax.
# Assigning the column that doesnt exist will create a new column. The del keywordwill delete columns as with a dict.
frame2["foreigners"]=frame.city=="osaka"
# it returns boolean values, each row in foreigners is true if city equals osaka basically.
frame2
del frame2["foreigners"]
frame2.columns


Index(['year', 'city', 'population', 'debt'], dtype='object')

# Note
The column returned from indexing a DataFrame is a view on the underlying data, not a copy. thus, any in place modification to the series will be reflected in data frame. The column can be explicitly copied with the series's copy method. 

In [120]:
# another common form of data is nested dict of dicts:
pop={"Hiroshima": {2001:2.4,2002:2.9},
    "Nara":{2000:1.5,2001:1.8,2002:3.6}}

frame3=pd.DataFrame(pop)
frame3

#we can transpse the Dataframe (swaps rows and columns) with similar syntax to numpy array:
frame3.T

Unnamed: 0,2001,2002,2000
Hiroshima,2.4,2.9,
Nara,1.8,3.6,1.5


In [121]:
# Keys in the inner dicts are combined and soted to form the index in the results. This isnt true if an explicit index is specified.
pd.DataFrame(pop,index=[2001,2002,2003,2000])

Unnamed: 0,Hiroshima,Nara
2001,2.4,1.8
2002,2.9,3.6
2003,,
2000,,1.5


In [122]:
# Dict of series are treated in much the same way:
pdata={"Nara":frame3["Nara"][:-1],
        "Hiroshima":frame3["Hiroshima"][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Nara,Hiroshima
2001,1.8,2.4
2002,3.6,2.9


In [123]:
# if a dataframe index and column have their name attributes set,
frame3.index.name="year"; frame3.columns.name="state"
frame3

state,Hiroshima,Nara
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.8
2002,2.9,3.6
2000,,1.5


In [124]:
# as with series the values attributes returns the data contained in the Dataframe as tow-dimensional array.
frame3.values

# if the dataframes's column have differnt dtypes, the dtypes values array will be chosen to accomodate all of the columns:
frame2.values

array([[2000, 'osaka', 2700000, nan],
       [2005, 'osaka', 2750000, -1.2],
       [2010, 'osaka', 2800000, 2.3],
       [2000, 'kyoto', 1500000, -5.7],
       [2005, 'kyoto', 1550000, nan],
       [2010, 'kyoto', 1600000, nan]], dtype=object)

In [125]:
# Index objects: it is responsible for holding the axis labels and other metadata. Any array or other sequence of labels we use constructing a series and data frame is internally converted to index.

obj=pd.Series(range(3), index=["a","b","c"])
index=obj.index
index

# index objects are immutable and cannot be modified by the user, which makes safer to share index objects among data structures.
labels=pd.Index(np.arange(2))
labels

obj2=pd.Series([1.5,2.4],index=labels)
obj2.index is labels

True

In [126]:
# index also behaves like fixed-size set: 
frame3
frame3.columns
"Nara" in frame3.columns

# unlike python sets, a pandas index can contain duplicate labels.
dup_lab=pd.Index(["Nara","Nara","osaka"])
dup_lab
# selection with duplicate labels will select all occurances of that label. 

Index(['Nara', 'Nara', 'osaka'], dtype='object')

# some index methods and Properties
1. append: concatenate the additional objects
2. difference: compute set difference as an index
3. interesection: compute set intersection
4. union: compute set union
5. isin: compute boolean array 
6. delete: compute new index element at index i deleted. 
7. drop:compute new index by deleting passed values.

In [127]:
# Reindexing: method on pandas object is reindex., which is to create a new object with the data conformed to a new index. 

obj=pd.Series([1,2,3,4],index=["d","b","c","a"])
# calling reindex in the series rearranges the index, introducing missing values.
obj2=obj.reindex(["a","b","c","e","d"])
obj2

a    4.0
b    2.0
c    3.0
e    NaN
d    1.0
dtype: float64

In [128]:
# for ordered data like time series, it is good to do some filling of values when reindexing. The method option allows to do that, such as ffill：forward fills the values.

obj3=pd.Series(["yellow","orange","blue"],index=[0,2,4])
obj3.reindex(range(6),method="ffill")


0    yellow
1    yellow
2    orange
3    orange
4      blue
5      blue
dtype: object

In [129]:
# with data frame reindex can alter either (row) index,columns or both. 
frame=pd.DataFrame(np.arange(9).reshape((3,3)),index=["b","d","c"],columns=["osaka","kyoto","amagasaki"])
frame.reindex(["b","c","d","e"]) # so e gives NaN as e wasnt present before. 


# columns can be reindexed with columns keyword:
city=["osaka","amagasaki","kobe"]
frame.reindex(columns=city)

Unnamed: 0,osaka,amagasaki,kobe
b,0,2,
d,3,5,
c,6,8,


In [130]:
# reindex by label-indexing with loc:
import pandas as pd
import numpy as np

frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                    index=["b", "d", "c"],
                    columns=["osaka", "kyoto", "amagasaki"])

city = ["osaka", "amagasaki", "kobe"]

# Filter out columns not in DataFrame
valid_city = [col for col in city if col in frame.columns]

result = frame.loc[["b", "c", "d"], valid_city]
result

#So, .loc is preferred whenever the row/column labels convey meaningful information, enabling clearer and more maintainable code

Unnamed: 0,osaka,amagasaki
b,0,2
c,6,8
d,3,5


# Note: reindex function arguments
1. index: new sequence to use as index.
2. method: fill method (ffill)
3. fill_value: substitute value to use when introducing missig data by reindexing. 

In [131]:
# Dropping entries from an axis.: easy when we already have an index array or list without those entries.  
obj=pd.Series(np.arange(5.),index=["a","b","c","d","e"])
new_obj=obj.drop("a")
obj.drop(["b","c"])

a    0.0
d    3.0
e    4.0
dtype: float64

In [132]:
# with data frame index values can be deleted from either axis. 

data=pd.DataFrame(np.arange(16).reshape(4,4),
                index=["osaka","kyoto","shiga","Nara"],
                columns=[1,2,3,4])
data.drop(["shiga","Nara"]) # it drops the values from row label.

#dropping column values fom the column passing axis=1 or axis="columns":
data.drop(2,axis=1) #axis 1 refers to column in 2D array. 

Unnamed: 0,1,3,4
osaka,0,2,3
kyoto,4,6,7
shiga,8,10,11
Nara,12,14,15


In [133]:
data.drop([2,3],axis="columns")

Unnamed: 0,1,4
osaka,0,3
kyoto,4,7
shiga,8,11
Nara,12,15


In [134]:
# many functions like drop which modifies the shape and size of a series or DataFrame can manipulate an object in place without returning a new object.
obj.drop("c",inplace=True)  #inplace also modifies directly, alters Dataframe permanently and returns None after the operations.
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [135]:
# Indexing, Selection and Filtering: Pandas Series is more flexible than a NumPy array because it allows indexing using both explicit labels (like names or dates) and implicit integer positions. 
obj=pd.Series(np.arange(4.),index=["a","b","c","d"])
# so it can be accessed from
obj["b"]
obj[1]

obj[2:4]
obj[["b","c","a"]]
obj[[1,3]]
obj[obj<2]

  obj[1]
  obj[[1,3]]


a    0.0
b    1.0
dtype: float64

In [136]:
# slicing with labels behaves differently than normal python slicing in that the endpoint.
obj["b":"c"]

b    1.0
c    2.0
dtype: float64

 Python slicing of py_list[1:3] excludes the last item. In contrast, the Pandas slice of s_non_sequential['c':'h'] includes both the starting and ending label, 'c' and 'h'.

In [137]:
# methods modifies the corresponding section of Series:
obj["b":"c"]=6
obj

a    0.0
b    6.0
c    6.0
d    3.0
dtype: float64

In [138]:
# Indexing in DataFrame is for retrieving one or more columns either with a single value:

data=pd.DataFrame(np.arange(9).reshape(3,3),index=["osaka","chiba","Kyoto"],columns=[1,2,3])
data[2] #retrieving column 2.
data[[2,3]]
data[:2] # for the first　two rows. 
data[data[3]>5]
data<3
data[data[3]<5]=0
data

Unnamed: 0,1,2,3
osaka,0,0,0
chiba,3,4,5
Kyoto,6,7,8


In [139]:
# selection with loc and Iloc: allows to select subset of the rows and columns from the datafRAME WITH Numpy-like notation using eiher axis lables (loc) or itegers (iloc).

data.loc["chiba",[2,3]] # Selects row labeled 'chiba' and columns 'pop' and 'area' by label

# similar operation by using iloc:
data.iloc[1,[1,0]] #Selects the second row and the second and first columns (by integer index)

data.iloc[[1,2],[0,1,2]] # ]	Selects rows at indices 1 and 2, and columns at indices 0, 1, 2.

Unnamed: 0,1,2,3
chiba,3,4,5
Kyoto,6,7,8


In [140]:
# both indexing function work with slices in addition to single labels or lists of labels:
data.loc["chiba",2]

np.int64(4)

# Indexing option with DataFrame.
df[val]
df.loc[val]
df.loc[val1,val2]
df.iloc[where] 
df.iloc[:,where]
df.iloc[where_i,where_j]
reindex method

In [141]:
# Integer Indexes: some differences with indexing semantics on built in python data structure like list and tuple. 
ser=pd.Series(np.arange(6.0))
ser
#ser[-1] returns error. here we have index from 0 to 5


0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
5    5.0
dtype: float64

In [142]:
ser2=pd.Series(np.arange(4.),index=["a","b","c","d"])
ser2[-1]

# in non integer index, there is no potential for ambiguity.    

  ser2[-1]


np.float64(3.0)

If the index isn’t a number, Python doesn’t have to guess whether you mean a position number or a label—it’s clearly a label. But with integer indexing, sometimes the number could be both a label in the index AND a position number, which can cause confusion.

In [None]:
# use loc (for labels) and iloc(for integers):
ser.loc[:1] # select all the index up to including the label 1.
ser.iloc[:2]

0    0.0
1    1.0
dtype: float64

In [None]:
ser = pd.Series([10, 20, 30], index=[5, 10, 15])
print(ser.loc[15])   # Access by label (index value) -> 20
print(ser.iloc[:1])   # Access by position (second element) -> 20

# Use .iloc for position-based access regardless of index type.
# Use .loc for label-based access using the actual index values.

30
5    10
dtype: int64


In [None]:
# Airthmetic and data alignment: when adding objects, if any index pairs are not same, the respective index in result will be union of index pairs.
s1=pd.Series([2,3,4,5],index=["a","b","c","d"])
s2=pd.Series([3,-1,-1,0.2,0.5],index=["a","c","d","f","g"])
s1+s2
# this internal alignment introduces missing values in the label locations that dont overlap. 

a    5.0
b    NaN
c    3.0
d    4.0
f    NaN
g    NaN
dtype: float64

In [None]:
# In DataFrame, alignment is performed on both rows and columns:
data1=pd.DataFrame(np.arange(9.).reshape((3,3)),columns=list("abc"),
                   index=["osaka","nara","Kyoto"])
data2= pd.DataFrame(np.arange(12.).reshape((4,3)),columns=list("bce"),
                    index=["hyogo","osaka","Kyoto","shiga"])
data1+data2

# indexes(rows) and column are aligned by their labels,not positions. 
# values get added when label matches.
# NaN where they dont. 
# since a and e columns are not found in both DataFrame objects, they appear as missing. 

Unnamed: 0,a,b,c,e
Kyoto,,13.0,15.0,
hyogo,,,,
nara,,,,
osaka,,4.0,6.0,
shiga,,,,


In [155]:
# if DataFrame objects without columns or row labels in commom, then the result will contain all nulls:

df1=pd.DataFrame({"A":[1,2]})
df2=pd.DataFrame({"B":[3,4]})
df1-df2

Unnamed: 0,A,B
0,,
1,,


In [165]:
# Airthmetic methods with fill values: when we want to fill value, in axis label found in onr and not in another.

df1=pd.DataFrame(np.arange(12.).reshape((3,4)),
                 columns=list("abcd"))
df2=pd.DataFrame(np.arange(20.0).reshape((4,5)),
                 columns=list("abcde"))
df2.loc[1,"b"]=np.nan # sets the value in DataFrame cell aat row 1,column "b" to missing(NaN).
# using add method on df1, pass df2 and an argument to fill_value:
df1.add(df2,fill_value=0)

# when reindexing series or Dataframe, we can also apecify a different fill value:
df1.reindex(columns=df2.columns,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


# Flexible airthmetic methods.
1. add, radd : method for +
2. sub, rsub : method for -
3. div, rdiv : method for divison(/)
4. mul,rmul : method for multiplication (*)
5. floordiv, rfloordiv : method for floor divison (//)
6. pow, rpow : method for exponentiaton (**)

In [None]:
# operation between DataFrame and Series: 

arr=np.arange(12.).reshape((3,4))
arr- arr[1]
# this is referred to as broadcasting. 

array([[-4., -4., -4., -4.],
       [ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.]])

In [177]:
frame=pd.DataFrame(np.arange(12.).reshape((4,3)),
                                        columns=list("bde"),
                                        index=["osaka","kyoto","Nara","shiga"])
series=frame.iloc[0]
series
frame-series

# if an index value is not found in either DataFrame and Series matches the index of the series on the DataFrame columns broadcasting down the rows:
series2=pd.Series(range(3),index=["b","e","f"])
frame+series2
series3=frame["d"]
series3

frame.sub(series3,axis="index")

Unnamed: 0,b,d,e
osaka,-1.0,0.0,1.0
kyoto,-1.0,0.0,1.0
Nara,-1.0,0.0,1.0
shiga,-1.0,0.0,1.0
