In [15]:
import numpy as np
from pandas import Series, DataFrame

## -> Series is 1D
array and index are its attributes. On printing it we get indexes on default with the values on right. Including the data type

In [3]:
# demo code
a = Series([12,13,14,15,16], dtype='float64')
print(a)

0    12.0
1    13.0
2    14.0
3    15.0
4    16.0
dtype: float64


In [4]:
print(a.array)
print(a.index)

<NumpyExtensionArray>
[12.0, 13.0, 14.0, 15.0, 16.0]
Length: 5, dtype: float64
RangeIndex(start=0, stop=5, step=1)


In [5]:
# general production series
b = Series([1,2,3,4,5], index = ['a','b','c','d','e'])
print(b)

a    1
b    2
c    3
d    4
e    5
dtype: int64


In [7]:
# Selection with indexes
print(b['a'])
print()
print(b[['a','c','e']])

1

a    1
c    3
e    5
dtype: int64


In [10]:
# operations on Series
print(b)
print(b[b>3])
print(b * b)
print(np.exp(b))

a    1
b    2
c    3
d    4
e    5
dtype: int64
d    4
e    5
dtype: int64
a     1
b     4
c     9
d    16
e    25
dtype: int64
a      2.718282
b      7.389056
c     20.085537
d     54.598150
e    148.413159
dtype: float64


In [11]:
# some boolean use cases
print('b' in b)
print('w' in b)

True
False


## -> A series can also be made by directly passing in a dictionary just like an array.
## -> A Series can be converted back to a dictionary with its to_dict method.

In [20]:
import pandas

print(a, b)

print(pandas.isna(a))
print(pandas.isna(b))

0    12.0
1    13.0
2    14.0
3    15.0
4    16.0
dtype: float64 a    1
b    2
c    3
d    4
e    5
dtype: int64
0    False
1    False
2    False
3    False
4    False
dtype: bool
a    False
b    False
c    False
d    False
e    False
dtype: bool


In [21]:
print(pandas.isna(a))
print(pandas.notna(b))

0    False
1    False
2    False
3    False
4    False
dtype: bool
a    True
b    True
c    True
d    True
e    True
dtype: bool


## -> Series can be added to eachother seamlessly and the indexes are included in a systematic way. Lets say we have mumbai in both series. Their values will be added. And the ones which are not present in 1 but present in other get included too

In [22]:
b.name = 'sample'
b.index.name = 'student'

In [23]:
b

student
a    1
b    2
c    3
d    4
e    5
Name: sample, dtype: int64

## -> A series and the index of the Series have their own name attributes.

## -> A Series’s index can be altered in place by assignment

# DataFrame

## -> DataFrame represents a rectangular table of data and contains an ordered, named collection of columns, each of which can be a different value type. Can be used to represent 2D and higher dimensions.

In [2]:
# sample to most common method of creation
import pandas as pd

data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
print(frame)

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2


In [3]:
# selection of first 5 rows
print(frame.head())
print(frame.head(7))

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2


In [4]:
print(frame.tail(3))

    state  year  pop
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2


In [11]:
# qnother way to create a dataframe
import pandas as pd

col = ["name", "age", "speed"]
name = ["varun", "kishan", "sam", "adi", "ram"]
age = [20, 12, 40, 24, 30]
speed = [100,120,60,55, 150]

a = pd.DataFrame([[name,age,speed]], columns= col)
print(a)  

# i dont like this way. the dictionary seems better

                             name                   age  \
0  [varun, kishan, sam, adi, ram]  [20, 12, 40, 24, 30]   

                     speed  
0  [100, 120, 60, 55, 150]  


In [13]:
# access columns
print(a.columns)

Index(['name', 'age', 'speed'], dtype='object')


In [42]:
a['name']
a.name = 'demooo'
print(a.T)

                             0
name                    demooo
age       [20, 12, 40, 24, 30]
speed  [100, 120, 60, 55, 150]


In [22]:
# sample to most common method of creation
import pandas as pd

data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data, index = ['a','b','c','d', 'e', 'f'])
print(frame)

    state  year  pop
a    Ohio  2000  1.5
b    Ohio  2001  1.7
c    Ohio  2002  3.6
d  Nevada  2001  2.4
e  Nevada  2002  2.9
f  Nevada  2003  3.2


In [23]:
print(frame.loc['a'])

state    Ohio
year     2000
pop       1.5
Name: a, dtype: object


In [25]:
print(frame.loc['f', 'pop'])

3.2


In [29]:
print(frame.loc['a':'d', ['pop', 'year']])

   pop  year
a  1.5  2000
b  1.7  2001
c  3.6  2002
d  2.4  2001


In [31]:
print(frame.iloc[1:,1:])

   year  pop
b  2001  1.7
c  2002  3.6
d  2001  2.4
e  2002  2.9
f  2003  3.2


In [32]:
# assignment or modification of values
frame['debt'] = 1.2
frame.iloc[1,3] = 2
print(frame)

    state  year  pop  debt
a    Ohio  2000  1.5   1.2
b    Ohio  2001  1.7   2.0
c    Ohio  2002  3.6   1.2
d  Nevada  2001  2.4   1.2
e  Nevada  2002  2.9   1.2
f  Nevada  2003  3.2   1.2


In [34]:
frame["eastern"] = frame["state"] == "Ohio"
print(frame)

    state  year  pop  debt  eastern
a    Ohio  2000  1.5   1.2     True
b    Ohio  2001  1.7   2.0     True
c    Ohio  2002  3.6   1.2     True
d  Nevada  2001  2.4   1.2    False
e  Nevada  2002  2.9   1.2    False
f  Nevada  2003  3.2   1.2    False


In [35]:
del frame['eastern']
print(frame.columns)

Index(['state', 'year', 'pop', 'debt'], dtype='object')


## -> You can transpose the DataFrame (swap rows and columns) with similar syntax to a NumPy array:

In [36]:
print(frame.T)
print(frame)

          a     b     c       d       e       f
state  Ohio  Ohio  Ohio  Nevada  Nevada  Nevada
year   2000  2001  2002    2001    2002    2003
pop     1.5   1.7   3.6     2.4     2.9     3.2
debt    1.2   2.0   1.2     1.2     1.2     1.2
    state  year  pop  debt
a    Ohio  2000  1.5   1.2
b    Ohio  2001  1.7   2.0
c    Ohio  2002  3.6   1.2
d  Nevada  2001  2.4   1.2
e  Nevada  2002  2.9   1.2
f  Nevada  2003  3.2   1.2


In [38]:
frame.columns.name = 'variables'
frame.index.name = 'name'
print(frame)

variables   state  year  pop  debt
name                              
a            Ohio  2000  1.5   1.2
b            Ohio  2001  1.7   2.0
c            Ohio  2002  3.6   1.2
d          Nevada  2001  2.4   1.2
e          Nevada  2002  2.9   1.2
f          Nevada  2003  3.2   1.2


In [40]:
# conversion to numpy

np = frame.to_numpy()
print(np)
print(np.dtype)

[['Ohio' 2000 1.5 1.2]
 ['Ohio' 2001 1.7 2.0]
 ['Ohio' 2002 3.6 1.2]
 ['Nevada' 2001 2.4 1.2]
 ['Nevada' 2002 2.9 1.2]
 ['Nevada' 2003 3.2 1.2]]
object


In [1]:
# Day 4

In [10]:
# reindexing with reindex()
# incase we want to reearrange or add new index in the data set we use reindex

import pandas as pd

data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
a = pd.DataFrame(data, index = ['d', 'a', 'b', 'c', 'f', 'e'])
a
a = a.reindex(['a', 'b', 'c', 'd','e','k', 'f'])
print(a)

    state    year  pop
a    Ohio  2001.0  1.7
b    Ohio  2002.0  3.6
c  Nevada  2001.0  2.4
d    Ohio  2000.0  1.5
e  Nevada  2003.0  3.2
k     NaN     NaN  NaN
f  Nevada  2002.0  2.9


In [11]:
import pandas as pd

data = {"sales": [100, 200, 300]}
df = pd.DataFrame(data, index=["day1", "day2", "day3"])
print("Original:")
print(df)

# Missing label added
new_index = ["day1", "day2", "day3", "day4"]

# Normal reindex
print("\nNormal reindex:")
print(df.reindex(new_index))

# Reindex with fill_value
print("\nReindex with fill_value:")
print(df.reindex(new_index, fill_value=0))

# Forward fill method
print("\nForward fill (ffill):")
print(df.reindex(new_index, method="ffill"))


Original:
      sales
day1    100
day2    200
day3    300

Normal reindex:
      sales
day1  100.0
day2  200.0
day3  300.0
day4    NaN

Reindex with fill_value:
      sales
day1    100
day2    200
day3    300
day4      0

Forward fill (ffill):
      sales
day1    100
day2    200
day3    300
day4    300


In [22]:
# axis method of reindex
import pandas as pd
a = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

frame = pd.DataFrame(a, index = ['a','b','c','d', 'e', 'f'])
frame.reindex(['year', 'state', 'pop'], axis = 'columns')


Unnamed: 0,year,state,pop
a,2000,Ohio,1.5
b,2001,Ohio,1.7
c,2002,Ohio,3.6
d,2001,Nevada,2.4
e,2002,Nevada,2.9
f,2003,Nevada,3.2


In [25]:
# drop()
#frame.drop(index=['a'])
frame.drop(['b'])

Unnamed: 0,a,b,c
a,Ohio,2000,1.5
c,Ohio,2002,3.6
d,Nevada,2001,2.4
e,Nevada,2002,2.9
f,Nevada,2003,3.2


In [26]:
frame.drop(['b'], axis = 1)    # or frame.drop(columns = ['a'])

Unnamed: 0,a,c
a,Ohio,1.5
b,Ohio,1.7
c,Ohio,3.6
d,Nevada,2.4
e,Nevada,2.9
f,Nevada,3.2


In [35]:
frame.columns = ['year', 'state', 'pop']

In [36]:
#slicing
frame[frame['pop'] < 3]

Unnamed: 0,year,state,pop
a,Ohio,2000,1.5
b,Ohio,2001,1.7
d,Nevada,2001,2.4
e,Nevada,2002,2.9


In [37]:
# ways to change the name of specific column. this is for specific cols. For all columns use .columns is ok. for index write index

import pandas as pd

df = pd.DataFrame({
    'A': [1, 2],
    'B': [3, 4]
})

df = df.rename(columns={'A': 'Alpha', 'B': 'Beta'})
print(df)

   Alpha  Beta
0      1     3
1      2     4


In [2]:
import pandas as pd

df = pd.DataFrame(
    { 'one' : [1, 3, 5, 3],
      'two' : [0, 3, 5, 3],
      'three' : [0, 3, 6, 3],
      'four' : [0, 3, 5, 3]
    }, index = ['ohio', 'colorado', 'utah', 'newyork'])

df

Unnamed: 0,one,two,three,four
ohio,1,0,0,0
colorado,3,3,3,3
utah,5,5,6,5
newyork,3,3,3,3


In [10]:
df.loc[df['three'] == 3] = 8
df

Unnamed: 0,one,two,three,four
ohio,1,0,0,0
colorado,8,8,8,8
utah,5,5,6,5
newyork,8,8,8,8


In [11]:
df

Unnamed: 0,one,two,three,four
ohio,1,0,0,0
colorado,8,8,8,8
utah,5,5,6,5
newyork,8,8,8,8


In [12]:
# do + operation on 2 series or dataframes 

In [17]:
import pandas as pd

# Create two Series with partially overlapping indices
s1 = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
s2 = pd.Series([1, 2, 3], index=['b', 'c', 'd'])

# Add the Series
result_series = s1 + s2
print(result_series)


a     NaN
b    21.0
c    32.0
d     NaN
dtype: float64


In [19]:
# Create two DataFrames with partially overlapping row and column indices
df1 = pd.DataFrame({
    'X': [1, 2],
    'Y': [3, 4]
}, index=['row1', 'row2'])

df2 = pd.DataFrame({
    'Y': [5, 6],
    'Z': [7, 8]
}, index=['row2', 'row3'])

# Add the DataFrames
result_df = df1 + df2
print(result_df)

# better option

r = df1.add(df2, fill_value = 0.0)   # here 0 is filled only if both dataframes index matches and one of them have nan else its nan only.
r

       X    Y   Z
row1 NaN  NaN NaN
row2 NaN  9.0 NaN
row3 NaN  NaN NaN


Unnamed: 0,X,Y,Z
row1,1.0,3.0,
row2,2.0,9.0,7.0
row3,,6.0,8.0


In [20]:
# we can also perform / operation.
# other operations include: sub, div, floordiv, mul, pow

In [4]:
import numpy as np
import pandas as pd

a = pd.DataFrame(np.arange(1, 16).reshape(3,5))
print(a)
print()


    0   1   2   3   4
0   1   2   3   4   5
1   6   7   8   9  10
2  11  12  13  14  15



## function application and mapping

In [12]:
import pandas as pd
a = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

frame = pd.DataFrame(a, index = ['a','b','c','d', 'e', 'f'])

def f1(z):
    return x.max() - x.min()

frame['pop'].apply(lambda x: frame['pop'].max() - frame['pop'].min()) # changes all values of pop to 2.1

a    2.1
b    2.1
c    2.1
d    2.1
e    2.1
f    2.1
Name: pop, dtype: float64

In [13]:
frame

Unnamed: 0,state,year,pop
a,Ohio,2000,1.5
b,Ohio,2001,1.7
c,Ohio,2002,3.6
d,Nevada,2001,2.4
e,Nevada,2002,2.9
f,Nevada,2003,3.2


In [16]:
import pandas as pd
import numpy as np
obj = pd.DataFrame(np.arange(4), index = ['a', 'c', 'd', 'b'])
obj.sort_index()
obj

Unnamed: 0,0
a,0
c,1
d,2
b,3


In [21]:
obj.sort_index(key=lambda x: [str(i) for i in x])

Unnamed: 0,0
a,0
b,3
c,1
d,2


In [None]:
# study sorting and rank