# Pandas Extended Guide

## Creating DF and Series

In [1]:
# import pandas
import pandas as pd
import numpy as np

### creating Series

In [2]:
# creating series from list
ser = pd.Series(
    data=[3, 4, 5, 2], 
    name="Marks", 
    copy=True,
    index=["student1", "student2", "student3", "student4"], # taking all the rows
    dtype="float64"
    )
ser


student1    3.0
student2    4.0
student3    5.0
student4    2.0
Name: Marks, dtype: float64

In [3]:
#creating series from dict
data = {
    "row1" : 2323,
    "row2" : 808,
    "row3" : 2353,
}
ser1 = pd.Series(
    data, 
    dtype="string", 
    name="shity_column", 
    index=["row1", "row2"] # taking only some of the rows
)
ser1

row1    2323
row2     808
Name: shity_column, dtype: string

In [4]:
# creating series from ndarray
arr = np.arange(1, 100)
pand = pd.Series(arr, name="series_from_ndarray", copy=True)
pand

0      1
1      2
2      3
3      4
4      5
      ..
94    95
95    96
96    97
97    98
98    99
Name: series_from_ndarray, Length: 99, dtype: int64

### creating Data Frame

In [5]:
datum = np.arange(100).reshape(20, -1) # turning ndarray into dataframe
df_ndarr = pd.DataFrame(
    data= datum, 
    columns=["col1", "col2", "col3", "col4", "col5"],
    copy = True
)
df_ndarr

Unnamed: 0,col1,col2,col3,col4,col5
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24
5,25,26,27,28,29
6,30,31,32,33,34
7,35,36,37,38,39
8,40,41,42,43,44
9,45,46,47,48,49


In [6]:
glos = {
    "col1" : [1, 4, 3, 5, 66, 34, 234, 76],
    "col2" : [1, 34, 3, 134, 66, 3454, 345, 2342],
    "col3" : [435, 4, 345345, 234, 66, 23423, 234, 76]
}

df_gloss = pd.DataFrame(
    glos, 
    columns=["col1", "col2"],
    index=[i for i in range(100, 108)]
)
df_gloss

Unnamed: 0,col1,col2
100,1,1
101,4,34
102,3,3
103,5,134
104,66,66
105,34,3454
106,234,345
107,76,2342


In [7]:
# creating data frame
df = pd.DataFrame(
    [i for i in range(100)],
    ['a' + str(i) for i in range(100)]
)
df

Unnamed: 0,0
a0,0
a1,1
a2,2
a3,3
a4,4
...,...
a95,95
a96,96
a97,97
a98,98


In [8]:
# creating named data frame
df = pd.DataFrame({
    "number": [i for i in range(100)],
    "char": ['a' + str(i) for i in range(100)]
})
df

Unnamed: 0,number,char
0,0,a0
1,1,a1
2,2,a2
3,3,a3
4,4,a4
...,...,...
95,95,a95
96,96,a96
97,97,a97
98,98,a98


In [9]:
df['number']

0      0
1      1
2      2
3      3
4      4
      ..
95    95
96    96
97    97
98    98
99    99
Name: number, Length: 100, dtype: int64

In [10]:
type(df['number'])

pandas.core.series.Series

In [11]:
df[['number']]

Unnamed: 0,number
0,0
1,1
2,2
3,3
4,4
...,...
95,95
96,96
97,97
98,98


In [12]:
type(df[['number']])

pandas.core.frame.DataFrame

### index allignment

In [13]:
ser1 = pd.Series(name="col1", data=[i for i in range(1000, 1100)], index=[i for i in range(2000, 2100)])
ser2 = pd.Series(name="col2", data=[i for i in range(7000, 7100)], index=[i for i in range(2050, 2150)])
merged = pd.DataFrame({
    "col1": ser1,
    "col2": ser2
})
merged

Unnamed: 0,col1,col2
2000,1000.0,
2001,1001.0,
2002,1002.0,
2003,1003.0,
2004,1004.0,
...,...,...
2145,,7095.0
2146,,7096.0
2147,,7097.0
2148,,7098.0


## Reading data sources

### Reading CSV

In [14]:
new_df = pd.read_csv(
    filepath_or_buffer="math_students.csv", # path to the file
    sep=',', # the delimeter between data
    skiprows=0,
    parse_dates=["famsize"], # the cols that should be parsed into date
    nrows= 395,
    encoding="UTF-8",
)
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

  new_df = pd.read_csv(


### Writing into CSV

In [15]:
new_df.to_csv(
    path_or_buf="new_csv.csv",
    sep=",",
    columns=["school", "age"],
    header=["школа", "возраст"],
    index= True,
    index_label="index"
)

### Preparing SQL

In [16]:
import sqlite3 as sq

def create_dummy_table(db="data.db", path="./math_students.csv", table_name="data_table"):
    with sq.connect(db) as con:
        cur = con.cursor()
        df = pd.read_csv(path)
        df.to_sql(name=table_name, if_exists="replace", index=False, con=con)
sql_query = '''SELECT * FROM data_table'''
create_dummy_table()

### Reading SQL

In [17]:
with sq.connect("data.db") as con:
    sql_read_df = pd.read_sql(
        sql = sql_query,
        con = con
    )
sql_read_df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


### Writing to SQL

In [18]:
with sq.connect("data.db") as con:
    sql_read_df.to_sql(
        con = con, 
        name="new_table_damn",
        if_exists="fail",
    )
sql_read_df

ValueError: Table 'new_table_damn' already exists.

## Writing to Python objects

In [None]:
# preparing data
df = pd.read_csv(
    filepath_or_buffer="math_students.csv", # path to the file
    sep=',', # the delimeter between data
    skiprows=0,
)

ser = pd.Series(
    np.arange(100),
    name = "nums",
    copy= True,
)

#### Series

In [None]:
ser.to_list() # returns list 
type(ser.to_list()) # list


ser.to_dict()
type(ser.to_dict()) # python dict


ser.to_numpy() # nd array

#### DataFrame

In [None]:
df.to_dict() # python dict of dicts, columns <-> keys
type(df.to_dict())


df.to_dict(orient="records") # list of dicts, ready to be converted to JSON


df.to_numpy() # getting an nd_array of rows


## Indexes

In [None]:
index_list = list("abirvalg")
index_data = pd.Index(index_list, name="rows")
type(index_data)

### Getting indexes

In [None]:
new_df = pd.DataFrame(
    data= np.arange(100).reshape(10, 10),
    copy=True
)

In [None]:
received_index = new_df.index
received_index

In [None]:
received_columns = new_df.columns
received_columns

### Changing indexes

In [None]:
new_df.index = pd.Index(list(range(10, 20)))
new_df.index

### Turning indexes into numpy or lists

In [None]:
listus = new_df.index.to_list()
type(listus)

In [None]:
nump = new_df.index.to_numpy()
type(nump)

### Unique values

In [None]:
df = pd.read_csv(
    filepath_or_buffer= "math_students.csv",
    delimiter= ",",
)
index_data = df.index
column_index_data = df.columns
index_data.unique() # RangeIndex(start=0, stop=395, step=1)
index_data.nunique() # number of unique elements 395
index_data.duplicated() # boolean list showing wether the element repeats

In [None]:
index_data.name = "new index name"
column_index_data.name = "new column name"
df

### Renaming columns

In [None]:
df.rename(
    columns={
        "school":"школа"
    }
)
# this method returns a new data frame

### Gaps in indexes

In [None]:
ind = pd.Index([0, 1, np.nan, 4, np.nan, np.nan], name = "row indexes")
df_nan = pd.DataFrame(
    data = df[:][:5],
    index=ind,
)
df_nan.index.hasnans # does index have nans?

In [None]:
df_nan.index.isna() # does index have nans?

In [None]:
df_nan.index.dropna()

## Access to data

In [None]:
df = pd.read_csv(
    filepath_or_buffer="math_students.csv", # path to the file
    sep=',', # the delimeter between data
    skiprows=0,
    parse_dates=["famsize"], # the cols that should be parsed into date
    nrows= 395,
    encoding="UTF-8",
)

df.index.rename("index", inplace=True)
df.set_index(
    ["school"],
    append = True,
    inplace= True
)
df

### deleting index

In [None]:
df = pd.read_csv(
    filepath_or_buffer="math_students.csv", # path to the file
    sep=',', # the delimeter between data
    skiprows=0,
    parse_dates=["famsize"], # the cols that should be parsed into date
    nrows= 395,
    encoding="UTF-8",
)
df.set_index("school", inplace=True)
df.reset_index()

### Getting data by index

In [None]:
df["school"] # returns series

In [None]:
df[["school"]] # returns DataFrame

In [None]:
df["school"] = "Engineering school"
df[["new_column"]] = "some_info"
df["new_new_column"] = df["age"]
df

### deleting column from DataFrame

In [None]:
df["new_new_column"] = df["age"]
del df["new_new_column"]
df

### using boolean mask on data frame

In [None]:
df[:3][[True, True, False]] # returns only first and second rows

# we can also change values like that

### combining rows and columns selection

In [None]:
df[23:45]["Mjob"]

### another way of getting columns

In [None]:
df.school

### Getting data from series by index

In [None]:
ser = df["age"]
ser.name = "ages"
ser[1] # getting single element

ser[1:4] # getting a slice

## .loc() and .iloc()

### Series

In [None]:
ser = df["age"]
ser.loc[1:10]

In [None]:
ser.iloc[3:12]

### DataFrame

In [None]:
df.loc[3] # returns the 4th row as an object

df.loc[1:3] # returns rows with indexes 1-3 as a dataframe

df.loc[1:3, ["school", "age"]] # returns rows with indexes 1-3 and column school as a series

df.loc[1:3, "school":"age"] # using slices to pick rows and columns

In [None]:
df.iloc[3] # third row

df.iloc[3:9] # rows from 3 to 9

df.iloc[2:5, 4:9] # now we can get columns by their indexes

## Iterating through keys and values

### series

In [None]:
ser = pd.Series(
    np.random.randint(1, 100, 100),
    name = "nums",
)
ser.keys() # returns the keys of the dict <-> series

ser.values # returns the values as a ndarray 

### data frame

In [None]:
df.keys() # returns all the column labels

## Multiindex

In [None]:
df

In [None]:
data = [[111, 222, 333, 444], ["ind1", "ind2", "ind3", "ind4"]] # length should be equal
mult_i = pd.MultiIndex.from_arrays(arrays=data, names=["level0", "level1"])
mult_i

In [None]:
mdf = pd.DataFrame(np.arange(16).reshape(4, 4), columns=mult_i)
mdf

In [None]:
import pandas as pd

df = pd.DataFrame({
    'year': [2023, 2023, 2024],
    'quarter': ['Q1', 'Q2', 'Q1'],
    'value': [100, 150, 200]
})
df

In [None]:
df_multi = df.set_index(['year', 'quarter'])
df_multi

### from_arrays

In [None]:
df_multi.index = pd.MultiIndex.from_arrays(
    [[2023, 2023, 2024], ['Q1', 'Q2', 'Q1']],
    names=['year', 'quarter']
)
df_multi

### from_tuples

In [None]:
df_multi.index = pd.MultiIndex.from_tuples(
    [(2023, 'Q1'), (2023, 'Q2'), (2024, 'Q1')],
    names=['year', 'quarter']
)
df_multi

### set_index

In [None]:
df = pd.DataFrame({
    'year': [2023, 2023, 2024],
    'quarter': ['Q1', 'Q2', 'Q1'],
    'value': [100, 150, 200]
})
df_multi = df.set_index(['year', 'quarter'])
df_multi

### set_levels()

In [None]:
# Пусть у нас уже есть MultiIndex
index = pd.MultiIndex.from_tuples([(2023, 'Q1'), (2023, 'Q2'), (2024, 'Q1')], names=['year', 'quarter'])
df = pd.DataFrame({'value': [100, 150, 200]}, index=index)

# Меняем уровень 0 (year) с 2023,2023,2024 на 1,1,2
df.index = df.index.set_levels([ [1, 2], ['Q1', 'Q2'] ])

df

## Condiditon select

### Condition select on Series

In [None]:
s = pd.Series(
    data={
        "USA":28,
        "Russia":12,
        "China":10,
        "Korea":4,
        "Germany":2
    }
)

mask = s > 7 # getting boolean mask on the series
s[mask] # getting only those rows that -> true


### Condition select on DataFrame

In [None]:
df = pd.read_csv(
    filepath_or_buffer="./math_students.csv"
)
mask = ((df["age"] > 15) & (df["failures"] > 2))
df[mask]

## Filtering data frame

### query

In [None]:
df.query('char == "oh shit"')

### loc

In [None]:
df.loc[df.char == "oh shit", :]

### apply

In [None]:
func = lambda x : x + "_supershit"
df.char.apply(func)

### numpy magic

In [None]:
df.char += "_metashit"
df.char

## Operation with series

#### simple operations

In [None]:
score = pd.Series(
    np.random.randint(100000, 2000000, 100),
    name="scores"
)

score = (score / score.max()) * 100
score.name = "score in percent"
score

#### applying functions

In [None]:
score = pd.Series(
    np.random.randint(100000, 2000000, 100),
    name="scores"
)


powerize = lambda x : x**2
score.apply(powerize) # just applying lambda to all of the elements

In [None]:
score = pd.Series(
    np.random.randint(100000, 2000000, 100),
    name="scores"
)


score.max()

score.min()

score.mean() # average

score.median() # median value

score.var() # variation

score.std() # standart decline

score.sum() # sum of the series

#### multyplying series

In [None]:
usd_sales = pd.Series(np.random.randint(1, 100, 4), index=["jan", "feb", "march", "march"])

usd_to_rub = pd.Series(np.random.randint(80, 110, 12), index=["jan", "feb", "march", "april", "may", "june", "july", "august", "sept", "oct", "nov", "dec"])

sales_in_rub = usd_sales * usd_to_rub

sales_in_rub.dropna() # result without NaN

#### .multyply()

In [None]:
sales_in_rub = usd_sales.multiply(usd_to_rub, fill_value=0)

sales_in_rub

## Series dtype

In [None]:
ser = pd.Series(np.arange(100))

ser.dtype # dtype('int64')

ser = pd.Series([1, 3, "3"])

ser.dtype # dtype('O') <-> object

ser = pd.Series([1, 3, np.nan])

ser.dtype # dtype('float64')

ser = pd.Series([1, 3, pd.NA])

ser.dtype # dtype('O') <-> object

#### Convertation

In [None]:
ser = pd.Series([1, 2, 3])
ser = ser.astype("float64")  # → float


pd.Series(['1', '2', '3']).astype(int)        # → int64
pd.Series([1.0, 2.0, 3.5]).astype(int)        # → floor round


pd.Series(['2020-01-01', '2021-01-01']).astype('datetime64[ns]')


pd.Series(['low', 'medium', 'high']).astype('category')


# nullable types

s = pd.Series([1, None])
s.astype("Int64")
s.astype("int64")      # raises an error

##### Type
<=>
##### Nullable type
int
'Int64'

float
'Float64'

bool
'boolean'

str
'string'

## Deleting Series row

In [None]:
usd_to_rub = pd.Series(np.random.randint(80, 110, 12), index=["jan", "feb", "march", "april", "may", "june", "july", "august", "sept", "oct", "nov", "dec"])

del usd_to_rub["feb"] # deletes single row

print("feb" in usd_to_rub.keys()) # False

# deletes multiple rows
usd_to_rub.drop(["march", "april", "may"], inplace=True)
usd_to_rub

## Operations with DataFrames

#### Condition select

In [None]:
df = pd.read_csv(
    filepath_or_buffer="./math_students.csv",
    delimiter=",",
)

df[df["absences"]>25] # condidition select those who were absent more than 25 times

df[df["age"]>df["age"].mean()] # older than average level


#### DataFrame concatination and reindexation by Multiindex

In [None]:
from pandas import Index, DataFrame
winter_index = Index(data=['dec', 'jan', 'feb'])
summer_index = Index(data=['jun', 'jul', 'aug'])
year_index = Index(data=[
    'jan', 'feb', 'mar',
    'apr', 'may', 'jun',
    'jul', 'aug', 'sep',
    'oct', 'nov', 'dec'
])


winter_sales = DataFrame(
    index=winter_index,
    data={
        'sales': [100, 200, 300],
        'profit': [10, 20, 30]
    }
)


summer_sales = DataFrame(
    index=summer_index,
    data={
        'sales': [10, 20, 30],
        'profit': [1, 2, 3]
    }
)

data=[
    'jan', 'feb', 'mar',
    'apr', 'may', 'jun',
    'jul', 'aug', 'sep',
    'oct', 'nov', 'dec'
]


total_sales = pd.concat(objs=[winter_sales, summer_sales])
new_index = pd.MultiIndex.from_arrays(
    arrays=[
        (["H1" for _ in range(3)] + ["H2" for _ in range(3)]), 
        [
            'jan', 'feb','sep',
            'oct', 'nov', 'dec'
        ]]
)
total_sales.reset_index(inplace=True)
total_sales.index = new_index
total_sales

#### Sorting DataFrame

In [94]:
def get_df(path = "./math_students.csv"):
    df = pd.read_csv(
        filepath_or_buffer= path,
        delimiter=",",
        encoding="UTF-8",
    )
    return df.copy()

df = get_df()
df.sort_values(by=["age", "absences"], ascending=[True, False])

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
103,GP,F,15,U,GT3,T,3,2,services,other,...,4,3,5,1,1,2,26,7,6,6
69,GP,F,15,R,LE3,T,3,1,other,other,...,4,4,2,2,3,3,12,16,16,16
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
82,GP,F,15,U,LE3,T,3,2,services,other,...,4,4,4,1,1,5,10,7,6,6
105,GP,F,15,U,GT3,A,3,3,other,health,...,4,3,3,1,1,4,10,10,11,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
376,MS,F,20,U,GT3,T,4,2,health,other,...,5,4,3,1,1,3,4,15,14,15
306,GP,M,20,U,GT3,A,3,2,services,other,...,5,5,3,1,1,5,0,17,18,18
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7


#### Excel-like calculations

In [None]:
df = get_df(path="wage.csv")

work_month = pd.Index(np.random.randint(1, 12, 1006))
df["work_month"] = work_month

df["pre-total"] = df["work_month"] * df["wage"]
df["total"] = df["work_month"] * df["wage"]
df.loc[df["gender"]==1, "total"] *= 1.05

df["rate"] = 100 * df["total"] / df["total"].max() 
(df.loc[df["gender"]==1, "total"].sum() / df["total"].sum()) * 100

#### Applying functions to DataFrame

In [None]:
df = get_df(path="wage.csv")
work_month = pd.Index(np.random.randint(1, 12, 1006))
df["work_month"] = work_month
df["pre-total"] = df["work_month"] * df["wage"]
df["total"] = df["work_month"] * df["wage"]
df.loc[df["gender"]==1, "total"] *= 1.05

USD_to_RUB = lambda x : x*100

df["total_rub"] = df["total"]
df["total_rub"] = df["total_rub"].apply(USD_to_RUB) # applying lambda function

##### Function with direction

In [None]:
quarterly_financials = pd.DataFrame(
    data=[
        [1788, 1932, 1954, 2413],  # Revenue
        [612, 749, 778, 594],      # Operating income
        [505, 580, 604, 508]       # Net income
    ],
    columns=['1q', '2q', '3q', '4q'],
    index=['Revenue', 'Operating income', 'Net income']
)


total_by_year = quarterly_financials.sum( # choosing the direction
    axis='columns',
)
quarterly_financials["total"] = total_by_year

In [None]:
# long way
quarterly_financials.loc["Revenue, %"] = quarterly_financials.loc["Revenue"] * 100 / quarterly_financials.loc["Revenue", "total"]
quarterly_financials.loc["Operating income, %"] = quarterly_financials.loc["Operating income"] * 100 / quarterly_financials.loc["Operating income", "total"]
quarterly_financials.loc["Net income, %"] = quarterly_financials.loc["Net income"] * 100 / quarterly_financials.loc["Net income", "total"]
quarterly_financials

In [None]:
# another way is to devide a DF on Ser along rows
quarterly_financials = pd.DataFrame(
    data=[
        [1788, 1932, 1954, 2413],  # Revenue
        [612, 749, 778, 594],      # Operating income
        [505, 580, 604, 508]       # Net income
    ],
    columns=['1q', '2q', '3q', '4q'],
    index=['Revenue', 'Operating income', 'Net income']
)


total_by_year = quarterly_financials.sum( # choosing the direction
    axis='columns',
)
quarterly_financials["total"] = total_by_year
quarterly_financials.loc["Revenue, %"] = quarterly_financials.loc["Revenue"]
quarterly_financials.loc["Operating income, %"] = quarterly_financials.loc["Operating income"]
quarterly_financials.loc["Net income, %"] = quarterly_financials.loc["Net income"]
quarterly_financials.loc["Revenue, %":"Net income, %"] = quarterly_financials.loc[
    "Revenue, %":"Net income, %"
].divide(quarterly_financials.loc[
    ['Revenue, %', 'Operating income, %', 'Net income, %'], "total"] / 100,
    axis=0
)
quarterly_financials


#### Advanced axis functions

In [44]:
stock_prices = pd.DataFrame(
    data={
        'MSFT': [417, 416, 417, 420, 430],
        'IBM':  [224, 222, 219, 221, 220],
        'AAPL': [225, 225, 226, 226, 233]
    },
    index=[
        '2024-10-03',
        '2024-10-02',
        '2024-10-01',
        '2024-09-30',
        '2024-09-29'
    ]
)

In [45]:
stock_prices.loc["period_mean"] = stock_prices.mean(axis=0)
stock_prices.loc["period_median"] = stock_prices.median(axis=0)
stock_prices

Unnamed: 0,MSFT,IBM,AAPL
2024-10-03,417.0,224.0,225.0
2024-10-02,416.0,222.0,225.0
2024-10-01,417.0,219.0,226.0
2024-09-30,420.0,221.0,226.0
2024-09-29,430.0,220.0,233.0
period_mean,420.0,221.2,227.0
period_median,418.5,221.1,226.0


In [46]:
stock_prices.loc[:, "daily_total_mean"] = stock_prices.mean(axis=1)
stock_prices = pd.concat([stock_prices, stock_prices.loc["2024-10-03":"2024-09-29"].divide(stock_prices.loc["period_mean"], axis=1)])
stock_prices

Unnamed: 0,MSFT,IBM,AAPL,daily_total_mean
2024-10-03,417.0,224.0,225.0,288.666667
2024-10-02,416.0,222.0,225.0,287.666667
2024-10-01,417.0,219.0,226.0,287.333333
2024-09-30,420.0,221.0,226.0,289.0
2024-09-29,430.0,220.0,233.0,294.333333
period_mean,420.0,221.2,227.0,289.4
period_median,418.5,221.1,226.0,288.533333
2024-10-03,0.992857,1.012658,0.991189,0.997466
2024-10-02,0.990476,1.003617,0.991189,0.994011
2024-10-01,0.992857,0.990054,0.995595,0.992859


#### Some practice

In [None]:
nums = pd.DataFrame(np.tile(np.arange(10), (10, 1)))
nums


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,1,2,3,4,5,6,7,8,9
1,0,1,2,3,4,5,6,7,8,9
2,0,1,2,3,4,5,6,7,8,9
3,0,1,2,3,4,5,6,7,8,9
4,0,1,2,3,4,5,6,7,8,9
5,0,1,2,3,4,5,6,7,8,9
6,0,1,2,3,4,5,6,7,8,9
7,0,1,2,3,4,5,6,7,8,9
8,0,1,2,3,4,5,6,7,8,9
9,0,1,2,3,4,5,6,7,8,9


In [None]:
nums.subtract(np.arange(10), axis=1) # vector-row
nums.subtract(np.arange(10), axis=0) # vector-column

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,1,2,3,4,5,6,7,8,9
1,-1,0,1,2,3,4,5,6,7,8
2,-2,-1,0,1,2,3,4,5,6,7
3,-3,-2,-1,0,1,2,3,4,5,6
4,-4,-3,-2,-1,0,1,2,3,4,5
5,-5,-4,-3,-2,-1,0,1,2,3,4
6,-6,-5,-4,-3,-2,-1,0,1,2,3
7,-7,-6,-5,-4,-3,-2,-1,0,1,2
8,-8,-7,-6,-5,-4,-3,-2,-1,0,1
9,-9,-8,-7,-6,-5,-4,-3,-2,-1,0


In [70]:
nums * np.arange(10)
nums.multiply(np.arange(10), axis=0) # vector-column
nums.multiply(np.arange(10), axis=1) # vector-row

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,1,4,9,16,25,36,49,64,81
1,0,1,4,9,16,25,36,49,64,81
2,0,1,4,9,16,25,36,49,64,81
3,0,1,4,9,16,25,36,49,64,81
4,0,1,4,9,16,25,36,49,64,81
5,0,1,4,9,16,25,36,49,64,81
6,0,1,4,9,16,25,36,49,64,81
7,0,1,4,9,16,25,36,49,64,81
8,0,1,4,9,16,25,36,49,64,81
9,0,1,4,9,16,25,36,49,64,81


In [73]:
nums.divide(np.arange(10), axis=0) # vector-column
nums.divide(np.arange(10), axis=1) # vector-row

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


##### Split and Join methods

In [None]:
def get_employees():
    return pd.DataFrame(data={
        'fio': [
            'Прохоров Фёдор Артурович',
            'Копылов Иван Ибрагимович',
            'Егоров Матвей Викторович',
            'Ефимов Александр Николаевич',
            'Новикова Елизавета Константиновна',
            'Масленников Мирон Фёдорович',
            'Крылов Кирилл Михайлович',
            'Попова Надежда Борисовна',
            'Васильева София Давидовна',
            'Калмыкова Василиса Александровна'
        ]
    })

employees = get_employees()
employees = employees["fio"].str.split(
    pat=" ",
    expand= True
).set_axis(
    ["first", "second", "last"],
    axis = 1
)

Unnamed: 0,first,second,last
0,Прохоров,Фёдор,Артурович
1,Копылов,Иван,Ибрагимович
2,Егоров,Матвей,Викторович
3,Ефимов,Александр,Николаевич
4,Новикова,Елизавета,Константиновна
5,Масленников,Мирон,Фёдорович
6,Крылов,Кирилл,Михайлович
7,Попова,Надежда,Борисовна
8,Васильева,София,Давидовна
9,Калмыкова,Василиса,Александровна


In [92]:
# now join mfs back
employees_joined = pd.DataFrame()
employees_joined["full name"] = employees.agg(" ".join, axis=1)
employees_joined

Unnamed: 0,full name
0,Прохоров Фёдор Артурович
1,Копылов Иван Ибрагимович
2,Егоров Матвей Викторович
3,Ефимов Александр Николаевич
4,Новикова Елизавета Константиновна
5,Масленников Мирон Фёдорович
6,Крылов Кирилл Михайлович
7,Попова Надежда Борисовна
8,Васильева София Давидовна
9,Калмыкова Василиса Александровна


#### Renaming data in DataFrame

In [None]:
df = get_df()
assert "school" in df.columns and "sex" in df.columns
df.rename(
    columns= {
        "school":"education",
        "sex":"gender",
    },
    inplace=True
)
df.columns.tolist()

### Dammit iterations

In [96]:
df = get_df("./wage.csv")
for i in df.index:
    for j in df.columns:
        print(df.loc[i, j], end="\t")
    print()

0	1	46793.6038108271	
1	1	33481.57572005241	
2	1	44523.69908406118	
3	1	15995.576829130952	
4	0	10282.631223900877	
5	1	65464.53228148786	
6	1	35395.17245409278	
7	1	139610.8543652314	
8	1	54813.50695725651	
9	1	12196.08391898407	
10	1	30489.46661003692	
11	0	3249.605291476977	
12	1	23547.66328082151	
12	1	23547.66328082151	
13	1	273483.60923594295	
13	1	273483.60923594295	
14	1	13434.655493289682	
14	1	13434.655493289682	
15	0	68341.33216316313	
15	0	68341.33216316313	
16	0	35448.04087759186	
16	0	35448.04087759186	
17	1	20178.985060062987	
17	1	20178.985060062987	
18	0	166655.8082740009	
19	0	9808.005933838096	
20	1	5145.156039682276	
21	1	8375.630784677518	
22	1	92721.847913788	
23	0	-32790.25112805041	
24	0	44093.97871506778	
25	0	47579.57903775951	
26	0	1642.7151702418	
27	1	20020.445463382228	
28	1	-287418.64574330836	
29	0	21648.22544335491	
30	0	17369.868542161905	
31	0	18927.67410536637	
32	1	50761.923417613354	
33	0	-34658.11494037311	
34	0	5087.99259683909	
35	0	150636.16325

#### Adding and deleting columns so DF

In [None]:
empty = pd.DataFrame()
empty["nums"] = [i for i in range(100)]


empty = empty.set_index("nums")


empty["more_nums"] = [i for i in range(100)]
empty.columns = ["numbers"]


del empty["numbers"]

empty.reset_index(inplace=True)
del empty["nums"]

Unnamed: 0,index
0,0
1,1
2,2
3,3
4,4
...,...
95,95
96,96
97,97
98,98


## Missed data

In [None]:
def get_regional_sale_data():
    df = DataFrame(data=[
        ['south', 'pc', 120, 73182],
        ['south', 'laptops', 150, 90873],
        ['south', 'tablets', 200, 65117],
        ['north', 'pc', 15, 71764],
        ['north', 'laptops', 18, 88234],
        ['north', 'tablets', 30, None],  # no data for price
        ['east', 'pc', 87, 71764],
        ['east', 'laptops', None, 89245],  # no data for quantity
        ['east', 'tablets', 112, 63485],
        ['west', 'pc', 16, 72245],
        ['west', 'laptops', 23, 85123],  # no data for quantity
        ['west', 'tablets', 14, 53213]
    ],
    columns=['region', 'product', 'quantity', 'price'])
    
    return df