# Pandas Extended Guide

## Creating DF and Series

In [None]:
# import pandas
import pandas as pd
import numpy as np

### creating Series

In [None]:
# creating series from list
ser = pd.Series(
    data=[3, 4, 5, 2], 
    name="Marks", 
    copy=True,
    index=["student1", "student2", "student3", "student4"], # taking all the rows
    dtype="float64"
    )
ser


In [149]:
#creating series from dict
data = {
    "row1" : 2323,
    "row2" : 808,
    "row3" : 2353,
}
ser1 = pd.Series(
    data, 
    dtype="string", 
    name="shity_column", 
    index=["row1", "row2"] # taking only some of the rows
)
ser1

row1    2323
row2     808
Name: shity_column, dtype: string

In [None]:
# creating series from ndarray
arr = np.arange(1, 100)
pand = pd.Series(arr, name="series_from_ndarray", copy=True)
pand

### creating Data Frame

In [None]:
datum = np.arange(100).reshape(20, -1) # turning ndarray into dataframe
df_ndarr = pd.DataFrame(
    data= datum, 
    columns=["col1", "col2", "col3", "col4", "col5"],
    copy = True
)
df_ndarr

Unnamed: 0,col1,col2,col3,col4,col5
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24
5,25,26,27,28,29
6,30,31,32,33,34
7,35,36,37,38,39
8,40,41,42,43,44
9,45,46,47,48,49


In [35]:
glos = {
    "col1" : [1, 4, 3, 5, 66, 34, 234, 76],
    "col2" : [1, 34, 3, 134, 66, 3454, 345, 2342],
    "col3" : [435, 4, 345345, 234, 66, 23423, 234, 76]
}

df_gloss = pd.DataFrame(
    glos, 
    columns=["col1", "col2"],
    index=[i for i in range(100, 108)]
)
df_gloss

Unnamed: 0,col1,col2
100,1,1
101,4,34
102,3,3
103,5,134
104,66,66
105,34,3454
106,234,345
107,76,2342


In [None]:
# creating data frame
df = pd.DataFrame(
    [i for i in range(100)],
    ['a' + str(i) for i in range(100)]
)
df

In [None]:
# creating named data frame
df = pd.DataFrame({
    "number": [i for i in range(100)],
    "char": ['a' + str(i) for i in range(100)]
})
df

In [None]:
df['number']

In [None]:
type(df['number'])

In [None]:
df[['number']]

In [None]:
type(df[['number']])

### index allignment

In [37]:
ser1 = pd.Series(name="col1", data=[i for i in range(1000, 1100)], index=[i for i in range(2000, 2100)])
ser2 = pd.Series(name="col2", data=[i for i in range(7000, 7100)], index=[i for i in range(2050, 2150)])
merged = pd.DataFrame({
    "col1": ser1,
    "col2": ser2
})
merged

Unnamed: 0,col1,col2
2000,1000.0,
2001,1001.0,
2002,1002.0,
2003,1003.0,
2004,1004.0,
...,...,...
2145,,7095.0
2146,,7096.0
2147,,7097.0
2148,,7098.0


## Reading data sources

### Reading CSV

In [43]:
new_df = pd.read_csv(
    filepath_or_buffer="math_students.csv", # path to the file
    sep=',', # the delimeter between data
    skiprows=0,
    parse_dates=["famsize"], # the cols that should be parsed into date
    nrows= 395,
    encoding="UTF-8",
)
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

  new_df = pd.read_csv(


### Writing into CSV

In [46]:
new_df.to_csv(
    path_or_buf="new_csv.csv",
    sep=",",
    columns=["school", "age"],
    header=["школа", "возраст"],
    index= True,
    index_label="index"
)

### Preparing SQL

In [48]:
import sqlite3 as sq

def create_dummy_table(db="data.db", path="./math_students.csv", table_name="data_table"):
    with sq.connect(db) as con:
        cur = con.cursor()
        df = pd.read_csv(path)
        df.to_sql(name=table_name, if_exists="replace", index=False, con=con)
sql_query = '''SELECT * FROM data_table'''
create_dummy_table()

### Reading SQL

In [None]:
with sq.connect("data.db") as con:
    sql_read_df = pd.read_sql(
        sql = sql_query,
        con = con
    )
sql_read_df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


### Writing to SQL

In [None]:
with sq.connect("data.db") as con:
    sql_read_df.to_sql(
        con = con, 
        name="new_table_damn",
        if_exists="fail",
    )
sql_read_df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


## Writing to Python objects

In [None]:
# preparing data
df = pd.read_csv(
    filepath_or_buffer="math_students.csv", # path to the file
    sep=',', # the delimeter between data
    skiprows=0,
)

ser = pd.Series(
    np.arange(100),
    name = "nums",
    copy= True,
)

#### Series

In [None]:
ser.to_list() # returns list 
type(ser.to_list()) # list


ser.to_dict()
type(ser.to_dict()) # python dict


ser.to_numpy() # nd array

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

#### DataFrame

In [None]:
df.to_dict() # python dict of dicts, columns <-> keys
type(df.to_dict())


df.to_dict(orient="records") # list of dicts, ready to be converted to JSON


df.to_numpy() # getting an nd_array of rows


array([['GP', 'F', 18, ..., 5, 6, 6],
       ['GP', 'F', 17, ..., 5, 5, 6],
       ['GP', 'F', 15, ..., 7, 8, 10],
       ...,
       ['MS', 'M', 21, ..., 10, 8, 7],
       ['MS', 'M', 18, ..., 11, 12, 10],
       ['MS', 'M', 19, ..., 8, 9, 9]], shape=(395, 33), dtype=object)

## Indexes

In [None]:
index_list = list("abirvalg")
index_data = pd.Index(index_list, name="rows")
type(index_data)

pandas.core.indexes.base.Index

### Getting indexes

In [None]:
new_df = pd.DataFrame(
    data= np.arange(100).reshape(10, 10),
    copy=True
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,1,2,3,4,5,6,7,8,9
1,10,11,12,13,14,15,16,17,18,19
2,20,21,22,23,24,25,26,27,28,29
3,30,31,32,33,34,35,36,37,38,39
4,40,41,42,43,44,45,46,47,48,49
5,50,51,52,53,54,55,56,57,58,59
6,60,61,62,63,64,65,66,67,68,69
7,70,71,72,73,74,75,76,77,78,79
8,80,81,82,83,84,85,86,87,88,89
9,90,91,92,93,94,95,96,97,98,99


In [56]:
received_index = new_df.index
received_index

RangeIndex(start=0, stop=10, step=1)

In [57]:
received_columns = new_df.columns
received_columns

RangeIndex(start=0, stop=10, step=1)

### Changing indexes

In [59]:
new_df.index = pd.Index(list(range(10, 20)))
new_df.index

Index([10, 11, 12, 13, 14, 15, 16, 17, 18, 19], dtype='int64')

### Turning indexes into numpy or lists

In [62]:
listus = new_df.index.to_list()
type(listus)

list

In [63]:
nump = new_df.index.to_numpy()
type(nump)

numpy.ndarray

### Unique values

In [None]:
df = pd.read_csv(
    filepath_or_buffer= "math_students.csv",
    delimiter= ",",
)
index_data = df.index
column_index_data = df.columns
index_data.unique() # RangeIndex(start=0, stop=395, step=1)
index_data.nunique() # number of unique elements 395
index_data.duplicated() # boolean list showing wether the element repeats

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [69]:
index_data.name = "new index name"
column_index_data.name = "new column name"
df

new column name,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
new index name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


### Renaming columns

In [None]:
df.rename(
    columns={
        "school":"школа"
    }
)
# this method returns a new data frame

new column name,школа,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
new index name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


### Gaps in indexes

In [None]:
ind = pd.Index([0, 1, np.nan, 4, np.nan, np.nan], name = "row indexes")
df_nan = pd.DataFrame(
    data = df[:][:5],
    index=ind,
)
df_nan.index.hasnans # does index have nans?

True

In [77]:
df_nan.index.isna() # does index have nans?

array([False, False,  True, False,  True,  True])

In [79]:
df_nan.index.dropna()

Index([0.0, 1.0, 4.0], dtype='float64', name='row indexes')

## Access to data

In [88]:
df = pd.read_csv(
    filepath_or_buffer="math_students.csv", # path to the file
    sep=',', # the delimeter between data
    skiprows=0,
    parse_dates=["famsize"], # the cols that should be parsed into date
    nrows= 395,
    encoding="UTF-8",
)

df.index.rename("index", inplace=True)
df.set_index(
    ["school"],
    append = True,
    inplace= True
)
df

  df = pd.read_csv(


Unnamed: 0_level_0,Unnamed: 1_level_0,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
index,school,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,home,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,home,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,course,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,course,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,course,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,course,...,4,4,1,3,4,5,0,11,12,10


### deleting index

In [194]:
df = pd.read_csv(
    filepath_or_buffer="math_students.csv", # path to the file
    sep=',', # the delimeter between data
    skiprows=0,
    parse_dates=["famsize"], # the cols that should be parsed into date
    nrows= 395,
    encoding="UTF-8",
)
df.set_index("school", inplace=True)
df.reset_index()

  df = pd.read_csv(


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


### Getting data by index

In [95]:
df["school"] # returns series

index
0      GP
1      GP
2      GP
3      GP
4      GP
       ..
390    MS
391    MS
392    MS
393    MS
394    MS
Name: school, Length: 395, dtype: object

In [98]:
df[["school"]] # returns DataFrame

Unnamed: 0_level_0,school
index,Unnamed: 1_level_1
0,GP
1,GP
2,GP
3,GP
4,GP
...,...
390,MS
391,MS
392,MS
393,MS


In [105]:
df["school"] = "Engineering school"
df[["new_column"]] = "some_info"
df["new_new_column"] = df["age"]
df

Unnamed: 0_level_0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,goout,Dalc,Walc,health,absences,G1,G2,G3,new_column,new_new_column
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Engineering school,F,18,U,GT3,A,4,4,at_home,teacher,...,4,1,1,3,6,5,6,6,some_info,18
1,Engineering school,F,17,U,GT3,T,1,1,at_home,other,...,3,1,1,3,4,5,5,6,some_info,17
2,Engineering school,F,15,U,LE3,T,1,1,at_home,other,...,2,2,3,3,10,7,8,10,some_info,15
3,Engineering school,F,15,U,GT3,T,4,2,health,services,...,2,1,1,5,2,15,14,15,some_info,15
4,Engineering school,F,16,U,GT3,T,3,3,other,other,...,2,1,2,5,4,6,10,10,some_info,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,Engineering school,M,20,U,LE3,A,2,2,services,services,...,4,4,5,4,11,9,9,9,some_info,20
391,Engineering school,M,17,U,LE3,T,3,1,services,services,...,5,3,4,2,3,14,16,16,some_info,17
392,Engineering school,M,21,R,GT3,T,1,1,other,other,...,3,3,3,3,3,10,8,7,some_info,21
393,Engineering school,M,18,R,LE3,T,3,2,services,other,...,1,3,4,5,0,11,12,10,some_info,18


### deleting column from DataFrame

In [108]:
df["new_new_column"] = df["age"]
del df["new_new_column"]
df

Unnamed: 0_level_0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,new_column
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Engineering school,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,1,1,3,6,5,6,6,some_info
1,Engineering school,F,17,U,GT3,T,1,1,at_home,other,...,3,3,1,1,3,4,5,5,6,some_info
2,Engineering school,F,15,U,LE3,T,1,1,at_home,other,...,3,2,2,3,3,10,7,8,10,some_info
3,Engineering school,F,15,U,GT3,T,4,2,health,services,...,2,2,1,1,5,2,15,14,15,some_info
4,Engineering school,F,16,U,GT3,T,3,3,other,other,...,3,2,1,2,5,4,6,10,10,some_info
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,Engineering school,M,20,U,LE3,A,2,2,services,services,...,5,4,4,5,4,11,9,9,9,some_info
391,Engineering school,M,17,U,LE3,T,3,1,services,services,...,4,5,3,4,2,3,14,16,16,some_info
392,Engineering school,M,21,R,GT3,T,1,1,other,other,...,5,3,3,3,3,3,10,8,7,some_info
393,Engineering school,M,18,R,LE3,T,3,2,services,other,...,4,1,3,4,5,0,11,12,10,some_info


### using boolean mask on data frame

In [None]:
df[:3][[True, True, False]] # returns only first and second rows

# we can also change values like that

Unnamed: 0_level_0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,new_column
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Engineering school,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,1,1,3,6,5,6,6,some_info
1,Engineering school,F,17,U,GT3,T,1,1,at_home,other,...,3,3,1,1,3,4,5,5,6,some_info


### combining rows and columns selection

In [122]:
df[23:45]["Mjob"]

index
23       other
24    services
25    services
26       other
27      health
28    services
29     teacher
30      health
31    services
32     teacher
33       other
34       other
35       other
36     teacher
37       other
38    services
39     at_home
40       other
41     teacher
42    services
43    services
44       other
Name: Mjob, dtype: object

### another way of getting columns

In [123]:
df.school

index
0      Engineering school
1      Engineering school
2      Engineering school
3      Engineering school
4      Engineering school
              ...        
390    Engineering school
391    Engineering school
392    Engineering school
393    Engineering school
394    Engineering school
Name: school, Length: 395, dtype: object

### Getting data from series by index

In [176]:
ser = df["age"]
ser.name = "ages"
ser[1] # getting single element

ser[1:4] # getting a slice

1    17
2    15
3    15
Name: ages, dtype: int64

## .loc() and .iloc()

### Series

In [178]:
ser = df["age"]
ser.loc[1:10]

1     17
2     15
3     15
4     16
5     16
6     16
7     17
8     15
9     15
10    15
Name: ages, dtype: int64

In [179]:
ser.iloc[3:12]

3     15
4     16
5     16
6     16
7     17
8     15
9     15
10    15
11    15
Name: ages, dtype: int64

### DataFrame

In [None]:
df.loc[3] # returns the 4th row as an object

df.loc[1:3] # returns rows with indexes 1-3 as a dataframe

df.loc[1:3, ["school", "age"]] # returns rows with indexes 1-3 and column school as a series

df.loc[1:3, "school":"age"] # using slices to pick rows and columns

Unnamed: 0,school,sex,age
1,GP,F,17
2,GP,F,15
3,GP,F,15


In [None]:
df.iloc[3] # third row

df.iloc[3:9] # rows from 3 to 9

df.iloc[2:5, 4:9] # now we can get columns by their indexes

Unnamed: 0,famsize,Pstatus,Medu,Fedu,Mjob
2,LE3,T,1,1,at_home
3,GT3,T,4,2,health
4,GT3,T,3,3,other


## Iterating through keys and values

### series

In [None]:
ser = pd.Series(
    np.random.randint(1, 100, 100),
    name = "nums",
)
ser.keys() # returns the keys of the dict <-> series

ser.values # returns the values as a ndarray 

array([76, 38,  6, 64, 81, 16, 18, 72, 71, 44,  4, 50, 49, 38, 13,  6,  2,
       18, 15, 43, 77,  6, 51, 91, 68, 14, 55, 82, 44, 40, 55, 89, 44, 87,
       76, 16,  7, 33, 95, 66, 95, 59, 92, 34, 79, 48, 63,  3, 21, 70, 26,
       30, 42, 41, 79, 37,  3, 73, 93, 18, 47, 91, 48, 17, 76, 27, 59, 64,
       78, 84, 51, 49, 49, 46, 44, 28, 71, 49, 38, 30, 24, 30, 74, 37, 79,
       87, 71, 62, 28, 40, 71, 95, 76, 44, 13, 99, 10,  3, 33, 93])

### data frame

In [None]:
df.keys() # returns all the column labels

<bound method DataFrame.isetitem of        sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
school                                                                    
GP       F   18       U     GT3       A     4     4   at_home   teacher   
GP       F   17       U     GT3       T     1     1   at_home     other   
GP       F   15       U     LE3       T     1     1   at_home     other   
GP       F   15       U     GT3       T     4     2    health  services   
GP       F   16       U     GT3       T     3     3     other     other   
...     ..  ...     ...     ...     ...   ...   ...       ...       ...   
MS       M   20       U     LE3       A     2     2  services  services   
MS       M   17       U     LE3       T     3     1  services  services   
MS       M   21       R     GT3       T     1     1     other     other   
MS       M   18       R     LE3       T     3     2  services     other   
MS       M   19       U     LE3       T     1     1     other   

## Multiindex

In [124]:
df

Unnamed: 0_level_0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,new_column
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Engineering school,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,1,1,3,6,5,6,6,some_info
1,Engineering school,F,17,U,GT3,T,1,1,at_home,other,...,3,3,1,1,3,4,5,5,6,some_info
2,Engineering school,F,15,U,LE3,T,1,1,at_home,other,...,3,2,2,3,3,10,7,8,10,some_info
3,Engineering school,F,15,U,GT3,T,4,2,health,services,...,2,2,1,1,5,2,15,14,15,some_info
4,Engineering school,F,16,U,GT3,T,3,3,other,other,...,3,2,1,2,5,4,6,10,10,some_info
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,Engineering school,M,20,U,LE3,A,2,2,services,services,...,5,4,4,5,4,11,9,9,9,some_info
391,Engineering school,M,17,U,LE3,T,3,1,services,services,...,4,5,3,4,2,3,14,16,16,some_info
392,Engineering school,M,21,R,GT3,T,1,1,other,other,...,5,3,3,3,3,3,10,8,7,some_info
393,Engineering school,M,18,R,LE3,T,3,2,services,other,...,4,1,3,4,5,0,11,12,10,some_info


In [129]:
data = [[111, 222, 333, 444], ["ind1", "ind2", "ind3", "ind4"]] # length should be equal
mult_i = pd.MultiIndex.from_arrays(arrays=data, names=["level0", "level1"])
mult_i

MultiIndex([(111, 'ind1'),
            (222, 'ind2'),
            (333, 'ind3'),
            (444, 'ind4')],
           names=['level0', 'level1'])

In [127]:
mdf = pd.DataFrame(np.arange(16).reshape(4, 4), columns=mult_i)
mdf

level0,111,222,333,444
level1,ind1,ind2,ind3,ind4
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [133]:
import pandas as pd

df = pd.DataFrame({
    'year': [2023, 2023, 2024],
    'quarter': ['Q1', 'Q2', 'Q1'],
    'value': [100, 150, 200]
})
df

Unnamed: 0,year,quarter,value
0,2023,Q1,100
1,2023,Q2,150
2,2024,Q1,200


In [132]:
df_multi = df.set_index(['year', 'quarter'])
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,value
year,quarter,Unnamed: 2_level_1
2023,Q1,100
2023,Q2,150
2024,Q1,200


### from_arrays

In [135]:
df_multi.index = pd.MultiIndex.from_arrays(
    [[2023, 2023, 2024], ['Q1', 'Q2', 'Q1']],
    names=['year', 'quarter']
)
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,value
year,quarter,Unnamed: 2_level_1
2023,Q1,100
2023,Q2,150
2024,Q1,200


### from_tuples

In [137]:
df_multi.index = pd.MultiIndex.from_tuples(
    [(2023, 'Q1'), (2023, 'Q2'), (2024, 'Q1')],
    names=['year', 'quarter']
)
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,value
year,quarter,Unnamed: 2_level_1
2023,Q1,100
2023,Q2,150
2024,Q1,200


### set_index

In [147]:
df = pd.DataFrame({
    'year': [2023, 2023, 2024],
    'quarter': ['Q1', 'Q2', 'Q1'],
    'value': [100, 150, 200]
})
df_multi = df.set_index(['year', 'quarter'])
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,value
year,quarter,Unnamed: 2_level_1
2023,Q1,100
2023,Q2,150
2024,Q1,200


### set_levels()

In [148]:
# Пусть у нас уже есть MultiIndex
index = pd.MultiIndex.from_tuples([(2023, 'Q1'), (2023, 'Q2'), (2024, 'Q1')], names=['year', 'quarter'])
df = pd.DataFrame({'value': [100, 150, 200]}, index=index)

# Меняем уровень 0 (year) с 2023,2023,2024 на 1,1,2
df.index = df.index.set_levels([ [1, 2], ['Q1', 'Q2'] ])

df

Unnamed: 0_level_0,Unnamed: 1_level_0,value
year,quarter,Unnamed: 2_level_1
1,Q1,100
1,Q2,150
2,Q1,200


## Condiditon select

### Condition select on Series

In [None]:
s = pd.Series(
    data={
        "USA":28,
        "Russia":12,
        "China":10,
        "Korea":4,
        "Germany":2
    }
)

mask = s > 7 # getting boolean mask on the series
s[mask] # getting only those rows that -> true


USA       28
Russia    12
China     10
dtype: int64

### Condition select on DataFrame

In [162]:
df = pd.read_csv(
    filepath_or_buffer="./math_students.csv"
)
mask = ((df["age"] > 15) & (df["failures"] > 2))
df[mask]

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
18,GP,M,17,U,GT3,T,3,2,services,services,...,5,5,5,2,4,5,16,6,5,5
78,GP,M,17,U,GT3,T,2,1,other,other,...,4,5,1,1,1,3,2,8,8,10
127,GP,F,19,U,GT3,T,0,1,at_home,other,...,3,4,2,1,1,5,2,7,8,9
144,GP,M,17,U,GT3,T,2,1,other,other,...,5,4,5,1,2,5,0,5,0,0
150,GP,M,18,U,LE3,T,1,1,other,other,...,2,3,5,2,5,4,0,6,5,0
153,GP,M,19,U,GT3,T,3,2,services,at_home,...,4,5,4,1,1,4,0,5,0,0
157,GP,F,18,R,GT3,T,1,1,at_home,other,...,5,2,5,1,5,4,6,9,8,10
164,GP,M,17,R,LE3,T,1,1,other,services,...,5,3,5,1,5,5,0,5,8,7
173,GP,F,16,U,GT3,T,1,3,at_home,services,...,4,3,5,1,1,3,0,8,7,0
206,GP,F,16,U,GT3,A,3,1,services,other,...,2,3,3,2,2,4,5,7,7,7


## Filtering data frame

### query

In [None]:
df.query('char == "oh shit"')

### loc

In [None]:
df.loc[df.char == "oh shit", :]

### apply

In [None]:
func = lambda x : x + "_supershit"
df.char.apply(func)

### numpy magic

In [None]:
df.char += "_metashit"
df.char

## Operation with series

#### simple operations

In [219]:
score = pd.Series(
    np.random.randint(100000, 2000000, 100),
    name="scores"
)

score = (score / score.max()) * 100
score.name = "score in percent"
score

0     66.671234
1     40.253624
2     92.777560
3     19.782518
4     66.207709
        ...    
95    51.845091
96    56.080451
97    75.378333
98    32.955216
99    76.529279
Name: score in percent, Length: 100, dtype: float64

#### applying functions

In [223]:
score = pd.Series(
    np.random.randint(100000, 2000000, 100),
    name="scores"
)


powerize = lambda x : x**2
score.apply(powerize) # just applying lambda to all of the elements

0       74095562025
1      111343677124
2     3015335006784
3      819023190001
4     3871650457801
          ...      
95    2341206010000
96    1664959250889
97     250937877969
98     181768353649
99     277031690244
Name: scores, Length: 100, dtype: int64

In [None]:
score = pd.Series(
    np.random.randint(100000, 2000000, 100),
    name="scores"
)


score.max()

score.min()

score.mean() # average

score.median() # median value

score.var() # variation

score.std() # standart decline

score.sum() # sum of the series

np.int64(102640271)

#### multyplying series

In [233]:
usd_sales = pd.Series(np.random.randint(1, 100, 4), index=["jan", "feb", "march", "march"])

usd_to_rub = pd.Series(np.random.randint(80, 110, 12), index=["jan", "feb", "march", "april", "may", "june", "july", "august", "sept", "oct", "nov", "dec"])

sales_in_rub = usd_sales * usd_to_rub

sales_in_rub.dropna() # result without NaN

feb      1512.0
jan      1040.0
march     679.0
march    3686.0
dtype: float64

#### .multyply()

In [None]:
sales_in_rub = usd_sales.multiply(usd_to_rub, fill_value=0)

sales_in_rub

april        0.0
august       0.0
dec          0.0
feb       1512.0
jan       1040.0
july         0.0
june         0.0
march      679.0
march     3686.0
may          0.0
nov          0.0
oct          0.0
sept         0.0
dtype: float64

## Series dtype

In [None]:
ser = pd.Series(np.arange(100))

ser.dtype # dtype('int64')

ser = pd.Series([1, 3, "3"])

ser.dtype # dtype('O') <-> object

ser = pd.Series([1, 3, np.nan])

ser.dtype # dtype('float64')

ser = pd.Series([1, 3, pd.NA])

ser.dtype # dtype('O') <-> object

dtype('O')

#### Convertation

In [None]:
ser = pd.Series([1, 2, 3])
ser = ser.astype("float64")  # → float


pd.Series(['1', '2', '3']).astype(int)        # → int64
pd.Series([1.0, 2.0, 3.5]).astype(int)        # → floor round


pd.Series(['2020-01-01', '2021-01-01']).astype('datetime64[ns]')


pd.Series(['low', 'medium', 'high']).astype('category')


# nullable types

s = pd.Series([1, None])
s.astype("Int64")
s.astype("int64")      # raises an error

##### Type
<=>
##### Nullable type
int
'Int64'

float
'Float64'

bool
'boolean'

str
'string'

## Deleting Series row

In [None]:
usd_to_rub = pd.Series(np.random.randint(80, 110, 12), index=["jan", "feb", "march", "april", "may", "june", "july", "august", "sept", "oct", "nov", "dec"])

del usd_to_rub["feb"] # deletes single row

print("feb" in usd_to_rub.keys()) # False

# deletes multiple rows
usd_to_rub.drop(["march", "april", "may"], inplace=True)
usd_to_rub

jan       103
june       87
july       97
august     80
sept      102
oct        97
nov        81
dec       104
dtype: int64

38 08