# Pandas Extended Guide

## Creating DataFrames and Series

In [None]:
# import pandas
import pandas as pd
import numpy as np

### creating Series

In [None]:
# build a Series from a plain Python list
ser = pd.Series(
    [3, 4, 5, 2],
    index=["student1", "student2", "student3", "student4"],  # one label per value
    dtype="float64",  # the integers are stored as floats
    name="Marks",
    copy=True,
)
ser


In [None]:
# build a Series from a dict: the keys become the index labels
data = dict(row1=2323, row2=808, row3=2353)
ser1 = pd.Series(
    data,
    index=["row1", "row2"],  # only the listed keys are kept; "row3" is dropped
    name="shity_column",
    dtype="string",  # the integer values are converted to strings
)
ser1

In [None]:
# build a Series from a NumPy array holding the integers 1..99
arr = np.arange(1, 100)
pand = pd.Series(data=arr, copy=True, name="series_from_ndarray")  # copy=True: do not share memory with arr
pand

### creating Data Frame

In [None]:
# turn an ndarray holding 0..99 into a DataFrame
datum = np.arange(100).reshape(20, -1)  # 20 rows; -1 lets NumPy infer the 5 columns
df_ndarr = pd.DataFrame(
    datum,
    columns=[f"col{n}" for n in range(1, 6)],  # "col1" .. "col5"
    copy=True,
)
df_ndarr

Unnamed: 0,col1,col2,col3,col4,col5
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24
5,25,26,27,28,29
6,30,31,32,33,34
7,35,36,37,38,39
8,40,41,42,43,44
9,45,46,47,48,49


In [35]:
# column name -> list of values (eight entries per column)
glos = {
    "col1": [1, 4, 3, 5, 66, 34, 234, 76],
    "col2": [1, 34, 3, 134, 66, 3454, 345, 2342],
    "col3": [435, 4, 345345, 234, 66, 23423, 234, 76],
}

# build the frame with explicit row labels 100..107
df_gloss = pd.DataFrame(
    data=glos,
    index=list(range(100, 108)),
    columns=["col1", "col2"],  # "col3" is not listed, so it is left out
)
df_gloss

Unnamed: 0,col1,col2
100,1,1
101,4,34
102,3,3
103,5,134
104,66,66
105,34,3454
106,234,345
107,76,2342


In [None]:
# data frame with a single unnamed column and string row labels
df = pd.DataFrame(
    data=list(range(100)),
    index=[f"a{n}" for n in range(100)],  # row labels "a0" .. "a99"
)
df

In [None]:
# data frame with two named columns built from a dict of equally long lists
df = pd.DataFrame(
    data={
        "number": list(range(100)),
        "char": [f"a{n}" for n in range(100)],
    }
)
df

In [None]:
# Single brackets with one column label return that column as a pandas Series.
df['number']

In [None]:
# A single-label selection is a Series, so this shows pandas.core.series.Series.
type(df['number'])

In [None]:
# A list of labels inside the brackets returns a DataFrame, even for one column.
df[['number']]

In [None]:
# A list-of-labels selection is a DataFrame, so this shows pandas.core.frame.DataFrame.
type(df[['number']])

### index alignment

In [37]:
# two Series whose indexes overlap only on the labels 2050..2099
ser1 = pd.Series(list(range(1000, 1100)), index=list(range(2000, 2100)), name="col1")
ser2 = pd.Series(list(range(7000, 7100)), index=list(range(2050, 2150)), name="col2")

# values are aligned by index label; a label missing from one Series yields NaN there
merged = pd.DataFrame({"col1": ser1, "col2": ser2})
merged

Unnamed: 0,col1,col2
2000,1000.0,
2001,1001.0,
2002,1002.0,
2003,1003.0,
2004,1004.0,
...,...,...
2145,,7095.0
2146,,7096.0
2147,,7097.0
2148,,7098.0


## Reading data sources

### Reading CSV

In [43]:
# Load the student dataset from a CSV file into a DataFrame.
# NOTE: parse_dates is deliberately not passed here. It should list only columns
# that actually hold dates; "famsize" holds text labels, so asking pandas to
# parse it as dates only produces a warning and leaves the column as object.
new_df = pd.read_csv(
    filepath_or_buffer="math_students.csv",  # path to the file
    sep=",",  # the delimiter between fields
    skiprows=0,  # number of leading lines to skip (0 = none)
    nrows=395,  # read at most this many data rows
    encoding="UTF-8",
)
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

  new_df = pd.read_csv(


### Writing into CSV

In [46]:
# Export two columns of new_df to a new CSV file, renaming them in the header row.
new_df.to_csv(
    "new_csv.csv",
    sep=",",
    columns=["school", "age"],  # only these columns are written
    header=["школа", "возраст"],  # aliases written in place of the column names
    index_label="index",  # header text for the index column
    index=True,  # also write the row index
)

### Preparing SQL

In [48]:
import sqlite3 as sq

def create_dummy_table(db="data.db", path="./math_students.csv", table_name="data_table"):
    """Load a CSV file into a table of a SQLite database.

    The CSV is read first, so a missing or unreadable file fails before the
    database is touched. An existing table with the same name is replaced,
    and the DataFrame index is not written as a column.

    Args:
        db: Path of the SQLite database file (created if it does not exist).
        path: Path of the CSV file to load.
        table_name: Name of the table to (re)create.
    """
    from contextlib import closing

    frame = pd.read_csv(path)
    # sqlite3's own context manager only commits or rolls back the transaction;
    # closing() makes sure the connection itself is released afterwards.
    with closing(sq.connect(db)) as con:
        with con:
            frame.to_sql(name=table_name, con=con, if_exists="replace", index=False)
# Fill the database using the default file, database and table names.
create_dummy_table()
# Query that selects every row and column of the table.
sql_query = "SELECT * FROM data_table"

### Reading SQL

In [None]:
from contextlib import closing

# Run the query and load the result set into a DataFrame.
# sqlite3's connection context manager only commits or rolls back; it does not
# close the connection, so closing() is used to release it after the read.
with closing(sq.connect("data.db")) as con:
    sql_read_df = pd.read_sql(sql=sql_query, con=con)
sql_read_df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


### Writing to SQL

In [None]:
from contextlib import closing

# Write the DataFrame into a new table of the same database.
# closing() releases the connection afterwards; the inner `with con` commits
# on success and rolls back if the write raises.
with closing(sq.connect("data.db")) as con:
    with con:
        sql_read_df.to_sql(
            name="new_table_damn",
            con=con,
            if_exists="fail",  # raise an error if the table already exists
        )
sql_read_df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


## Addressing an element

In [None]:
# Chained lookup: pick the "number" column, then the row labelled 3, and
# convert the result to a plain Python int.
int(df['number'][3]) # addressing a single element by column name and index label

In [None]:
# loc selects by label; a label slice includes both endpoints (6 and 12).
df.loc[6:12, "char"] # addressing by row labels and a column name

In [None]:
# Passing a list of column names returns a DataFrame with those columns in the given order.
df.loc[6:12, ["char", "number"]] # we can choose multiple rows and columns

In [None]:
# iloc selects by integer position; the end of each slice is excluded (rows 3-5, columns 0-1).
df.iloc[3:6, 0:2] # takes only integer positions of the rows and columns

In [None]:
# "number" is an integer column. Writing a string into it is an incompatible-dtype
# assignment (deprecated since pandas 2.1), so cast the column to object explicitly
# before overwriting part of it with text.
df["number"] = df["number"].astype(object)
df.iloc[3:6, 0:2] = "oh shit" # we can change the values in the fields
df.iloc[3:6, 0:2]

## Filtering data frame

### query

In [None]:
# query() filters rows using a boolean expression given as a string;
# column names are referenced directly inside the expression.
df.query('char == "oh shit"')

### loc

In [None]:
# Boolean-mask filtering: keep the rows where the comparison is True, and all columns (:).
df.loc[df.char == "oh shit", :]

### apply

In [None]:
def func(x):
    """Return *x* with the "_supershit" suffix appended."""
    return x + "_supershit"


# apply() calls func once per element of the column and returns a new Series
df.char.apply(func)

### numpy magic

In [None]:
# the suffix is appended to every element of the column in one vectorized operation
df["char"] += "_metashit"
df["char"]