# Pandas Basics

- Import
- Access cols and rows
- Aggregate, filter, sort
- Inspect a DataFrame (info, summary statistics, columns, index)
- Generate a pandas DataFrame from a NumPy array

In [27]:
import pandas as pd
import numpy as np

In [28]:
# Load the salary table. index_col=0 turns the first CSV column into the
# row index instead of a regular data column.
salaries_csv = '../data/salaries.csv'
df = pd.read_csv(salaries_csv, index_col=0)
print(df)

        Salary  Age
Name               
John     50000   34
Sally   120000   45
Alyssa   80000   27


## Use Columns and Rows

In [29]:
# Selecting a single column label yields a Series that keeps the row index.
salary_series = df.loc[:, 'Salary']
print(salary_series)

Name
John       50000
Sally     120000
Alyssa     80000
Name: Salary, dtype: int64


In [30]:
# Label-based lookup of one row, restricted to the two listed columns.
# A single row label gives back a Series indexed by the column names.
print(df.loc["John", ['Age', 'Salary']])

Age          34
Salary    50000
Name: John, dtype: int64


In [31]:
# A list of row labels gives back a DataFrame; rows and columns come out in
# the order they are listed here, not in the order stored in df.
print(df.loc[["Sally", "John"], ['Age', 'Salary']])

       Age  Salary
Name              
Sally   45  120000
John    34   50000


## Aggregation / Filters / Sort

In [32]:
# Aggregate: arithmetic mean of the Age column.
print(df["Age"].mean())
print("\n")

# Element-wise comparison produces a boolean Series aligned with df's index.
age_filter = df["Age"].gt(30)
print(age_filter)
print("\n")

# Boolean mask selection keeps only the rows where the mask is True.
print(df.loc[age_filter])
print("\n")

35.333333333333336


Name
John       True
Sally      True
Alyssa    False
Name: Age, dtype: bool


       Salary  Age
Name              
John    50000   34
Sally  120000   45




In [33]:
# Quick sanity check on the distinct values in a column, e.g. to confirm a
# categorical column only holds the expected categories.
# nunique() gives the count of distinct values (NaN is not counted by default);
# unique() returns the distinct values themselves as an array.
age_values = df["Age"]
print(age_values.nunique())

print(age_values.unique())

3
[34 45 27]


In [34]:
# sort_values returns a new, sorted frame (df itself is left unchanged);
# ascending=False puts the highest salary first.
print(df.sort_values(by="Salary", ascending=False))

        Salary  Age
Name               
Sally   120000   45
Alyssa   80000   27
John     50000   34


## Info

In [35]:
# info() writes its summary straight to stdout and returns None, so it is
# called on its own. Wrapping it in print() would emit a stray "None" line
# after the summary.
df.info()
print("\n")

# Summary statistics (count, mean, std, min, quartiles, max) per column.
print(df.describe())
print("\n")

# Column labels.
print(df.columns)
print("\n")

# Row labels.
print(df.index)
print("\n")

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, John to Alyssa
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Salary  3 non-null      int64
 1   Age     3 non-null      int64
dtypes: int64(2)
memory usage: 180.0+ bytes
None


              Salary        Age
count       3.000000   3.000000
mean    83333.333333  35.333333
std     35118.845843   9.073772
min     50000.000000  27.000000
25%     65000.000000  30.500000
50%     80000.000000  34.000000
75%    100000.000000  39.500000
max    120000.000000  45.000000


Index(['Salary', 'Age'], dtype='object')


Index(['John', 'Sally', 'Alyssa'], dtype='object', name='Name')




## Numpy to Pandas

In [36]:
# Build a 5x10 integer matrix holding 0..49, filled row by row.
mat = np.arange(0, 50).reshape(5, 10)

# Pass the column labels as a flat list. Wrapping that list in another list
# (columns=[[...]]) makes pandas build a one-level MultiIndex whose labels are
# tuples like ('a',). It prints the same, but changes how plain label access
# such as df_mat["a"] behaves.
df_mat = pd.DataFrame(
    data=mat,
    index=["A", "B", "C", "D", "E"],
    columns="a b c d e f g h i j".split(" "),
)
print(df_mat)

    a   b   c   d   e   f   g   h   i   j
A   0   1   2   3   4   5   6   7   8   9
B  10  11  12  13  14  15  16  17  18  19
C  20  21  22  23  24  25  26  27  28  29
D  30  31  32  33  34  35  36  37  38  39
E  40  41  42  43  44  45  46  47  48  49
