In [1]:
import pandas as pd

In [2]:
s = pd.Series([10,20,30,40,50])
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [3]:
s.dtype

dtype('int64')

In [4]:
s.values

array([10, 20, 30, 40, 50])

In [5]:
s.index

RangeIndex(start=0, stop=5, step=1)

In [6]:
print(s.name)

None


In [7]:
s.name = "calories"

In [8]:
s

0    10
1    20
2    30
3    40
4    50
Name: calories, dtype: int64

In [9]:
# Indexing

s[0]

np.int64(10)

In [10]:
s[0:2] #start(included) : stop (excluded): step value (values to jump)

0    10
1    20
Name: calories, dtype: int64

In [11]:
s[2:4]

2    30
3    40
Name: calories, dtype: int64

In [12]:
# iloc -> location based indexing

s.iloc[3]

np.int64(40)

In [13]:
s.iloc[[1,3,4]]

1    20
3    40
4    50
Name: calories, dtype: int64

In [14]:
index = ["apple", "banana", "grapes", "orange", "strawberry"]

In [15]:
s.index = index
s

apple         10
banana        20
grapes        30
orange        40
strawberry    50
Name: calories, dtype: int64

In [16]:
s['grapes']

np.int64(30)

In [17]:
s.iloc[3]

np.int64(40)

In [18]:
#loc -> label based indexing.
s.loc[['grapes', 'apple']]

grapes    30
apple     10
Name: calories, dtype: int64

In [19]:
# In label based indexing your start as well as stop value both are included in the output.
s['banana':'orange']

banana    20
grapes    30
orange    40
Name: calories, dtype: int64

In [20]:
fruit_protein = {
    "Avocado": 2.0,       # grams of protein
    "Guava": 2.6,
    "Blackberries": 2.0,
    "Oranges": 0.9,
    "Banana": 1.1,
    "Apples": 0.3,
    "Kiwi": 1.1,
    "Pomegranate": 1.7,
    "Mango": 0.8,
    "Cherries": 1.0
}


In [21]:
s2 = pd.Series(fruit_protein, name = "Protein")
s2

Avocado         2.0
Guava           2.6
Blackberries    2.0
Oranges         0.9
Banana          1.1
Apples          0.3
Kiwi            1.1
Pomegranate     1.7
Mango           0.8
Cherries        1.0
Name: Protein, dtype: float64

In [22]:
# Conditional Selection:

s2[s2>1]

Avocado         2.0
Guava           2.6
Blackberries    2.0
Banana          1.1
Kiwi            1.1
Pomegranate     1.7
Name: Protein, dtype: float64

In [23]:
# Logical Operators: and, or, not

s2[(s2>0.5) | (s2 <= 2)]

Avocado         2.0
Guava           2.6
Blackberries    2.0
Oranges         0.9
Banana          1.1
Apples          0.3
Kiwi            1.1
Pomegranate     1.7
Mango           0.8
Cherries        1.0
Name: Protein, dtype: float64

In [24]:
# not operation:

s2[~(s2>1)]

Oranges     0.9
Apples      0.3
Mango       0.8
Cherries    1.0
Name: Protein, dtype: float64

In [25]:
# Modyfying the series:

s2["Mango"] = 2.8

In [26]:
s2

Avocado         2.0
Guava           2.6
Blackberries    2.0
Oranges         0.9
Banana          1.1
Apples          0.3
Kiwi            1.1
Pomegranate     1.7
Mango           2.8
Cherries        1.0
Name: Protein, dtype: float64

In [27]:
import numpy as np

In [28]:
ser = pd.Series(['a', np.nan, 1, np.nan, 2])
s.notnull().sum()

np.int64(5)

In [29]:
# DataFrame:

data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Eve", "Alice"],
    "Age": [25, 30, 35, np.nan, 29, 25],
    "Department": ["HR", "IT", "Finance", "IT", "HR", "HR"],
    "Salary": [50000, 60000, 70000, 62000, np.nan, 50000]
}


In [30]:
data

{'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Alice'],
 'Age': [25, 30, 35, nan, 29, 25],
 'Department': ['HR', 'IT', 'Finance', 'IT', 'HR', 'HR'],
 'Salary': [50000, 60000, 70000, 62000, nan, 50000]}

In [31]:
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Department,Salary
0,Alice,25.0,HR,50000.0
1,Bob,30.0,IT,60000.0
2,Charlie,35.0,Finance,70000.0
3,David,,IT,62000.0
4,Eve,29.0,HR,
5,Alice,25.0,HR,50000.0


In [32]:
df.head(2)

Unnamed: 0,Name,Age,Department,Salary
0,Alice,25.0,HR,50000.0
1,Bob,30.0,IT,60000.0


In [33]:
df.tail(3)

Unnamed: 0,Name,Age,Department,Salary
3,David,,IT,62000.0
4,Eve,29.0,HR,
5,Alice,25.0,HR,50000.0


In [34]:
# loc and iloc

df.iloc[1:3, :2] #rows, columns

Unnamed: 0,Name,Age
1,Bob,30.0
2,Charlie,35.0


In [35]:
df.loc[1:2, ["Age","Department"]]

Unnamed: 0,Age,Department
1,30.0,IT
2,35.0,Finance


In [36]:
df[["Age", "Department"]]

Unnamed: 0,Age,Department
0,25.0,HR
1,30.0,IT
2,35.0,Finance
3,,IT
4,29.0,HR
5,25.0,HR


In [37]:
df.drop("Age", axis = 1)

Unnamed: 0,Name,Department,Salary
0,Alice,HR,50000.0
1,Bob,IT,60000.0
2,Charlie,Finance,70000.0
3,David,IT,62000.0
4,Eve,HR,
5,Alice,HR,50000.0


In [38]:
df

Unnamed: 0,Name,Age,Department,Salary
0,Alice,25.0,HR,50000.0
1,Bob,30.0,IT,60000.0
2,Charlie,35.0,Finance,70000.0
3,David,,IT,62000.0
4,Eve,29.0,HR,
5,Alice,25.0,HR,50000.0


In [39]:
df.shape

(6, 4)

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        6 non-null      object 
 1   Age         5 non-null      float64
 2   Department  6 non-null      object 
 3   Salary      5 non-null      float64
dtypes: float64(2), object(2)
memory usage: 324.0+ bytes


In [41]:
df.describe()

Unnamed: 0,Age,Salary
count,5.0,5.0
mean,28.8,58400.0
std,4.147288,8532.291603
min,25.0,50000.0
25%,25.0,50000.0
50%,29.0,60000.0
75%,30.0,62000.0
max,35.0,70000.0


In [42]:
# Broadcasting

df["Salary"] = df["Salary"] + 5000

In [43]:
df["Salary"]

0    55000.0
1    65000.0
2    75000.0
3    67000.0
4        NaN
5    55000.0
Name: Salary, dtype: float64

In [44]:
# Renaming cokumns:

df.rename(columns = {"Department": "Dept"}, inplace = True)

In [45]:
df

Unnamed: 0,Name,Age,Dept,Salary
0,Alice,25.0,HR,55000.0
1,Bob,30.0,IT,65000.0
2,Charlie,35.0,Finance,75000.0
3,David,,IT,67000.0
4,Eve,29.0,HR,
5,Alice,25.0,HR,55000.0


In [46]:
df["Dept"].unique()

array(['HR', 'IT', 'Finance'], dtype=object)

In [47]:
df["Dept"].unique()


array(['HR', 'IT', 'Finance'], dtype=object)

In [48]:
df["Promoted Salary"] = df["Salary"] * 10

In [49]:
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Charlie,35.0,Finance,75000.0,750000.0
3,David,,IT,67000.0,670000.0
4,Eve,29.0,HR,,
5,Alice,25.0,HR,55000.0,550000.0


In [50]:
# Data Cleaning

df.isnull().sum()

Name               0
Age                1
Dept               0
Salary             1
Promoted Salary    1
dtype: int64

In [51]:
df.dropna(how = "any") # any row that had any null value

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Charlie,35.0,Finance,75000.0,750000.0
5,Alice,25.0,HR,55000.0,550000.0


In [52]:
df.dropna(how = "all") #if all the values in any row are null then we drop that row

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Charlie,35.0,Finance,75000.0,750000.0
3,David,,IT,67000.0,670000.0
4,Eve,29.0,HR,,
5,Alice,25.0,HR,55000.0,550000.0


In [53]:
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Charlie,35.0,Finance,75000.0,750000.0
3,David,,IT,67000.0,670000.0
4,Eve,29.0,HR,,
5,Alice,25.0,HR,55000.0,550000.0


In [54]:
df["Age"].fillna(df["Age"].mean())

0    25.0
1    30.0
2    35.0
3    28.8
4    29.0
5    25.0
Name: Age, dtype: float64

In [55]:
df["Salary"].fillna(df["Salary"].median())

0    55000.0
1    65000.0
2    75000.0
3    67000.0
4    65000.0
5    55000.0
Name: Salary, dtype: float64

In [57]:
df["Age"].fillna(method = "ffill")

  df["Age"].fillna(method = "ffill")


0    25.0
1    30.0
2    35.0
3    35.0
4    29.0
5    25.0
Name: Age, dtype: float64

In [58]:
df["Age"].fillna(method = "bfill")

  df["Age"].fillna(method = "bfill")


0    25.0
1    30.0
2    35.0
3    29.0
4    29.0
5    25.0
Name: Age, dtype: float64

In [59]:
df["Name"] = df["Name"].replace("Charlie", "Rose")

In [60]:
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Rose,35.0,Finance,75000.0,750000.0
3,David,,IT,67000.0,670000.0
4,Eve,29.0,HR,,
5,Alice,25.0,HR,55000.0,550000.0


In [61]:
# Duplicates:

df_dup = df[df.duplicated(keep = "last")]
df_dup

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0


In [62]:
df = df.drop_duplicates()

In [63]:
df


Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Rose,35.0,Finance,75000.0,750000.0
3,David,,IT,67000.0,670000.0
4,Eve,29.0,HR,,


In [64]:
# invalid values:
# Lambda -> python

df["Promoted Salary"] = df["Promoted Salary"].apply(lambda x: x/10 if x > 650000 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Promoted Salary"] = df["Promoted Salary"].apply(lambda x: x/10 if x > 650000 else x)


In [65]:
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Rose,35.0,Finance,75000.0,75000.0
3,David,,IT,67000.0,67000.0
4,Eve,29.0,HR,,


In [72]:
name = "alice_fernandes"
df[["first_name", "last_name"]] = df["name"].str.split("_")

KeyError: 'name'

In [None]:
# apply and lambda
def multiplying_age(x):
  return x*2

df["Age"] = df["Age"].apply(multiplying_age)

In [71]:
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,50.0,HR,55000.0,550000.0
1,Bob,60.0,IT,65000.0,650000.0
2,Rose,70.0,Finance,75000.0,75000.0
3,David,,IT,67000.0,67000.0
4,Eve,58.0,HR,,


In [73]:
df["Age"] = df["Age"].apply(lambda x: x/2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Age"] = df["Age"].apply(lambda x: x/2)


In [74]:
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Rose,35.0,Finance,75000.0,75000.0
3,David,,IT,67000.0,67000.0
4,Eve,29.0,HR,,


In [75]:
# Joins and Merges

department_info = {
    "Dept": ["HR", "IT", "Finance"],
    "Location": ["New York", "San Francisco", "Chicago"],
    "Manager": ["Laura", "Steve", "Nina"]
}

df2 = pd.DataFrame(department_info)
df2

Unnamed: 0,Dept,Location,Manager
0,HR,New York,Laura
1,IT,San Francisco,Steve
2,Finance,Chicago,Nina


In [76]:
# Concat

pd.concat([df, df2])

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary,Location,Manager
0,Alice,25.0,HR,55000.0,550000.0,,
1,Bob,30.0,IT,65000.0,650000.0,,
2,Rose,35.0,Finance,75000.0,75000.0,,
3,David,,IT,67000.0,67000.0,,
4,Eve,29.0,HR,,,,
0,,,HR,,,New York,Laura
1,,,IT,,,San Francisco,Steve
2,,,Finance,,,Chicago,Nina


In [77]:
pd.concat([df, df2], axis = 1)

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary,Dept.1,Location,Manager
0,Alice,25.0,HR,55000.0,550000.0,HR,New York,Laura
1,Bob,30.0,IT,65000.0,650000.0,IT,San Francisco,Steve
2,Rose,35.0,Finance,75000.0,75000.0,Finance,Chicago,Nina
3,David,,IT,67000.0,67000.0,,,
4,Eve,29.0,HR,,,,,


In [78]:
pd.merge(df, df2, on = "Dept")

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary,Location,Manager
0,Alice,25.0,HR,55000.0,550000.0,New York,Laura
1,Bob,30.0,IT,65000.0,650000.0,San Francisco,Steve
2,Rose,35.0,Finance,75000.0,75000.0,Chicago,Nina
3,David,,IT,67000.0,67000.0,San Francisco,Steve
4,Eve,29.0,HR,,,New York,Laura
