# Binning - qcut

In [10]:
import pandas as pd
import csv

# Read the tennis dataset and coerce every column label to str.
data = pd.read_csv("PlayTennis.csv")
string_labels = data.columns.astype(str)
data.columns = string_labels

# Number the rows 0..n-1 (n is taken from the 'Play' column), then split
# that running number into four quantile-based bins.
row_count = len(data['Play'])
data['Numeric'] = range(row_count)
pd.qcut(data['Numeric'], 4)

0     (-0.001, 3.25]
1     (-0.001, 3.25]
2     (-0.001, 3.25]
3     (-0.001, 3.25]
4        (3.25, 6.5]
5        (3.25, 6.5]
6        (3.25, 6.5]
7        (6.5, 9.75]
8        (6.5, 9.75]
9        (6.5, 9.75]
10      (9.75, 13.0]
11      (9.75, 13.0]
12      (9.75, 13.0]
13      (9.75, 13.0]
Name: Numeric, dtype: category
Categories (4, interval[float64]): [(-0.001, 3.25] < (3.25, 6.5] < (6.5, 9.75] < (9.75, 13.0]]

# Random Sampling

In [42]:
import random

# Population to draw from: the integers 0 through 9.
l = [*range(10)]
print(l)

# Pick a single element at random.
print(random.choice(l))

# Sample without replacement (no element is picked twice).
print(random.sample(l, 5))

# Sample with replacement (the same element may be picked again).
print(random.choices(l, k=5))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
4
[3, 6, 7, 2, 8]
[3, 7, 8, 3, 7]


# Select multiple rows and columns using .loc

In [5]:
import pandas as pd

# Demo table: one row per country; keyword order fixes the column order.
countries = pd.DataFrame(dict(
    country=['United States', 'The Netherlands', 'Spain', 'Mexico', 'Australia'],
    capital=['Washington D.C.', 'Amsterdam', 'Madrid', 'Mexico City', 'Canberra'],
    continent=['North America', 'Europe', 'Europe', 'North America', 'Australia'],
    language=['English', 'Dutch', 'Spanish', 'Spanish', 'English'],
))

# Keep every row; the label slice covers 'country' through 'continent',
# with both end labels included in the result.
countries.loc[:, 'country':'continent']

Unnamed: 0,country,capital,continent
0,United States,Washington D.C.,North America
1,The Netherlands,Amsterdam,Europe
2,Spain,Madrid,Europe
3,Mexico,Mexico City,North America
4,Australia,Canberra,Australia


# Filter DataFrames by Category

In [2]:
import pandas as pd

# Demo table: one row per country.
countries = pd.DataFrame({
    'country': ['United States', 'The Netherlands', 'Spain', 'Mexico', 'Australia'],
    'capital': ['Washington D.C.', 'Amsterdam', 'Madrid', 'Mexico City', 'Canberra'],
    'continent': ['North America', 'Europe', 'Europe', 'North America', 'Australia'],
    'language': ['English', 'Dutch', 'Spanish', 'Spanish', 'English']})

# A notebook cell only renders its final expression, so a bare filter in the
# middle of the cell is computed and thrown away. Bind each result to a name
# so neither is lost.

# Rows whose continent is exactly 'Europe'.
european_countries = countries[countries['continent'] == 'Europe']

# Rows whose language is any of the listed values.
dutch_or_english_speaking = countries[countries['language'].isin(['Dutch', 'English'])]

# Rendered as the cell output.
dutch_or_english_speaking

Unnamed: 0,country,capital,continent,language
0,United States,Washington D.C.,North America,English
1,The Netherlands,Amsterdam,Europe,Dutch
4,Australia,Canberra,Australia,English


# Filter DataFrames by excluding categories

In [4]:
import pandas as pd

# Demo table: one row per country.
countries = pd.DataFrame({
    'country': ['United States', 'The Netherlands', 'Spain', 'Mexico', 'Australia'],
    'capital': ['Washington D.C.', 'Amsterdam', 'Madrid', 'Mexico City', 'Canberra'],
    'continent': ['North America', 'Europe', 'Europe', 'North America', 'Australia'],
    'language': ['English', 'Dutch', 'Spanish', 'Spanish', 'English']})

# A notebook cell only renders its final expression, so a bare filter in the
# middle of the cell is computed and thrown away. Bind each result to a name
# so neither is lost.

# `~` inverts the boolean mask: keep rows whose continent is NOT listed.
non_european_countries = countries[~countries['continent'].isin(['Europe'])]

# Keep rows whose language is NOT one of the listed values.
other_language_countries = countries[~countries['language'].isin(['Dutch', 'English'])]

# Rendered as the cell output.
other_language_countries

Unnamed: 0,country,capital,continent,language
2,Spain,Madrid,Europe,Spanish
3,Mexico,Mexico City,North America,Spanish


# Rename columns

In [5]:
import pandas as pd

# Demo table: one row per country.
countries = pd.DataFrame({
    'country': ['United States', 'The Netherlands', 'Spain', 'Mexico', 'Australia'],
    'capital': ['Washington D.C.', 'Amsterdam', 'Madrid', 'Mexico City', 'Canberra'],
    'continent': ['North America', 'Europe', 'Europe', 'North America', 'Australia'],
    'language': ['English', 'Dutch', 'Spanish', 'Spanish', 'English']})

# Rename by explicit old -> new mapping instead of assigning a positional list
# to `.columns`: the mapping does not depend on column order and only names
# the columns that actually change.
countries = countries.rename(columns={
    'capital': 'capital_city',
    'language': 'most_spoken_language',
})

# Split a DataFrame into two random subsets

In [10]:
import pandas as pd

# Demo table: one row per country.
countries = pd.DataFrame(dict(
    country=['United States', 'The Netherlands', 'Spain', 'Mexico', 'Australia'],
    capital=['Washington D.C.', 'Amsterdam', 'Madrid', 'Mexico City', 'Canberra'],
    continent=['North America', 'Europe', 'Europe', 'North America', 'Australia'],
    language=['English', 'Dutch', 'Spanish', 'Spanish', 'English'],
))

# First subset: a 60% random draw of the rows; the fixed seed makes the draw
# repeatable from run to run.
countries_1 = countries.sample(frac=0.6, random_state=999)

# Second subset: every row whose index label was not drawn above.
drawn_mask = countries.index.isin(countries_1.index)
countries_2 = countries.loc[~drawn_mask]

# Create dummy variables

In [12]:
import pandas as pd

# Two string-valued columns to be one-hot encoded.
students = pd.DataFrame({
    'name': ['Ben', 'Tina', 'John', 'Eric'],
    'gender': ['male', 'female', 'male', 'male']})

# One indicator column per distinct value, named "<column>_<value>".
# `dtype=int` pins the indicators to 0/1 integers: recent pandas releases
# default to booleans, which would render as True/False instead.
student_dummies = pd.get_dummies(students, dtype=int)

# Rendered as the cell output.
student_dummies

Unnamed: 0,name_Ben,name_Eric,name_John,name_Tina,gender_female,gender_male
0,1,0,0,0,0,1
1,0,0,0,1,1,0
2,0,0,1,0,0,1
3,0,1,0,0,0,1


# Check equality of columns

In [15]:
import pandas as pd

# col_1 and col_3 hold the same values; col_2 differs.
df = pd.DataFrame({'col_1': [1, 0], 'col_2': [0, 1], 'col_3': [1, 0]})

# A notebook cell only renders its final expression, so a bare comparison in
# the middle of the cell is computed and thrown away. Bind each result to a
# name so neither is lost.
col_1_equals_col_2 = df['col_1'].equals(df['col_2'])
col_1_equals_col_3 = df['col_1'].equals(df['col_3'])

# Rendered as the cell output.
col_1_equals_col_3

True

# Concatenate DataFrames

In [19]:
import pandas as pd

# Two small frames that share the same three columns.
df = pd.DataFrame(dict(col_1=[1, 0], col_2=[0, 1], col_3=[1, 0]))
df_1 = pd.DataFrame(dict(col_1=[6, 7, 8], col_2=[1, 2, 3], col_3=[5, 6, 7]))

# Stack df_1's rows under df's and renumber the combined rows from 0.
pd.concat([df, df_1], ignore_index=True)

Unnamed: 0,col_1,col_2,col_3
0,1,0,1
1,0,1,0
2,6,1,5
3,7,2,6
4,8,3,7
