# isin
The next useful function is `isin`. It is called on a column (a Series) and returns a boolean mask marking which values appear in a given collection of values, which makes it very handy for selecting specific rows.

In [1]:
# isin in python
import pandas as pd

# Sample inventory: one row per fruit.
data = {
    'Fruit': ['Apple', 'Banana', 'Cherry', 'Orange', 'Grape'],
    'Quantity': [10, 5, 15, 20, 8],
}
df = pd.DataFrame(data)

# The values we want to keep, given as a plain Python list.
fruits_to_select = ['Apple', 'Cherry', 'Grape']

# isin() yields a boolean mask (True where 'Fruit' is one of the wanted
# values); indexing with the mask keeps only those rows.
is_selected = df['Fruit'].isin(fruits_to_select)
filtered_df = df.loc[is_selected]
print(filtered_df)

    Fruit  Quantity
0   Apple        10
2  Cherry        15
4   Grape         8


In [2]:
# using Series
import pandas as pd

# Same inventory as before.
data = {
    'Fruit': ['Apple', 'Banana', 'Cherry', 'Orange', 'Grape'],
    'Quantity': [10, 5, 15, 20, 8],
}
df = pd.DataFrame(data)

# This time the wanted values live in a pandas Series instead of a list.
fruits_to_select = pd.Series(['Apple', 'Cherry', 'Grape'])

# Membership is checked against the Series' values, so the result is the
# same as with a list.
is_selected = df['Fruit'].isin(fruits_to_select)
filtered_df = df.loc[is_selected]
print(filtered_df)

    Fruit  Quantity
0   Apple        10
2  Cherry        15
4   Grape         8


In [3]:
import pandas as pd

# Inventory with an extra 'Color' column.
data = {
    'Fruit': ['Apple', 'Banana', 'Cherry', 'Orange', 'Grape'],
    'Color': ['Red', 'Yellow', 'Red', 'Orange', 'Purple'],
    'Quantity': [10, 5, 15, 20, 8],
}
df = pd.DataFrame(data)

fruits_to_select = ['Apple', 'Cherry', 'Grape']
colors_to_select = ['Red', 'Purple']

# One mask per column; a row is kept only when both masks are True (&).
fruit_matches = df['Fruit'].isin(fruits_to_select)
color_matches = df['Color'].isin(colors_to_select)
filtered_df = df[fruit_matches & color_matches]
print(filtered_df)

    Fruit   Color  Quantity
0   Apple     Red        10
2  Cherry     Red        15
4   Grape  Purple         8


In [6]:
import pandas as pd

data = {
    'Fruit': ['Apple', 'Banana', 'Cherry', 'Orange', 'Grape'],
    'Quantity': [10, 5, 15, 20, 8],
}
df = pd.DataFrame(data)

# Values we want to drop rather than keep.
fruits_to_exclude = ['Banana', 'Orange']

# ~ inverts the boolean mask, turning "is in the list" into "is not in the list".
is_excluded = df['Fruit'].isin(fruits_to_exclude)
filtered_df = df[~is_excluded]
print(filtered_df)

    Fruit  Quantity
0   Apple        10
2  Cherry        15
4   Grape         8


# drop_duplicates
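`drop_duplicates` removes repeated rows. By default it compares all columns, but you can restrict the comparison to one or more columns with `subset` and choose which occurrence to keep with `keep`.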

In [7]:
# Drop duplicates
import pandas as pd

# Rows 3 and 4 repeat rows 0 and 1 exactly.
data = {
    'Fruit': ['Apple', 'Banana', 'Cherry', 'Apple', 'Banana'],
    'Quantity': [10, 5, 15, 10, 5],
}
df = pd.DataFrame(data)

# With no arguments, rows count as duplicates only when every column matches.
df_no_duplicates = df.drop_duplicates()
print(df_no_duplicates)

    Fruit  Quantity
0   Apple        10
1  Banana         5
2  Cherry        15


In [8]:
import pandas as pd

data = {
    'Fruit': ['Apple', 'Banana', 'Cherry', 'Apple', 'Banana'],
    'Color': ['Red', 'Yellow', 'Red', 'Red', 'Yellow'],
    'Quantity': [10, 5, 15, 10, 5],
}
df = pd.DataFrame(data)

# Only the listed columns are compared when deciding whether two rows are
# duplicates; 'Quantity' is ignored for that decision.
key_columns = ['Fruit', 'Color']
df_no_duplicates = df.drop_duplicates(subset=key_columns)
print(df_no_duplicates)

    Fruit   Color  Quantity
0   Apple     Red        10
1  Banana  Yellow         5
2  Cherry     Red        15


In [9]:
import pandas as pd

data = {
    'Fruit': ['Apple', 'Banana', 'Cherry', 'Apple', 'Banana'],
    'Quantity': [10, 5, 15, 10, 5],
}
df = pd.DataFrame(data)

# keep='first': within each group of identical rows, the earliest one
# survives and the later repeats are dropped.
df_no_duplicates = df.drop_duplicates(keep='first')
print(df_no_duplicates)

    Fruit  Quantity
0   Apple        10
1  Banana         5
2  Cherry        15


In [10]:
import pandas as pd

data = {
    'Fruit': ['Apple', 'Banana', 'Cherry', 'Apple', 'Banana'],
    'Quantity': [10, 5, 15, 10, 5],
}
df = pd.DataFrame(data)

# keep='last': within each group of identical rows, the final one survives
# and the earlier copies are dropped, so the surviving index labels shift.
df_no_duplicates = df.drop_duplicates(keep='last')
print(df_no_duplicates)

    Fruit  Quantity
2  Cherry        15
3   Apple        10
4  Banana         5


# cut
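This will cut your numeric data into buckets (bins) and assign each value a label depending on the bucket it falls into. You can pass explicit bin edges, as in the examples below, or an integer to get that many equal-width bins. Pretty useful, and if you need buckets based on quantiles (roughly the same number of values in each bucket) you can use qcut.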

In [11]:
import pandas as pd

data = {'Age': [25, 35, 45, 55, 65, 75, 85]}
df = pd.DataFrame(data)

# Bin edges define three intervals, (0, 30], (30, 60] and (60, 90];
# there is one label per interval, in the same order.
bins = [0, 30, 60, 90]
labels = ['Young', 'Middle-aged', 'Senior']

# Assign each age to its interval and store the interval's label.
age_category = pd.cut(df['Age'], bins=bins, labels=labels)
df['Age_Category'] = age_category
print(df)

   Age Age_Category
0   25        Young
1   35  Middle-aged
2   45  Middle-aged
3   55  Middle-aged
4   65       Senior
5   75       Senior
6   85       Senior


In [13]:
import pandas as pd

data = {'Height (inches)': [60, 63, 67, 71, 74, 77]}
df = pd.DataFrame(data)

# Custom bin edges; without labels, each value is tagged with its interval.
custom_bins = [60, 65, 70, 75, 80]

# Intervals are open on the left by default, so 60 is not inside (60, 65]
# and comes out as NaN. Passing include_lowest=True would include it.
height_category = pd.cut(df['Height (inches)'], bins=custom_bins)
df['Height_Category'] = height_category
print(df)

   Height (inches) Height_Category
0               60             NaN
1               63    (60.0, 65.0]
2               67    (65.0, 70.0]
3               71    (70.0, 75.0]
4               74    (70.0, 75.0]
5               77    (75.0, 80.0]


In [15]:
import pandas as pd

data = {'Score': [70, 80, 90, 95, 100]}
df = pd.DataFrame(data)

# Create left-inclusive bins for the 'Score' column: with right=False each
# interval is closed on the left and open on the right, i.e. [0, 80),
# [80, 90) and [90, 100).
bins = [0, 80, 90, 100]

# A score equal to a bin edge goes into the bin that starts at that edge
# (80 -> [80, 90)); 100 is not inside [90, 100), so it becomes NaN.
df['Score_Category'] = pd.cut(df['Score'], bins=bins, right=False)
print(df)

   Score Score_Category
0     70    [0.0, 80.0)
1     80   [80.0, 90.0)
2     90  [90.0, 100.0)
3     95  [90.0, 100.0)
4    100            NaN


In [None]:
import pandas as pd

data = {'Score': [70, 80, 90, 95, 100]}
df = pd.DataFrame(data)

# Create right-inclusive bins for the 'Score' column: with right=True (the
# default) each interval is open on the left and closed on the right, i.e.
# (0, 80], (80, 90] and (90, 100].
bins = [0, 80, 90, 100]

# A score equal to a bin edge goes into the bin that ends at that edge
# (80 -> (0, 80], 100 -> (90, 100]), so no score is left unassigned here.
# Contrast with right=False, where 100 would fall outside every bin.
df['Score_Category'] = pd.cut(df['Score'], bins=bins, right=True)
print(df)

# corr
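Calculate the pairwise correlation between the columns of a DataFrame. Pretty straightforward. The last two cells in this section demonstrate `rank`, which assigns each value its rank within a column.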

In [24]:
import pandas as pd

# Three small numeric columns: B is A reversed, C loosely follows A.
data = {
    'A': [1, 2, 3, 4, 5],
    'B': [5, 4, 3, 2, 1],
    'C': [2, 3, 2, 3, 4],
}
df = pd.DataFrame(data)

# corr() returns a square table with one row and one column per DataFrame
# column; each cell holds the correlation between that pair of columns.
correlation_matrix = df.corr()
print(correlation_matrix)

          A         B         C
A  1.000000 -1.000000  0.755929
B -1.000000  1.000000 -0.755929
C  0.755929 -0.755929  1.000000


In [26]:
import pandas as pd

data = {'Score': [85, 90, 78, 92, 88]}
df = pd.DataFrame(data)

# rank() numbers the scores from smallest (1.0) to largest; the ranks come
# back as floats.
score_rank = df['Score'].rank()
df['Rank'] = score_rank
print(df)

   Score  Rank
0     85   2.0
1     90   4.0
2     78   1.0
3     92   5.0
4     88   3.0


In [27]:
import pandas as pd

# The score 90 appears twice, so the tie-breaking method matters.
data = {'Score': [85, 90, 78, 92, 90]}
df = pd.DataFrame(data)

# Output column -> tie-breaking method passed to rank():
#   'min'   : tied values all get the lowest rank of the group
#   'max'   : tied values all get the highest rank of the group
#   'first' : tied values are ranked in the order they appear
tie_methods = {
    'Rank_Min': 'min',
    'Rank_Max': 'max',
    'Rank_First': 'first',
}
for column, method in tie_methods.items():
    df[column] = df['Score'].rank(method=method)

print(df)

   Score  Rank_Min  Rank_Max  Rank_First
0     85       2.0       2.0         2.0
1     90       3.0       4.0         3.0
2     78       1.0       1.0         1.0
3     92       5.0       5.0         5.0
4     90       3.0       4.0         4.0


# rename
Rename, while not strictly needed, is a nice convenience function. You can rename columns or index labels.

In [28]:
import pandas as pd

data = {'A': [1, 2, 3], 'B': [4, 5, 6]}
df = pd.DataFrame(data)

# Old column name -> new column name ('A' becomes 'X', 'B' becomes 'Y').
# rename() returns a new DataFrame, so the result is assigned back to df.
new_column_names = {'A': 'X', 'B': 'Y'}
df = df.rename(columns=new_column_names)
print(df)

   X  Y
0  1  4
1  2  5
2  3  6


In [30]:
# A bare expression on the last line of a cell is rendered by the notebook
# as a table, instead of the plain text that print() produces.
df

Unnamed: 0,X,Y
0,1,4
1,2,5
2,3,6


In [32]:
import pandas as pd

# Start again from the original column names 'A' and 'B'.
data = {
    'A': [1, 2, 3],
    'B': [4, 5, 6],
}
df = pd.DataFrame(data)

# Bare last expression: the notebook displays the frame as a table.
df

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [33]:
# Old index label -> new index label (0, 1, 2 become 'Row 1', 'Row 2', 'Row 3').
# This cell relies on the df built in the previous cell.
new_row_labels = {0: 'Row 1', 1: 'Row 2', 2: 'Row 3'}
df = df.rename(index=new_row_labels)
print(df)

       A  B
Row 1  1  4
Row 2  2  5
Row 3  3  6


In [34]:
import pandas as pd

data = {'A': [1, 2, 3], 'B': [4, 5, 6]}
df = pd.DataFrame(data)

# Both mappings go into a single rename() call:
#   columns: 'A' -> 'X', 'B' -> 'Y'
#   index:   0, 1, 2 -> 'Row 1', 'Row 2', 'Row 3'
new_column_names = {'A': 'X', 'B': 'Y'}
new_row_labels = {0: 'Row 1', 1: 'Row 2', 2: 'Row 3'}
df = df.rename(columns=new_column_names, index=new_row_labels)
print(df)

       X  Y
Row 1  1  4
Row 2  2  5
Row 3  3  6


# itertuples
There are a couple of iterators for dataframes. I would strongly caution you not to use these unless you are really sure that you know what you are doing. They are slow compared to vectorized functions, but when working with a small dataframe they can be really useful.

In [35]:
import pandas as pd

data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
}
df = pd.DataFrame(data)

# itertuples() yields one tuple-like object per row; each column is
# available as an attribute named after the column (row.Name, row.Age).
for row in df.itertuples():
    print(f"Name: {row.Name}, Age: {row.Age}")

Name: Alice, Age: 25
Name: Bob, Age: 30
Name: Charlie, Age: 35
