In [1]:
import pandas as pd

# Create sample data with duplicates
data = {
    'name': ['Alice', 'Bob', 'Alice', 'Alice', 'David'],
    'age': [25, 30, 35, 25, 40],
    'city': ['New York', 'London', 'Paris', 'New York', 'Tokyo']
}
df = pd.DataFrame(data)

# Check for duplicates
print(f"Number of duplicate rows: {df.duplicated().sum()}")
print("Duplicate rows:")
print(df[df.duplicated()])

Number of duplicate rows: 1
Duplicate rows:
    name  age      city
3  Alice   25  New York


In [2]:
df.columns

Index(['name', 'age', 'city'], dtype='object')

In [3]:
# Check for duplicates based on specific columns
print(f"Duplicates based on name: {df.duplicated(subset=['age']).sum()}")

Duplicates based on name: 1


**IMPORTANT**

In [4]:
# Remove duplicates
df_cleaned = df.drop_duplicates()
df_cleaned_subset = df.drop_duplicates(subset=['name']) # Keeps the first occurrence

# Keep last occurrence instead of first
df_cleaned_last = df.drop_duplicates(keep='last')

In [5]:
df_cleaned

Unnamed: 0,name,age,city
0,Alice,25,New York
1,Bob,30,London
2,Alice,35,Paris
4,David,40,Tokyo


In [6]:
df_cleaned_subset

Unnamed: 0,name,age,city
0,Alice,25,New York
1,Bob,30,London
4,David,40,Tokyo


In [7]:
df_cleaned.dtypes

name    object
age      int64
city    object
dtype: object

In [8]:
df['date_column'] = ['2023-01-15','aaa-10-15', '2025-03-23', '2023-02-20', '2023-03-10']

In [9]:
# df_cleaned['date_column'] = new_df['datee']

In [10]:
# Check data types
print(df.dtypes)

# Convert data types
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df['date'] = pd.to_datetime(df['date_column'], errors='coerce')

name           object
age             int64
city           object
date_column    object
dtype: object


In [11]:
df

Unnamed: 0,name,age,city,date_column,date
0,Alice,25,New York,2023-01-15,2023-01-15
1,Bob,30,London,aaa-10-15,NaT
2,Alice,35,Paris,2025-03-23,2025-03-23
3,Alice,25,New York,2023-02-20,2023-02-20
4,David,40,Tokyo,2023-03-10,2023-03-10


In [12]:
unique_types = df['city'].apply(type).unique()
print(unique_types)

[<class 'str'>]


In [13]:
# Check for mixed types in columns
for column in df.columns:
    unique_types = df[column].apply(type).unique()
    if len(unique_types) > 1:
        print(f"Column '{column}' has mixed types: {unique_types}")

Column 'date' has mixed types: [<class 'pandas._libs.tslibs.timestamps.Timestamp'>
 <class 'pandas._libs.tslibs.nattype.NaTType'>]


In [21]:
# Create sample categorical data
categorical_data = {
    'color': ['Red', 'Blue', 'Green', 'Red', 'Blue'],
    'size': ['Small', 'Medium', 'Large', 'Medium', 'Small'],
    'rating': ['Good', 'Excellent', 'Poor', 'Good', 'Fair']
}
df_cat = pd.DataFrame(categorical_data)

In [22]:
# Label encoding (assigns integers to categories)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_cat['color_encoded'] = le.fit_transform(df_cat['color'])

In [16]:
df_cat.head()

Unnamed: 0,color,size,rating,color_encoded
0,Red,Small,Good,2
1,Blue,Medium,Excellent,0
2,Green,Large,Poor,1
3,Red,Medium,Good,2
4,Blue,Small,Fair,0


In [20]:
df_cat[['color','color_encoded']]

Unnamed: 0,color,color_encoded
0,Red,2
1,Blue,0
2,Green,1
3,Red,2
4,Blue,0


In [24]:
# One-hot encoding (creates binary columns for each category)
df_onehot = pd.get_dummies(df_cat, columns=['color', 'size'])
df_onehot

Unnamed: 0,rating,color_encoded,color_Blue,color_Green,color_Red,size_Large,size_Medium,size_Small
0,Good,2,False,False,True,False,False,True
1,Excellent,0,True,False,False,False,True,False
2,Poor,1,False,True,False,True,False,False
3,Good,2,False,False,True,False,True,False
4,Fair,0,True,False,False,False,False,True


In [None]:
df