In [None]:
import pandas as pd
import numpy as np

# --- Task 1: Remove Duplicates ---
# 1. Load Extended Data:
# Rows 2 and 5 repeat rows 1 and 4; the last row holds only missing values.
data_extended = {
    'col1': [1, 2, 2, 3, 4, 4, np.nan],
    'col2': ['A', 'B', 'B', 'C', 'D', 'D', np.nan],
    'col3': [10.0, 20.0, 20.0, 30.0, 40.0, 40.0, np.nan],
}
df_extended = pd.DataFrame(data=data_extended)
print("Original Extended DataFrame:", df_extended, sep="\n")

# 2. Remove Duplicates:
print("\n--- Removing Duplicates ---")

# Flag every row that repeats an earlier row across all columns, then
# select just those rows for display.
is_repeat = df_extended.duplicated()
duplicate_rows = df_extended.loc[is_repeat]
print("\nDuplicate rows:", duplicate_rows, sep="\n")

# Keep the first occurrence of each distinct row and drop the repeats.
df_no_duplicates = df_extended.drop_duplicates(keep='first')
print("\nDataFrame after removing duplicates:", df_no_duplicates, sep="\n")
print(
    f"Shape of original DataFrame: {df_extended.shape}",
    f"Shape of DataFrame after removing duplicates: {df_no_duplicates.shape}",
    sep="\n",
)

# Same clean-up, but two rows count as duplicates as soon as they agree on
# the key columns below, whatever the remaining columns contain.
key_columns = ['col1', 'col2']
df_no_duplicates_subset = df_extended.drop_duplicates(subset=key_columns, keep='first')
print("\nDataFrame after removing duplicates based on 'col1' and 'col2':")
print(df_no_duplicates_subset)

print("-" * 30)

# --- Task 2: Fix Data Types ---
# 1. Fix Incorrect Data Types:
print("\n--- Fixing Data Types ---")

# Every column stores its values as strings even though they really
# represent integers, floats and booleans.
data_types = {'col_int_str': ['1', '2', '3', '4'],
              'col_float_str': ['1.1', '2.2', '3.3', '4.4'],
              'col_bool_str': ['True', 'False', 'TRUE', 'FALSE']}
df_types = pd.DataFrame(data_types)
print("\nDataFrame with incorrect data types:")
print(df_types.dtypes)
print(df_types)

# Convert 'col_int_str' to integer.
# errors='coerce' turns unparseable entries into NaN instead of raising.
# downcast='integer' then picks the smallest signed integer dtype that holds
# the parsed values (int8 for this data), so widen the column before doing
# arithmetic that could exceed that range.
df_types['col_int'] = pd.to_numeric(df_types['col_int_str'], errors='coerce', downcast='integer')
print("\nDataFrame after converting 'col_int_str' to integer:")
print(df_types.dtypes)
print(df_types)

# Convert 'col_float_str' to float (unparseable entries become NaN).
df_types['col_float'] = pd.to_numeric(df_types['col_float_str'], errors='coerce')
print("\nDataFrame after converting 'col_float_str' to float:")
print(df_types.dtypes)
print(df_types)


def parse_bool_strings(values):
    """Convert a Series of 'true'/'false' strings to nullable booleans.

    Matching ignores letter case and surrounding whitespace. Any other
    entry, including a missing value, becomes <NA> rather than being
    silently read as False, which mirrors how errors='coerce' treats bad
    input in the numeric conversions.

    Args:
        values: Series of strings to interpret.

    Returns:
        Series with pandas' nullable 'boolean' dtype, same index as `values`.
    """
    normalized = values.str.strip().str.lower()
    return normalized.map({'true': True, 'false': False}).astype('boolean')


# Convert 'col_bool_str' to boolean (case-insensitive).
# A plain `== 'true'` comparison would label every unrecognised or missing
# entry as False; the explicit mapping keeps such entries visible as <NA>.
df_types['col_bool'] = parse_bool_strings(df_types['col_bool_str'])
print("\nDataFrame after converting 'col_bool_str' to boolean:")
print(df_types.dtypes)
print(df_types)

print("-" * 30)

# --- Task 3: Convert Data Type for Analysis ---
# 1. Convert Date Strings to DateTime:
print("\n--- Converting Date Strings to DateTime ---")

# The sample deliberately mixes four different date layouts in one column.
data_dates_str = {'date_str': ['2023-01-15', '2023/02/20', '03-10-2024', '2024.04.05']}
df_dates_str = pd.DataFrame(data_dates_str)
print("\nDataFrame with date strings:")
print(df_dates_str)
print(df_dates_str.dtypes)

# Convert 'date_str' to datetime objects.
# A single to_datetime call is not reliable for mixed layouts: pandas 2.x
# infers one format from the first value and, with errors='coerce', turns
# every value written in a different layout into NaT. Each known layout is
# therefore parsed explicitly and the per-format results are merged, which
# gives the same answer regardless of the installed pandas version.
# '03-10-2024' is read month-first (10 March 2024), matching pandas'
# default of dayfirst=False.
known_date_formats = ['%Y-%m-%d', '%Y/%m/%d', '%m-%d-%Y', '%Y.%m.%d']
parsed_per_format = [
    pd.to_datetime(df_dates_str['date_str'], format=date_format, errors='coerce')
    for date_format in known_date_formats
]
df_dates_datetime = parsed_per_format[0]
for candidate in parsed_per_format[1:]:
    # Only fill positions that no earlier format managed to parse.
    df_dates_datetime = df_dates_datetime.fillna(candidate)
print("\nSeries after converting 'date_str' to datetime:")
print(df_dates_datetime)
print(df_dates_datetime.dtypes)

# Convert with a single specific format for consistency.
# Values written in any other layout are expected to come back as NaT
# because of errors='coerce'.
df_dates_datetime_format = pd.to_datetime(df_dates_str['date_str'], format='%Y-%m-%d', errors='coerce')
print("\nSeries after converting 'date_str' with specific format '%Y-%m-%d':")
print(df_dates_datetime_format)

print("-" * 30)

Original Extended DataFrame:
   col1 col2  col3
0   1.0    A  10.0
1   2.0    B  20.0
2   2.0    B  20.0
3   3.0    C  30.0
4   4.0    D  40.0
5   4.0    D  40.0
6   NaN  NaN   NaN

--- Removing Duplicates ---

Duplicate rows:
   col1 col2  col3
2   2.0    B  20.0
5   4.0    D  40.0

DataFrame after removing duplicates:
   col1 col2  col3
0   1.0    A  10.0
1   2.0    B  20.0
3   3.0    C  30.0
4   4.0    D  40.0
6   NaN  NaN   NaN
Shape of original DataFrame: (7, 3)
Shape of DataFrame after removing duplicates: (5, 3)

DataFrame after removing duplicates based on 'col1' and 'col2':
   col1 col2  col3
0   1.0    A  10.0
1   2.0    B  20.0
3   3.0    C  30.0
4   4.0    D  40.0
6   NaN  NaN   NaN
------------------------------

--- Fixing Data Types ---

DataFrame with incorrect data types:
col_int_str      object
col_float_str    object
col_bool_str     object
dtype: object
  col_int_str col_float_str col_bool_str
0           1           1.1         True
1           2           2.2     