In [None]:

import pandas as pd
import numpy as np
# Demo script: write a small sample table to CSV, reload it, inspect and
# remove duplicate rows, then normalise the numeric columns' dtypes.

# Sample records. 'Age' and 'Salary' mix strings and numbers, and rows 4-5
# repeat rows 0-1.
data = {
    'Name': ['John', 'Anna', 'Peter', 'Linda', 'John', 'Anna'],
    'Age': ['28', 34, '42', 41, '28', 34],
    'City': ['New York', 'Seattle', 'Boston', 'Chicago', 'New York', 'Seattle'],
    'Salary': [75000, 65000, 80000, 92000, 75000, '65000'],
}
df = pd.DataFrame(data)
df.to_csv('extended_data.csv', index=False)
print("Extended CSV file created with the following data:")
print(df)
print("\n")

# Reload from disk; read_csv infers each column's dtype from the file text.
df = pd.read_csv('extended_data.csv')
print("Data loaded from CSV:")
print(df)
print("\n")

# Count and show fully duplicated rows (every column equal to an earlier row).
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")
print("\n")
print("Duplicate rows:")
print(df[df.duplicated(keep='first')])
print("\n")

# Take an explicit copy: columns of this frame are reassigned below, and
# assigning into the un-copied result of drop_duplicates() makes pandas emit
# SettingWithCopyWarning.
df_no_duplicates = df.drop_duplicates(keep='first').copy()
print("DataFrame after removing duplicates (keeping first occurrence):")
print(df_no_duplicates)
print("\n")

# Same de-duplication, but comparing only the 'Name' and 'City' columns.
print("Removing duplicates based only on 'Name' and 'City' columns:")
df_subset_no_duplicates = df.drop_duplicates(subset=['Name', 'City'], keep='first')
print(df_subset_no_duplicates)
print("\n")

print("Current data types:")
print(df_no_duplicates.dtypes)
print("\n")

# Convert 'Age' to int; if a value cannot be cast directly, fall back to
# to_numeric, which turns unparseable entries into NaN instead of raising.
try:
    df_no_duplicates['Age'] = df_no_duplicates['Age'].astype(int)
except (ValueError, TypeError):
    print("Error converting Age directly. Using alternative method...")
    df_no_duplicates['Age'] = pd.to_numeric(df_no_duplicates['Age'], errors='coerce')

# Convert 'Salary' to float with the same fallback strategy.
try:
    df_no_duplicates['Salary'] = df_no_duplicates['Salary'].astype(float)
except (ValueError, TypeError):
    print("Error converting Salary directly. Using alternative method...")
    df_no_duplicates['Salary'] = pd.to_numeric(df_no_duplicates['Salary'], errors='coerce')

print("DataFrame with corrected data types:")
print(df_no_duplicates)
print("\n")
print("Updated data types:")
print(df_no_duplicates.dtypes)
print("\n")
print("Summary statistics after type conversion:")
print(df_no_duplicates.describe())

Extended CSV file created with the following data:
    Name Age      City Salary
0   John  28  New York  75000
1   Anna  34   Seattle  65000
2  Peter  42    Boston  80000
3  Linda  41   Chicago  92000
4   John  28  New York  75000
5   Anna  34   Seattle  65000


Data loaded from CSV:
    Name  Age      City  Salary
0   John   28  New York   75000
1   Anna   34   Seattle   65000
2  Peter   42    Boston   80000
3  Linda   41   Chicago   92000
4   John   28  New York   75000
5   Anna   34   Seattle   65000


Number of duplicate rows: 2


Duplicate rows:
   Name  Age      City  Salary
4  John   28  New York   75000
5  Anna   34   Seattle   65000


DataFrame after removing duplicates (keeping first occurrence):
    Name  Age      City  Salary
0   John   28  New York   75000
1   Anna   34   Seattle   65000
2  Peter   42    Boston   80000
3  Linda   41   Chicago   92000


Removing duplicates based only on 'Name' and 'City' columns:
    Name  Age      City  Salary
0   John   28  New York   75000
1   Anna   34   Seattle   65000
2  Peter   42    Boston   80000
3  Linda   41   Chicago   92000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_duplicates['Age'] = df_no_duplicates['Age'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_duplicates['Salary'] = df_no_duplicates['Salary'].astype(float)
