In [9]:
# prompt: # Part 1: Detect & Handle Missing Data
# # Task 1: Detect Missing Data
# #     1. Load the Data:
# #     2. Detect Missing Data:
# # Task 2: Handle Missing Data by Dropping
# #     1. Drop Rows with Missing Values:
# # Task 3: Handle Missing Data by Imputation
# #     1. Fill Missing Values:

import pandas as pd
import numpy as np

# Part 1: Detect & Handle Missing Data

# Task 1: Detect Missing Data

# 1. Load the Data:
# Read 'your_data.csv' from the working directory. When that file does not
# exist, fall back to a small in-memory frame so the rest of the cell still runs.
try:
    data = pd.read_csv('your_data.csv')
except FileNotFoundError:
    print("File not found. Using sample data.")
    # Each sample column carries exactly one NaN, each in a different row.
    data = pd.DataFrame({
        'A': [1, 2, np.nan, 4, 5],
        'B': [6, np.nan, 8, 9, 10],
        'C': [11, 12, 13, np.nan, 15],
    })

# 2. Detect Missing Data:
# Per-column count of missing cells (isna is the same check as isnull).
missing_data = data.isna().sum()
print("Missing Values:\n", missing_data)


# Task 2: Handle Missing Data by Dropping

# 1. Drop Rows with Missing Values:
# Keep only the rows in which every column is populated. dropna returns a new
# frame, so `data` itself is left as it was for the imputation steps below.
data_dropped = data.dropna(axis=0, how='any')
print("\nDataFrame after dropping rows with missing values:\n", data_dropped)


# Task 3: Handle Missing Data by Imputation

# 1. Fill Missing Values:
# Mean imputation: work on a copy of `data` and replace the gaps in every
# numeric column with that column's own mean.
numerical_cols = data.select_dtypes(include=np.number).columns
data_filled_mean = data.copy()
for col in numerical_cols:
    column_values = data_filled_mean[col]
    data_filled_mean[col] = column_values.fillna(column_values.mean())

print("\nDataFrame after filling missing numerical values with mean:\n", data_filled_mean)

# Fill missing categorical values with the mode
# Works on a fresh copy of `data`: every non-numeric column has its gaps
# replaced by that column's most frequent value. Numeric columns are not
# touched by this step, so they can still contain NaN in the result.
categorical_cols = data.select_dtypes(exclude=np.number).columns
data_filled_mode = data.copy()
for col in categorical_cols:
    column_modes = data_filled_mode[col].mode()
    if column_modes.empty:
        # mode() returns an empty Series when the column has no non-missing
        # values; indexing it with [0] would raise, and there is nothing to
        # impute with, so the column is left as-is.
        continue
    # On ties mode() returns several values; the first one is used.
    data_filled_mode[col] = data_filled_mode[col].fillna(column_modes.iloc[0])

print("\nDataFrame after filling missing categorical values with mode:\n", data_filled_mode)


File not found. Using sample data.
Missing Values:
 A    1
B    1
C    1
dtype: int64

DataFrame after dropping rows with missing values:
      A     B     C
0  1.0   6.0  11.0
4  5.0  10.0  15.0

DataFrame after filling missing numerical values with mean:
      A      B      C
0  1.0   6.00  11.00
1  2.0   8.25  12.00
2  3.0   8.00  13.00
3  4.0   9.00  12.75
4  5.0  10.00  15.00

DataFrame after filling missing categorical values with mode:
      A     B     C
0  1.0   6.0  11.0
1  2.0   NaN  12.0
2  NaN   8.0  13.0
3  4.0   9.0   NaN
4  5.0  10.0  15.0


In [10]:
# prompt: # Part 2: Remove Duplicates & Fix Data Types
# # Task 1: Remove Duplicates
# #     1. Load Extended Data:
# #     2. Remove Duplicates:
# # Task 2: Fix Data Types
# #     1. Fix Incorrect Data Types:
# # Task 3: Convert Data Type for Analysis
# #     1. Convert Date Strings to DateTime:

import pandas as pd
# Part 2: Remove Duplicates & Fix Data Types

# Task 1: Remove Duplicates

# 1. Load Extended Data
# Start from an independent copy of data_filled_mean (the mean-imputed frame
# built in Part 1); replace it with your actual extended data if needed.
extended_data = data_filled_mean.copy(deep=True)

# 2. Remove Duplicates
# Rows that repeat an earlier row in every column are dropped; the first
# occurrence is the one that is kept.
data_no_duplicates = extended_data.drop_duplicates(keep='first')
print("\nDataFrame after removing duplicates:\n", data_no_duplicates)


# Task 2: Fix Data Types

# 1. Fix Incorrect Data Types (Illustrative example, adapt to your data)
# Assuming 'A' should be an integer, and 'C' is a string.
# Only columns that actually exist are converted. The casts are collected and
# applied through one DataFrame.astype call that rebinds the name, instead of
# assigning columns into the frame returned by drop_duplicates(), which can
# emit SettingWithCopyWarning when duplicate rows were actually removed.
# NOTE: astype(int) truncates any fractional part and raises on NaN, so 'A'
# must already hold complete, whole-number values at this point.
dtype_fixes = {}
if 'A' in data_no_duplicates.columns:
    dtype_fixes['A'] = int
if 'C' in data_no_duplicates.columns:
    dtype_fixes['C'] = str
if dtype_fixes:
    data_no_duplicates = data_no_duplicates.astype(dtype_fixes)


print("\nDataFrame after fixing data types:\n", data_no_duplicates)


# Task 3: Convert Data Type for Analysis

# 1. Convert Date Strings to DateTime (Illustrative example, adapt to your data)
# Assuming a column named 'Date' contains date strings in 'YYYY-MM-DD' format.
# The membership test is the only guard needed: with errors='coerce', values
# that do not parse become NaT instead of raising, so no try/except is used.
if 'Date' in data_no_duplicates.columns:
    raw_dates = data_no_duplicates['Date']
    parsed_dates = pd.to_datetime(raw_dates, format='%Y-%m-%d', errors='coerce')
    # assign() returns a new frame, so nothing is written into a possible
    # copy of an earlier frame.
    data_no_duplicates = data_no_duplicates.assign(Date=parsed_dates)
    print("\nDataFrame after converting 'Date' column to datetime:\n", data_no_duplicates)

    # Surface values that were present but could not be parsed, so the
    # coercion to NaT does not go unnoticed.
    unparsed_count = int((parsed_dates.isna() & raw_dates.notna()).sum())
    if unparsed_count:
        print(f"\n{unparsed_count} 'Date' value(s) did not match YYYY-MM-DD and were set to NaT.")
else:
    print("\n'Date' column not found in the DataFrame.")



DataFrame after removing duplicates:
      A      B      C
0  1.0   6.00  11.00
1  2.0   8.25  12.00
2  3.0   8.00  13.00
3  4.0   9.00  12.75
4  5.0  10.00  15.00

DataFrame after fixing data types:
    A      B      C
0  1   6.00   11.0
1  2   8.25   12.0
2  3   8.00   13.0
3  4   9.00  12.75
4  5  10.00   15.0

'Date' column not found in the DataFrame.
