In [1]:
import pandas as pd

# Load the NYC Airbnb listings CSV, run basic validation checks, fill missing
# values, drop duplicate rows, report price outliers, and save the cleaned copy.
# NOTE(review): both paths below are absolute, machine-specific locations;
# adjust them before running this cell elsewhere.
file_path = r'C:\Users\vishw\Downloads\archive (3)\AB_NYC_2019.csv'  # Raw string notation to handle backslashes
cleaned_file_path = r'C:\Users\vishw\Downloads\archive (3)\cleaned_AB_NYC_2019.csv'


def count_unparseable_dates(raw, parsed):
    """Count values that were present in ``raw`` but are missing in ``parsed``.

    Args:
        raw: Series holding the original (unparsed) date values.
        parsed: Series obtained from ``raw`` via
            ``pd.to_datetime(raw, errors='coerce')``.

    Returns:
        int: number of entries that had a value but could not be parsed.
        Entries that were already missing in ``raw`` are not counted, so
        plain missing data is not reported as a format error.
    """
    return int((raw.notna() & parsed.isna()).sum())


def fill_missing_values(frame):
    """Return a copy of ``frame`` with missing values filled in.

    Numeric columns are filled with their column mean and object (text)
    columns with their most frequent value. Columns of any other dtype
    (for example datetime) are left untouched, and so is an object column
    that has no non-missing value at all (it has no mode to fill with).

    Args:
        frame: DataFrame to fill; it is not modified.

    Returns:
        DataFrame: a new frame with the fills applied.
    """
    # numeric_only=True is required: without it pandas 2.x raises TypeError
    # as soon as the frame contains a text column.
    filled = frame.fillna(frame.mean(numeric_only=True))
    for column in filled.select_dtypes(include=['object']).columns:
        modes = filled[column].mode()
        if modes.empty:
            continue
        # Assign the result back instead of calling fillna(inplace=True) on
        # a column selection, which may only modify a temporary copy.
        filled[column] = filled[column].fillna(modes.iloc[0])
    return filled


print(f"Attempting to load file from: {file_path}")

try:
    df = pd.read_csv(file_path)
    print("Initial dataset info:")
    df.info()  # info() prints the report itself and returns None

    # Data Type Checks
    print("\nData Types:")
    print(df.dtypes)

    # Range Checks (Example for a numerical column 'price')
    if df['price'].min() < 0:
        print("Error: Negative values found in the 'price' column.")

    # Format Checks (Example for a date column 'last_review')
    if 'last_review' in df.columns:
        raw_last_review = df['last_review']
        df['last_review'] = pd.to_datetime(raw_last_review, errors='coerce')
        # Only values that existed but failed to parse are format errors.
        if count_unparseable_dates(raw_last_review, df['last_review']) > 0:
            print("Error: Invalid date format found in the 'last_review' column.")

    # Handle Missing Values
    missing_values = df.isnull().sum()
    print("\nMissing Values:\n", missing_values)

    # Fill missing values with mean for numerical columns and mode for categorical columns
    df = fill_missing_values(df)

    # Remove Duplicates
    df = df.drop_duplicates()

    # Outlier Detection (Example for a numerical column 'price')
    q1 = df['price'].quantile(0.25)
    q3 = df['price'].quantile(0.75)
    iqr = q3 - q1
    # Rows outside the 1.5 * IQR fences around the quartiles.
    outliers = df[(df['price'] < (q1 - 1.5 * iqr)) | (df['price'] > (q3 + 1.5 * iqr))]
    print(f"\nOutliers in 'price':\n{outliers}")

    # Consistency Checks (Example for a column 'neighbourhood_group')
    # na=False keeps any still-missing entry from putting NaN into the mask.
    if df['neighbourhood_group'].str.contains('[^a-zA-Z ]', na=False).any():
        print("Error: Non-alphabetic characters found in the 'neighbourhood_group' column.")

    # Save Cleaned Data
    df.to_csv(cleaned_file_path, index=False)

    print("Data cleaning complete. Cleaned data saved to:", cleaned_file_path)

except FileNotFoundError:
    print(f"File not found: {file_path}. Please check the file path and try again.")
except ValueError as ve:
    print(f"ValueError: {ve}")
except Exception as e:
    print(f"An error occurred: {e}")


Attempting to load file from: C:\Users\vishw\Downloads\archive (3)\AB_NYC_2019.csv
Initial dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  i