In [None]:
# Attach Google Drive to the Colab runtime at a fixed mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Initial load-and-clean pass over the listings CSV.
# The input file is treated as read-only: the cleaned frame is written to a
# separate file so that re-running the notebook always starts from the same
# untouched input instead of from a previously cleaned copy.
file_path = '/content/drive/MyDrive/AB_NYC_2019.csv'
cleaned_file_path = '/content/drive/MyDrive/AB_NYC_2019_cleaned.csv'

# Load the dataset
df = pd.read_csv(file_path)

# Fill any missing prices with the mean of the price column.
df = df.fillna(value={'price': df['price'].mean()})

# Keep only the first row for each listing id.
df = df.drop_duplicates(subset=['id'], keep='first')

# Parse the last_review column into datetime values.
df['last_review'] = pd.to_datetime(df['last_review'])

df.to_csv(cleaned_file_path, index=False)

# Print summary information
print("Data cleaning completed.")
print("Summary Information:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()


Data cleaning completed.
Summary Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              48895 non-null  int64         
 1   name                            48879 non-null  object        
 2   host_id                         48895 non-null  int64         
 3   host_name                       48874 non-null  object        
 4   neighbourhood_group             48895 non-null  object        
 5   neighbourhood                   48895 non-null  object        
 6   latitude                        48895 non-null  float64       
 7   longitude                       48895 non-null  float64       
 8   room_type                       48895 non-null  object        
 9   price                           48895 non-null  int64         
 10  minimum_nights          

In [None]:
# Missing Data Handling: Dealing with missing values by either imputing them or making informed decisions on how to handle gaps in the dataset

In [None]:
# Report the number of missing values in each column before handling them.
missing_values = df.isnull().sum()
print("Missing Values:")
print(missing_values)

# Fill missing prices with the column mean. The result is assigned back to the
# column: calling fillna(inplace=True) on df['price'] operates on an
# intermediate Series, which pandas has deprecated and which does not update
# df when Copy-on-Write is enabled.
mean_price = df['price'].mean()
df['price'] = df['price'].fillna(mean_price)

# Discard every row that has no last_review value.
# NOTE(review): in the recorded run this removed roughly 10,000 of 48,895 rows;
# presumably these are listings that were never reviewed - confirm that
# dropping them (rather than keeping them with a missing date) is intended.
df = df.dropna(subset=['last_review'])

# Save to a dedicated output file so the raw input CSV is never overwritten.
cleaned_file_path = '/content/drive/MyDrive/AB_NYC_2019_cleaned.csv'
df.to_csv(cleaned_file_path, index=False)

# Print summary information
print("Missing data handling completed.")
print("Summary Information:")
# info() prints directly and returns None; do not wrap it in print().
df.info()

Missing Values:
id                                 0
name                               6
host_id                            0
host_name                         16
neighbourhood_group                0
neighbourhood                      0
latitude                           0
longitude                          0
room_type                          0
price                              0
minimum_nights                     0
number_of_reviews                  0
last_review                        0
reviews_per_month                  0
calculated_host_listings_count     0
availability_365                   0
dtype: int64
Missing data handling completed.
Summary Information:
<class 'pandas.core.frame.DataFrame'>
Index: 38843 entries, 0 to 48852
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              38843 non-null  int64         
 1   name        

In [None]:
# Duplicate Removal: Identifying and eliminating duplicate records to maintain data uniqueness

In [None]:
# Show every row whose id repeats an earlier row (the first occurrence is not listed).
duplicate_rows = df[df.duplicated(subset=['id'], keep='first')]  # Change 'id' to the column(s) you want to check for duplicates
print("Duplicate Records:")
print(duplicate_rows)

# Remove duplicate records, keeping the first occurrence of each id.
df = df.drop_duplicates(subset=['id'], keep='first')  # Change 'id' to the column(s) you want to remove duplicates based on

# Save to a dedicated output file so the raw input CSV is never overwritten.
cleaned_file_path = '/content/drive/MyDrive/AB_NYC_2019_cleaned.csv'
df.to_csv(cleaned_file_path, index=False)

# Print summary information
print("Duplicate removal completed.")
print("Summary Information:")
# info() prints directly and returns None; do not wrap it in print().
df.info()


Duplicate Records:
Empty DataFrame
Columns: [id, name, host_id, host_name, neighbourhood_group, neighbourhood, latitude, longitude, room_type, price, minimum_nights, number_of_reviews, last_review, reviews_per_month, calculated_host_listings_count, availability_365]
Index: []
Duplicate removal completed.
Summary Information:
<class 'pandas.core.frame.DataFrame'>
Index: 38843 entries, 0 to 48852
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              38843 non-null  int64         
 1   name                            38837 non-null  object        
 2   host_id                         38843 non-null  int64         
 3   host_name                       38827 non-null  object        
 4   neighbourhood_group             38843 non-null  object        
 5   neighbourhood                   38843 non-null  object        
 6   latitude            

In [None]:
# Standardization: Consistent formatting and units across the dataset for accurate analysis.

In [None]:
# Rescale every price by a fixed factor.
# NOTE(review): 0.84 looks like a currency conversion rate, but nothing in the
# notebook states the source or target unit - confirm and document it.
PRICE_SCALE_FACTOR = 0.84

# Caution: this rescales whatever is currently in df['price'], so running the
# cell a second time in the same kernel applies the factor again.
df['price'] = df['price'] * PRICE_SCALE_FACTOR

# Normalize an optional 'date' column to datetime when one is present.
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
else:
    print("Column 'date' not found in the DataFrame.")

# Save to a dedicated output file. Writing the rescaled prices over the raw
# input CSV would make each fresh run of the notebook load already-scaled
# prices and scale them once more.
cleaned_file_path = '/content/drive/MyDrive/AB_NYC_2019_cleaned.csv'
df.to_csv(cleaned_file_path, index=False)

# Print summary information
print("Standardization completed.")
print("Summary Information:")
# info() prints directly and returns None; do not wrap it in print().
df.info()


Column 'date' not found in the DataFrame.
Standardization completed.
Summary Information:
<class 'pandas.core.frame.DataFrame'>
Index: 38843 entries, 0 to 48852
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              38843 non-null  int64         
 1   name                            38837 non-null  object        
 2   host_id                         38843 non-null  int64         
 3   host_name                       38827 non-null  object        
 4   neighbourhood_group             38843 non-null  object        
 5   neighbourhood                   38843 non-null  object        
 6   latitude                        38843 non-null  float64       
 7   longitude                       38843 non-null  float64       
 8   room_type                       38843 non-null  object        
 9   price                           38843 non-null  float

In [None]:
# Outlier Detection: Identifying and addressing outliers that may skew analysis or model performance

In [None]:
import numpy as np

# Flag rows that contain at least one numeric value lying more than
# `outlier_threshold` standard deviations away from its column mean.
outlier_threshold = 3

# 'id' and 'host_id' are numeric identifiers, not measurements, so a z-score on
# them only flags rows for having a large id. They are left out of the check;
# errors='ignore' keeps this working if either column is absent.
numeric_cols = (
    df.select_dtypes(include=np.number)
    .columns
    .drop(['id', 'host_id'], errors='ignore')
)

# Column-wise z-score: (value - column mean) / column standard deviation.
numeric_values = df[numeric_cols]
z_scores = (numeric_values - numeric_values.mean()) / numeric_values.std()

# Compare the absolute z-score so that unusually low values are caught as well
# as unusually high ones.
outliers = df[(z_scores.abs() > outlier_threshold).any(axis=1)]

print("Outliers Identified:")
print(outliers)

print("Outlier detection completed.")
print("Summary Information:")
df.info()

Outliers Identified:
             id                                               name    host_id  \
3          3831                    Cozy Entire Floor of Brownstone       4869   
7          5178                   Large Furnished Room Near B'way        8967   
11         5441                    Central Manhattan/near Broadway       7989   
14         6090                      West Village Nest - Superhost      11975   
16         7097                  Perfect for Your Parents + Garden      17571   
...         ...                                                ...        ...   
48033  36041232          Nice house room 2 near van cortlandt park  230720704   
48126  36101190                                   Flushing Hideout   30839692   
48127  36101396                            7 minutes away from JFK   83974928   
48293  36182136                            House by the beach side  234781729   
48526  36309284  Private Room Near JFK, St John's Hospital, & B...  234090781   

      