In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Preprocessing pipeline for 'housing.csv':
#   1. clean   - impute missing longitude, drop incomplete rows and duplicates
#   2. merge   - inner-join the raw housing rows with 'new.csv' on longitude
#   3. encode  - one-hot encode 'ocean_proximity', standardize numeric columns
#   4. save    - write the result to 'processed_data.csv'

# Step 1: Data Cleaning
# Read the file once; the raw frame is reused for the merge in Step 2.
df_housing = pd.read_csv('housing.csv')
data = df_housing.copy()

# Impute BEFORE dropping rows: once dropna() has run there are no missing
# values left, so a later imputation could never fill anything.
# fit_transform returns an (n, 1) array; ravel() flattens it to one column.
imputer = SimpleImputer(strategy='mean')
data['longitude'] = imputer.fit_transform(data[['longitude']]).ravel()

# Drop rows that still have a missing value in any column, then exact
# duplicate rows.
data = data.dropna().drop_duplicates()

# Step 2: Data Integration (if necessary)
# The merge uses the raw (uncleaned) housing rows and is only printed; it
# does not feed into the transformed output below.
# NOTE(review): 'longitude' is a float and is not unique per row, so this
# inner join pairs every housing row with every 'new.csv' row sharing the
# same longitude (many-to-many). Confirm this is the intended join key.
df_additional = pd.read_csv('new.csv')
merged_df = pd.merge(df_housing, df_additional, on='longitude', how='inner')
print('Merged datasets:\n', merged_df)

# Step 3: Data Transformation
# One-hot encode the categorical column; drop_first=True omits the first
# category level so the dummy columns are not perfectly collinear.
categorical_cols = ['ocean_proximity']
data_transformed = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Standardize the numeric feature columns (zero mean, unit variance).
# Columns not listed here are left unscaled.
numerical_cols = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]
scaler = StandardScaler()
data_transformed[numerical_cols] = scaler.fit_transform(data_transformed[numerical_cols])

# Save the processed data to a new CSV file
data_transformed.to_csv('processed_data.csv', index=False)
print(data_transformed)

Merged datasets:
     longitude  latitude_x  housing_median_age_x  total_rooms_x  \
0     -122.23       37.88                  41.0          880.0   
1     -122.23       37.84                  50.0         2515.0   
2     -122.23       37.84                  47.0         3175.0   
3     -122.23       37.85                  52.0         2800.0   
4     -122.23       37.80                  52.0         2033.0   
..        ...         ...                   ...            ...   
67    -122.23       40.20                  17.0          762.0   
68    -122.23       40.17                  21.0         1401.0   
69    -122.23       40.15                  14.0         2297.0   
70    -122.23       39.95                  21.0         2087.0   
71    -122.23       39.86                  21.0         1730.0   

    total_bedrooms_x  population_x  households_x  median_income_x  \
0              129.0         322.0         126.0           8.3252   
1              399.0         970.0         373.0   