In [1]:
import pandas as pd

# Read the California housing test CSV from the sample_data directory
# into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/sample_data/california_housing_test.csv')

# Preview the top five records as a quick sanity check of the columns.
print(df.head(n=5))


   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.05     37.37                27.0       3885.0           661.0   
1    -118.30     34.26                43.0       1510.0           310.0   
2    -117.81     33.78                27.0       3589.0           507.0   
3    -118.36     33.82                28.0         67.0            15.0   
4    -119.67     36.33                19.0       1241.0           244.0   

   population  households  median_income  median_house_value  
0      1537.0       606.0         6.6085            344700.0  
1       809.0       277.0         3.5990            176500.0  
2      1484.0       495.0         5.7934            270500.0  
3        49.0        11.0         6.1359            330000.0  
4       850.0       237.0         2.9375             81700.0  


In [2]:
# Display basic information about the dataset (dtypes, non-null counts,
# memory usage). DataFrame.info() writes its report to stdout itself and
# returns None, so it is called directly; wrapping it in print() would add a
# stray "None" line after the report.
df.info()

# Display summary statistics of numerical columns
print(df.describe())

# Display the number of missing values in each column
print(df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           3000 non-null   float64
 1   latitude            3000 non-null   float64
 2   housing_median_age  3000 non-null   float64
 3   total_rooms         3000 non-null   float64
 4   total_bedrooms      3000 non-null   float64
 5   population          3000 non-null   float64
 6   households          3000 non-null   float64
 7   median_income       3000 non-null   float64
 8   median_house_value  3000 non-null   float64
dtypes: float64(9)
memory usage: 211.1 KB
None
         longitude    latitude  housing_median_age   total_rooms  \
count  3000.000000  3000.00000         3000.000000   3000.000000   
mean   -119.589200    35.63539           28.845333   2599.578667   
std       1.994936     2.12967           12.555396   2155.593332   
min    -124.180000    32.56000      

In [3]:
# Tally the null entries per column and show the counts.
missing_values = df.isna().sum()
print(missing_values)

# Keep only fully populated rows, then keep the first occurrence of each
# distinct row. Both steps return new frames; `df` itself is left untouched.
df_cleaned = df.dropna(axis=0, how='any')
df_no_duplicates = df_cleaned.drop_duplicates(keep='first')

# Preview the cleaned result.
print(df_no_duplicates.head(n=5))


longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
dtype: int64
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.05     37.37                27.0       3885.0           661.0   
1    -118.30     34.26                43.0       1510.0           310.0   
2    -117.81     33.78                27.0       3589.0           507.0   
3    -118.36     33.82                28.0         67.0            15.0   
4    -119.67     36.33                19.0       1241.0           244.0   

   population  households  median_income  median_house_value  
0      1537.0       606.0         6.6085            344700.0  
1       809.0       277.0         3.5990            176500.0  
2      1484.0       495.0         5.7934            270500.0  
3        49.0        11.0         6.1359            330000.0  
4    

In [4]:
from sklearn.preprocessing import StandardScaler

# Standardize 'median_income' (StandardScaler removes the mean and scales to
# unit variance).
scaler = StandardScaler()

# Build the scaled column on a new frame via .assign() instead of assigning
# into the existing one: `df_no_duplicates` is the result of earlier filtering
# steps, and writing into such a frame can raise SettingWithCopyWarning.
# fit_transform returns a 2-D (n_rows, 1) array by default, so it is flattened
# to a single column. The name is rebound so later cells see the scaled data.
df_no_duplicates = df_no_duplicates.assign(
    median_income=scaler.fit_transform(df_no_duplicates[['median_income']]).ravel()
)

print(df_no_duplicates.head())


   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.05     37.37                27.0       3885.0           661.0   
1    -118.30     34.26                43.0       1510.0           310.0   
2    -117.81     33.78                27.0       3589.0           507.0   
3    -118.36     33.82                28.0         67.0            15.0   
4    -119.67     36.33                19.0       1241.0           244.0   

   population  households  median_income  median_house_value  
0      1537.0       606.0       1.510745            344700.0  
1       809.0       277.0      -0.112324            176500.0  
2      1484.0       495.0       1.071149            270500.0  
3        49.0        11.0       1.255865            330000.0  
4       850.0       237.0      -0.469081             81700.0  


In [5]:
# Save the cleaned dataframe to a new CSV file
df_no_duplicates.to_csv('/content/cleaned_california_housing_test.csv', index=False)

# Confirm that the file is saved
!ls /content


cleaned_california_housing_test.csv  sample_data
