In [8]:
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [9]:
from sklearn.datasets import fetch_california_housing

In [11]:
# Load the California housing dataset bundled with scikit-learn.
data = fetch_california_housing()


In [12]:
# Print the dataset's bundled description (attributes, source, target units).
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [13]:
# Empty placeholder frame; `df` is rebuilt from the dataset arrays further down.
df = pd.DataFrame()

In [18]:
# Peek at the raw feature matrix (NumPy abbreviates the display).
data.data

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [20]:
# Build the feature table: one column per dataset feature name.
df = pd.DataFrame(data.data, columns=data.feature_names)
df.head()


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [22]:
# Attach the regression target: median house value per block group,
# expressed in units of $100,000 (see the dataset description above).
df["House_price"] = data.target
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,House_price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


Exploratory Data Analysis (EDA)

In [25]:
!pip install sweetviz




[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [26]:
import sweetviz as sv

In [27]:
# Profile every column of the frame, then write the result as a standalone HTML report.
report = sv.analyze(df)
report.show_html("./report.html")

                                             |          | [  0%]   00:00 -> (? left)

Report ./report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


Data Pre-processing

In [29]:
!pip install geopy
from geopy.geocoders import Nominatim


[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [30]:
# Nominatim client used for reverse geocoding; the user agent names this application.
geolocator = Nominatim(user_agent="House_price_prediction_system")

# Accumulator for reverse-geocoding results: one list per address field.
loc_update = dict(County=[], Road=[], Neighbourhood=[])

In [31]:
# Sanity check: reverse-geocode the coordinates of the first row and inspect the address fields.
geolocator.reverse("37.88 , -122.23").raw["address"]

{'road': 'Convict Trail',
 'city': 'Oakland',
 'county': 'Alameda County',
 'state': 'California',
 'ISO3166-2-lvl4': 'US-CA',
 'postcode': '94720',
 'country': 'United States',
 'country_code': 'us'}

In [None]:
# def location(cord):
    
#     try:
#         Latitude=str(cord[0])
#         Longitude=str(cord[1])
    
#         location=geolocator.reverse(Latitude +" , "+ Longitude).raw['address']
    
#         location = geolocator.reverse(Latitude + "," + Longitude).raw['address']
#         road = location.get('road', '')  # Get 'road' with empty default
#         county = location.get('county', '')  # Get 'county' with empty default
#         neighbourhood = location.get('neighbourhood', '')

#         loc_update['County'].append(county)
#         loc_update['Road'].append(road)
#         loc_update['Neighbourhood'].append(neighbourhood)

#     except Exception as e:
#         print(f"Error retrieving location: {e}")
#         loc_update['County'].append(None)
#         loc_update['Road'].append(None)
#         loc_update['Neighbourhood'].append(None)

 # print(f"Location found: {location}") #add this line to check location value.
 #    except Exception as e:
 #        print(f"Error geocoding {Latitude}, {Longitude}: {e}")
 #        loc_update['County'].append(None) #add none values to keep lists the same length.
 #        loc_update['Road'].append(None) #add none values to keep lists the same length.
def location(cord):
    """Reverse-geocode one coordinate pair and record its address fields.

    Looks up ``cord`` with the module-level ``geolocator`` and appends the
    county, road and neighbourhood of the result to the matching lists of the
    module-level ``loc_update`` dict. Exactly one value is appended to each
    list per call, so the three lists always have the same length.

    Parameters
    ----------
    cord : sequence
        ``cord[0]`` is the latitude and ``cord[1]`` the longitude.

    Notes
    -----
    A field that is absent from the response, or a lookup that yields no
    result at all, is recorded as ``None``. Exceptions raised by the geocoder
    (e.g. timeouts) are not caught here and propagate to the caller.
    """
    latitude = str(cord[0])
    longitude = str(cord[1])
    result = geolocator.reverse(latitude + "," + longitude, timeout=5)

    # reverse() returns None when the service finds nothing for the point.
    address = result.raw.get('address', {}) if result is not None else {}

    # Response keys are lower-case ('road', 'county', ...), as the sample
    # lookup earlier in the notebook shows; .get() yields None when absent.
    loc_update['County'].append(address.get('county'))
    loc_update['Road'].append(address.get('road'))
    loc_update['Neighbourhood'].append(address.get('neighbourhood'))


In [None]:
# for i, cord in enumerate(df[['Latitude', 'Longitude']].values):
#     location(cord)

#     # Save every 100 iterations
#     if i % 100 == 0:
#         with open("House Price Prediction/loc_update.pickle", "wb") as f:
#             pickle.dump(loc_update, f)
#         print(f"Saved progress at record {i}")

# # Final save after loop ends
# with open("House Price Prediction/loc_update.pickle", "wb") as f:
#     pickle.dump(loc_update, f)
# print("Final pickle file saved.")
# Reset the accumulator so re-running this cell does not append duplicate rows.
loc_update = {"County": [], "Road": [], "Neighbourhood": []}

CHECKPOINT_EVERY = 100  # number of rows between progress saves

# Select the coordinate columns by name rather than by position, so the loop
# does not depend on which other columns have been added to the frame.
for i, cord in enumerate(df[['Latitude', 'Longitude']].values):
    location(cord)

    # The public Nominatim server allows at most one request per second.
    time.sleep(1)

    # Checkpoint periodically instead of rewriting the file on every row; the
    # context manager closes the handle so each write is flushed to disk.
    if i % CHECKPOINT_EVERY == 0:
        with open('loc_update.pickle', 'wb') as file:
            pickle.dump(loc_update, file)
        print(i)

# Final save after all records have been processed.
with open('loc_update.pickle', 'wb') as file:
    pickle.dump(loc_update, file)

In [None]:

# Reload the cached reverse-geocoding results instead of querying the service again.
# NOTE(review): the geocoding loop in this notebook writes 'loc_update.pickle' to the
# working directory, while this cell reads from "House Price Prediction/" - confirm
# which location holds the file that should be loaded.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open("House Price Prediction/loc_update.pickle", "rb") as file:
    loc_update = pickle.load(file)


In [None]:
# Inspect the county values collected for each processed row.
loc_update["County"]

In [None]:
# (rows, columns) of the working frame.
df.shape

