Using semi-supervised learning to generate pseudo labels for predicting house price with regression 

In [3]:
import pandas as pd 

from sklearn.model_selection import train_test_split 

from sklearn.linear_model import LinearRegression 


import numpy as np 

In [4]:
# Load the data 

labeled_data = pd.read_csv("housing 5.csv") 

In [5]:
labeled_data.drop('ocean_proximity', axis=1, inplace=True)

In [6]:
labeled_data.dropna(inplace=True)

In [7]:
labeled_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [8]:
labeled_data.shape

(20433, 9)

In [9]:
 

# Split the labeled data into training and testing sets 

train_data, test_data, train_labels, test_labels = train_test_split(labeled_data.drop('median_house_value', axis=1), labeled_data['median_house_value'], test_size=0.2) 

In [10]:
train_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
15191,-117.05,33.02,18.0,917.0,121.0,388.0,131.0,6.3517
12240,-116.96,33.75,35.0,3269.0,757.0,2328.0,705.0,2.5898
17557,-121.89,37.33,42.0,1279.0,358.0,1254.0,340.0,2.2583
14387,-117.23,32.75,21.0,2050.0,608.0,1131.0,550.0,2.4779
9086,-118.19,34.65,33.0,1781.0,326.0,913.0,314.0,3.9963


In [11]:
test_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
7135,-118.1,34.01,42.0,1436.0,298.0,1005.0,298.0,3.4297
14058,-117.13,32.74,52.0,1512.0,321.0,651.0,321.0,3.6852
18973,-122.03,38.24,16.0,1104.0,164.0,495.0,156.0,5.4074
270,-122.18,37.78,33.0,142.0,31.0,575.0,47.0,3.875
17906,-121.96,37.36,16.0,5040.0,1325.0,3150.0,1196.0,4.2837


In [12]:
train_labels.head()

15191    260100.0
12240     76300.0
17557    192500.0
14387    165000.0
9086     126800.0
Name: median_house_value, dtype: float64

In [13]:
train_labels.shape

(16346,)

In [14]:
test_labels.head()

7135     195800.0
14058    185300.0
18973    157700.0
270      225000.0
17906    264500.0
Name: median_house_value, dtype: float64

In [15]:
 

# Train a linear regression model on the labeled data 

regressor = LinearRegression() 

regressor.fit(train_data, train_labels) 

In [16]:
 

# Use the trained model to predict the labels of the unlabeled data 

predicted_labels = regressor.predict(test_data) 

In [17]:
# Evaluate the performance of the  model on the test data 

score = regressor.score(test_data, test_labels) 

print("R^2 Score: ", score) 

R^2 Score:  0.624186740765541


In [18]:
predicted_labels

array([199119.33657977, 249558.46240335, 237124.60821986, ...,
       147804.54787427, 190650.25141633, 193903.17176697])

In [19]:
 

# Combine the labeled and newly predicted(psuedo labeled) data 

new_data = pd.concat([train_data, test_data], ignore_index=True) 

new_data['price'] = pd.concat([train_labels, pd.Series(predicted_labels)], ignore_index=True) 

In [20]:
new_data.tail

<bound method NDFrame.tail of        longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -117.05     33.02                18.0        917.0           121.0   
1        -116.96     33.75                35.0       3269.0           757.0   
2        -121.89     37.33                42.0       1279.0           358.0   
3        -117.23     32.75                21.0       2050.0           608.0   
4        -118.19     34.65                33.0       1781.0           326.0   
...          ...       ...                 ...          ...             ...   
20428    -118.09     34.15                52.0        670.0           141.0   
20429    -118.21     34.13                52.0       2465.0           611.0   
20430    -120.80     37.52                13.0       2920.0           481.0   
20431    -118.05     33.93                31.0        894.0           203.0   
20432    -118.10     34.57                 7.0      20377.0          4335.0   

       population  ho

In [21]:
# Train a new model on the combined data 

new_train_data, new_test_data, new_train_labels, new_test_labels = train_test_split(new_data.drop('price', axis=1), new_data['price'], test_size=0.2) 

new_regressor = LinearRegression() 

new_regressor.fit(new_train_data, new_train_labels) 

In [22]:
# Evaluate the performance of the new model on the test data 

score = new_regressor.score(new_test_data, new_test_labels) 

print("R^2 Score: ", score) 

R^2 Score:  0.6905783112767134


In summary, a high R-squared score can be an indication of a good model, but it's crucial to interpret it within the context of the problem, alongside other relevant metrics, and to avoid potential pitfalls like overfitting.