### Imports

In [45]:
import pandas as pd
import os
import seaborn as sns

from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

### Concatenate 10 datasets

In [3]:
# current working directory
path = os.getcwd()

# show all files in current working directory
files = os.listdir(path)
files

['.ipynb_checkpoints',
 'dataframe_albuquerque.xlsx',
 'dataframe_colorado.xlsx',
 'dataframe_indianapolis.xlsx',
 'dataframe_las_vegas.xlsx',
 'dataframe_miami.xlsx',
 'dataframe_new_york.xlsx',
 'dataframe_philadelphia.xlsx',
 'dataframe_san_diego.xlsx',
 'dataframe_san_francisco.xlsx',
 'dataframe_washington.xlsx',
 'RealEstateDataCleaning.ipynb']

In [4]:
files_needed = ['dataframe_albuquerque.xlsx',
 'dataframe_colorado.xlsx',
 'dataframe_indianapolis.xlsx',
 'dataframe_las_vegas.xlsx',
 'dataframe_miami.xlsx',
 'dataframe_new_york.xlsx',
 'dataframe_philadelphia.xlsx',
 'dataframe_san_diego.xlsx',
 'dataframe_san_francisco.xlsx',
 'dataframe_washington.xlsx']

In [5]:
len(files_needed)

10

In [7]:
# initialize dataframe
df_combined = pd.DataFrame()

for f in files_needed:
    df_combined = df_combined.append(pd.read_excel(f, 'Sheet1'))

In [8]:
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
0,9400 La Playa St NE,4 Beds,3 Baths,"2,000 sqft",1975,2 Car Garage,"$330,000",Albuquerque
1,460 Wesley Ct SW,4 Beds,3 Baths,"2,212 sqft",2003,2 Car Garage,"$289,900",Albuquerque
2,1307 Bernardino Rd NW,2 Beds,1 Bath,815 sqft,1948,No Info,"$155,000",Albuquerque
3,5215 Cimarron Rd NW,3 Beds,2 Baths,"1,600 sqft",1977,2 Car Garage,"$269,900",Albuquerque
4,2804 Charleston St NE,4 Beds,3 Baths,"2,445 sqft",1963,2 Car Garage,"$450,000",Albuquerque
...,...,...,...,...,...,...,...,...
855,1715 Euclid St NW,6 Beds,,,1906,Open Parking,"$1,900,000",Washington
856,2015 Q St NW,6 Beds,,"5,285 sqft",1925,2 Parking Spaces,"$2,750,000",Washington
857,922 24th St NW #5B,1 Bed,1 Bath,743 sqft,1962,1 Car Garage,"$319,500",Washington
858,1099 22nd St NW #401,3 Beds,3 Baths,"2,078 sqft",1985,2 Car Garage,"$1,375,000",Washington


# Data Cleaning Process

- Check for missing Data 

- Reset/Change Index number (e.g. row number)

- Check Doubles

---

- Change Price Column

- Price Data : Cut "$" in Price

- Price Data : Cut "," in Price
---

- Change Area Column

- Area Data : Cut "sqft" string element

- Area Data : Cut "," string element
---

- Bedrooms Data : Cut 'Beds' string element

- Bedrooms Data: "Studio" = 1 room

- Bathrooms Data : Cut 'Baths' string element
---

- Parking Data : yes/no
---

- Convert all numbers to integers
---

- New Column "price/sqft"
- Check for Outliers
- Save to Excel


### Check for missing data

In [9]:
df_combined.info()
# data missing for beds, baths, etc, looks complete for Address, Location

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8755 entries, 0 to 859
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Address     8755 non-null   object
 1   Bedrooms    8255 non-null   object
 2   Bathrooms   8145 non-null   object
 3   Area        8020 non-null   object
 4   Year Built  8333 non-null   object
 5   Parking     8263 non-null   object
 6   Price       8752 non-null   object
 7   Location    8755 non-null   object
dtypes: object(8)
memory usage: 615.6+ KB


In [11]:
# check for at least 1 missing value in a row
df_combined[df_combined.isna().any(axis = 1)]

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
370,Pajarito Mesa,,,,,,"$13,000",Albuquerque
376,1033 Forrester Ave NW,Studio,,600 sqft,1977,No Info,"$205,000",Albuquerque
400,1920 Broadway Blvd SE,2 Beds,1 Bath,,1962,No Info,"$245,000",Albuquerque
402,1990 1/2 Cherokee Rd NW,,,,,,"$175,000",Albuquerque
404,1301 Iron Ave SW,2 Beds,2 Baths,,1916,No Info,"$355,000",Albuquerque
...,...,...,...,...,...,...,...,...
798,235 Emerson St NW #101,1 Bed,1 Bath,,1929,Open Parking,"$209,900",Washington
817,2308 Tracy Pl NW,6 Beds,6 Baths,,1919,Garage,"$4,888,000",Washington
818,2017 Rear 2 St NE,,,,,,"$295,000",Washington
855,1715 Euclid St NW,6 Beds,,,1906,Open Parking,"$1,900,000",Washington


In [16]:
# check for missing values in a heat map
sns.heatmap(df_combined.isna(), yticklabels = False, cbar = False, cmap = 'BuGn')
# unsure why heatmap not visualizing...

<AxesSubplot:>

In [17]:
# drop all entries where 1 value (in a row) is missing
# need to restate variable in order to propery overwrite
df_combined = df_combined.dropna()

In [18]:
# now all rows have full data
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7826 entries, 0 to 859
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Address     7826 non-null   object
 1   Bedrooms    7826 non-null   object
 2   Bathrooms   7826 non-null   object
 3   Area        7826 non-null   object
 4   Year Built  7826 non-null   object
 5   Parking     7826 non-null   object
 6   Price       7826 non-null   object
 7   Location    7826 non-null   object
dtypes: object(8)
memory usage: 550.3+ KB


###  Reset/Change Index

In [19]:
# since all files were combined the row numbers are not consecutive through the dataframe
# now we fix that
df_combined = df_combined.reset_index(drop = True)

In [20]:
# now there is a unique index for each row
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
0,9400 La Playa St NE,4 Beds,3 Baths,"2,000 sqft",1975,2 Car Garage,"$330,000",Albuquerque
1,460 Wesley Ct SW,4 Beds,3 Baths,"2,212 sqft",2003,2 Car Garage,"$289,900",Albuquerque
2,1307 Bernardino Rd NW,2 Beds,1 Bath,815 sqft,1948,No Info,"$155,000",Albuquerque
3,5215 Cimarron Rd NW,3 Beds,2 Baths,"1,600 sqft",1977,2 Car Garage,"$269,900",Albuquerque
4,2804 Charleston St NE,4 Beds,3 Baths,"2,445 sqft",1963,2 Car Garage,"$450,000",Albuquerque
...,...,...,...,...,...,...,...,...
7821,2232 Mount View Pl SE,3 Beds,4 Baths,"1,200 sqft",1933,Open Parking,"$829,900",Washington
7822,1111 25th St NW #304,2 Beds,1 Bath,835 sqft,2005,1 Car Garage,"$549,900",Washington
7823,922 24th St NW #5B,1 Bed,1 Bath,743 sqft,1962,1 Car Garage,"$319,500",Washington
7824,1099 22nd St NW #401,3 Beds,3 Baths,"2,078 sqft",1985,2 Car Garage,"$1,375,000",Washington


###  Check for Outliers and Doubles

In [21]:
# check for duplicates, outputs number of duplicates
df_combined.duplicated().sum()

1580

In [22]:
df_combined.loc[df_combined.duplicated(), :]
# can additionally cross-check via manual "find all" of example address in raw data file

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
80,1829 Illinois St NE,5 Beds,2 Baths,"1,797 sqft",1954,1 Car Garage,"$225,000",Albuquerque
81,9109 Crestwood Ave NE,3 Beds,3 Baths,"2,183 sqft",1963,2 Car Garage,"$320,000",Albuquerque
82,3401 Cagua Dr NE,3 Beds,2 Baths,"1,831 sqft",1955,2 Carport Spaces,"$269,000",Albuquerque
83,1325 Hertz Dr SE,3 Beds,3 Baths,"3,335 sqft",1975,2 Car Garage,"$435,000",Albuquerque
84,1020 Indiana St SE,4 Beds,2 Baths,"1,950 sqft",1953,Garage,"$299,900",Albuquerque
...,...,...,...,...,...,...,...,...
7748,525 Water St SW #420,3 Beds,3 Baths,"1,724 sqft",2016,2 Car Garage,"$1,375,000",Washington
7749,1032 Lamont St NW #A,3 Beds,3 Baths,"1,464 sqft",1907,Open Parking,"$749,000",Washington
7750,2403 Savannah St SE,3 Beds,2 Baths,"1,405 sqft",1950,1 Open Spaces,"$449,900",Washington
7751,7508 Eastern Ave NW,3 Beds,3 Baths,"1,088 sqft",1948,Open Parking,"$599,000",Washington


In [23]:
# remove duplicate entries (1580 rows!)
# If you want to preview this first, run: df_combined.drop_duplicates() and don't assign to variable
# so then it would not be overwritten
df_combined = df_combined.drop_duplicates()

In [24]:
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
0,9400 La Playa St NE,4 Beds,3 Baths,"2,000 sqft",1975,2 Car Garage,"$330,000",Albuquerque
1,460 Wesley Ct SW,4 Beds,3 Baths,"2,212 sqft",2003,2 Car Garage,"$289,900",Albuquerque
2,1307 Bernardino Rd NW,2 Beds,1 Bath,815 sqft,1948,No Info,"$155,000",Albuquerque
3,5215 Cimarron Rd NW,3 Beds,2 Baths,"1,600 sqft",1977,2 Car Garage,"$269,900",Albuquerque
4,2804 Charleston St NE,4 Beds,3 Baths,"2,445 sqft",1963,2 Car Garage,"$450,000",Albuquerque
...,...,...,...,...,...,...,...,...
7821,2232 Mount View Pl SE,3 Beds,4 Baths,"1,200 sqft",1933,Open Parking,"$829,900",Washington
7822,1111 25th St NW #304,2 Beds,1 Bath,835 sqft,2005,1 Car Garage,"$549,900",Washington
7823,922 24th St NW #5B,1 Bed,1 Bath,743 sqft,1962,1 Car Garage,"$319,500",Washington
7824,1099 22nd St NW #401,3 Beds,3 Baths,"2,078 sqft",1985,2 Car Garage,"$1,375,000",Washington


In [26]:
# and now reset index
df_combined = df_combined.reset_index(drop = True)

In [27]:
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
0,9400 La Playa St NE,4 Beds,3 Baths,"2,000 sqft",1975,2 Car Garage,"$330,000",Albuquerque
1,460 Wesley Ct SW,4 Beds,3 Baths,"2,212 sqft",2003,2 Car Garage,"$289,900",Albuquerque
2,1307 Bernardino Rd NW,2 Beds,1 Bath,815 sqft,1948,No Info,"$155,000",Albuquerque
3,5215 Cimarron Rd NW,3 Beds,2 Baths,"1,600 sqft",1977,2 Car Garage,"$269,900",Albuquerque
4,2804 Charleston St NE,4 Beds,3 Baths,"2,445 sqft",1963,2 Car Garage,"$450,000",Albuquerque
...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3 Beds,4 Baths,"1,200 sqft",1933,Open Parking,"$829,900",Washington
6242,1111 25th St NW #304,2 Beds,1 Bath,835 sqft,2005,1 Car Garage,"$549,900",Washington
6243,922 24th St NW #5B,1 Bed,1 Bath,743 sqft,1962,1 Car Garage,"$319,500",Washington
6244,1099 22nd St NW #401,3 Beds,3 Baths,"2,078 sqft",1985,2 Car Garage,"$1,375,000",Washington


###  Change Price Column

In [28]:
# rename price column and update existing dataframe
df_combined = df_combined.rename(columns = {"Price":"Price($)"})
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price($),Location
0,9400 La Playa St NE,4 Beds,3 Baths,"2,000 sqft",1975,2 Car Garage,"$330,000",Albuquerque
1,460 Wesley Ct SW,4 Beds,3 Baths,"2,212 sqft",2003,2 Car Garage,"$289,900",Albuquerque
2,1307 Bernardino Rd NW,2 Beds,1 Bath,815 sqft,1948,No Info,"$155,000",Albuquerque
3,5215 Cimarron Rd NW,3 Beds,2 Baths,"1,600 sqft",1977,2 Car Garage,"$269,900",Albuquerque
4,2804 Charleston St NE,4 Beds,3 Baths,"2,445 sqft",1963,2 Car Garage,"$450,000",Albuquerque
...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3 Beds,4 Baths,"1,200 sqft",1933,Open Parking,"$829,900",Washington
6242,1111 25th St NW #304,2 Beds,1 Bath,835 sqft,2005,1 Car Garage,"$549,900",Washington
6243,922 24th St NW #5B,1 Bed,1 Bath,743 sqft,1962,1 Car Garage,"$319,500",Washington
6244,1099 22nd St NW #401,3 Beds,3 Baths,"2,078 sqft",1985,2 Car Garage,"$1,375,000",Washington


###  Price Data : Cut '$' in Price

In [29]:
# we want to override only the Price($) column, not the whole dataframe
df_combined['Price($)'] = df_combined['Price($)'].str.strip('$')

In [30]:
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price($),Location
0,9400 La Playa St NE,4 Beds,3 Baths,"2,000 sqft",1975,2 Car Garage,330000,Albuquerque
1,460 Wesley Ct SW,4 Beds,3 Baths,"2,212 sqft",2003,2 Car Garage,289900,Albuquerque
2,1307 Bernardino Rd NW,2 Beds,1 Bath,815 sqft,1948,No Info,155000,Albuquerque
3,5215 Cimarron Rd NW,3 Beds,2 Baths,"1,600 sqft",1977,2 Car Garage,269900,Albuquerque
4,2804 Charleston St NE,4 Beds,3 Baths,"2,445 sqft",1963,2 Car Garage,450000,Albuquerque
...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3 Beds,4 Baths,"1,200 sqft",1933,Open Parking,829900,Washington
6242,1111 25th St NW #304,2 Beds,1 Bath,835 sqft,2005,1 Car Garage,549900,Washington
6243,922 24th St NW #5B,1 Bed,1 Bath,743 sqft,1962,1 Car Garage,319500,Washington
6244,1099 22nd St NW #401,3 Beds,3 Baths,"2,078 sqft",1985,2 Car Garage,1375000,Washington


###  Price Data : Cut ',' in Price

In [31]:
# can use strip or replace function
df_combined['Price($)'] = df_combined['Price($)'].str.replace(',', '')
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price($),Location
0,9400 La Playa St NE,4 Beds,3 Baths,"2,000 sqft",1975,2 Car Garage,330000,Albuquerque
1,460 Wesley Ct SW,4 Beds,3 Baths,"2,212 sqft",2003,2 Car Garage,289900,Albuquerque
2,1307 Bernardino Rd NW,2 Beds,1 Bath,815 sqft,1948,No Info,155000,Albuquerque
3,5215 Cimarron Rd NW,3 Beds,2 Baths,"1,600 sqft",1977,2 Car Garage,269900,Albuquerque
4,2804 Charleston St NE,4 Beds,3 Baths,"2,445 sqft",1963,2 Car Garage,450000,Albuquerque
...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3 Beds,4 Baths,"1,200 sqft",1933,Open Parking,829900,Washington
6242,1111 25th St NW #304,2 Beds,1 Bath,835 sqft,2005,1 Car Garage,549900,Washington
6243,922 24th St NW #5B,1 Bed,1 Bath,743 sqft,1962,1 Car Garage,319500,Washington
6244,1099 22nd St NW #401,3 Beds,3 Baths,"2,078 sqft",1985,2 Car Garage,1375000,Washington


### Change Area Column

In [32]:
# rename area column and update existing dataframe
df_combined = df_combined.rename(columns = {"Area":"Area(Sqft)"})
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area(Sqft),Year Built,Parking,Price($),Location
0,9400 La Playa St NE,4 Beds,3 Baths,"2,000 sqft",1975,2 Car Garage,330000,Albuquerque
1,460 Wesley Ct SW,4 Beds,3 Baths,"2,212 sqft",2003,2 Car Garage,289900,Albuquerque
2,1307 Bernardino Rd NW,2 Beds,1 Bath,815 sqft,1948,No Info,155000,Albuquerque
3,5215 Cimarron Rd NW,3 Beds,2 Baths,"1,600 sqft",1977,2 Car Garage,269900,Albuquerque
4,2804 Charleston St NE,4 Beds,3 Baths,"2,445 sqft",1963,2 Car Garage,450000,Albuquerque
...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3 Beds,4 Baths,"1,200 sqft",1933,Open Parking,829900,Washington
6242,1111 25th St NW #304,2 Beds,1 Bath,835 sqft,2005,1 Car Garage,549900,Washington
6243,922 24th St NW #5B,1 Bed,1 Bath,743 sqft,1962,1 Car Garage,319500,Washington
6244,1099 22nd St NW #401,3 Beds,3 Baths,"2,078 sqft",1985,2 Car Garage,1375000,Washington


###  Area Data : Cut 'sqft' string element

In [33]:
# also white space prior to sqft
df_combined['Area(Sqft)'] = df_combined['Area(Sqft)'].str.strip(' sqft')
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area(Sqft),Year Built,Parking,Price($),Location
0,9400 La Playa St NE,4 Beds,3 Baths,2000,1975,2 Car Garage,330000,Albuquerque
1,460 Wesley Ct SW,4 Beds,3 Baths,2212,2003,2 Car Garage,289900,Albuquerque
2,1307 Bernardino Rd NW,2 Beds,1 Bath,815,1948,No Info,155000,Albuquerque
3,5215 Cimarron Rd NW,3 Beds,2 Baths,1600,1977,2 Car Garage,269900,Albuquerque
4,2804 Charleston St NE,4 Beds,3 Baths,2445,1963,2 Car Garage,450000,Albuquerque
...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3 Beds,4 Baths,1200,1933,Open Parking,829900,Washington
6242,1111 25th St NW #304,2 Beds,1 Bath,835,2005,1 Car Garage,549900,Washington
6243,922 24th St NW #5B,1 Bed,1 Bath,743,1962,1 Car Garage,319500,Washington
6244,1099 22nd St NW #401,3 Beds,3 Baths,2078,1985,2 Car Garage,1375000,Washington


###  Area Data : Cut ',' string element

In [34]:
# can use strip or replace function
df_combined['Area(Sqft)'] = df_combined['Area(Sqft)'].str.replace(',', '')
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area(Sqft),Year Built,Parking,Price($),Location
0,9400 La Playa St NE,4 Beds,3 Baths,2000,1975,2 Car Garage,330000,Albuquerque
1,460 Wesley Ct SW,4 Beds,3 Baths,2212,2003,2 Car Garage,289900,Albuquerque
2,1307 Bernardino Rd NW,2 Beds,1 Bath,815,1948,No Info,155000,Albuquerque
3,5215 Cimarron Rd NW,3 Beds,2 Baths,1600,1977,2 Car Garage,269900,Albuquerque
4,2804 Charleston St NE,4 Beds,3 Baths,2445,1963,2 Car Garage,450000,Albuquerque
...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3 Beds,4 Baths,1200,1933,Open Parking,829900,Washington
6242,1111 25th St NW #304,2 Beds,1 Bath,835,2005,1 Car Garage,549900,Washington
6243,922 24th St NW #5B,1 Bed,1 Bath,743,1962,1 Car Garage,319500,Washington
6244,1099 22nd St NW #401,3 Beds,3 Baths,2078,1985,2 Car Garage,1375000,Washington


### Bedrooms Data : Cut 'Beds' string element

In [35]:
df_combined['Bedrooms'] = df_combined['Bedrooms'].str.strip(' Beds')
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area(Sqft),Year Built,Parking,Price($),Location
0,9400 La Playa St NE,4,3 Baths,2000,1975,2 Car Garage,330000,Albuquerque
1,460 Wesley Ct SW,4,3 Baths,2212,2003,2 Car Garage,289900,Albuquerque
2,1307 Bernardino Rd NW,2,1 Bath,815,1948,No Info,155000,Albuquerque
3,5215 Cimarron Rd NW,3,2 Baths,1600,1977,2 Car Garage,269900,Albuquerque
4,2804 Charleston St NE,4,3 Baths,2445,1963,2 Car Garage,450000,Albuquerque
...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3,4 Baths,1200,1933,Open Parking,829900,Washington
6242,1111 25th St NW #304,2,1 Bath,835,2005,1 Car Garage,549900,Washington
6243,922 24th St NW #5B,1,1 Bath,743,1962,1 Car Garage,319500,Washington
6244,1099 22nd St NW #401,3,3 Baths,2078,1985,2 Car Garage,1375000,Washington


In [36]:
# show summary of grouped, counted Bedrooms values
df_combined['Bedrooms'].value_counts()

3         2122
2         1376
4         1312
5          525
1          473
6          214
Studio      80
7           75
8           37
9           17
11           5
10           3
12           3
13           1
84           1
14           1
24           1
Name: Bedrooms, dtype: int64

###  Bedrooms Data: Change "Studio" to = 1

In [37]:
# Use of lambda fxn: if column has 'Studio', convert to 1 otherwise do nothing
df_combined['Bedrooms'] = df_combined['Bedrooms'].apply(lambda x: 1 if 'Studio' in x else x)
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area(Sqft),Year Built,Parking,Price($),Location
0,9400 La Playa St NE,4,3 Baths,2000,1975,2 Car Garage,330000,Albuquerque
1,460 Wesley Ct SW,4,3 Baths,2212,2003,2 Car Garage,289900,Albuquerque
2,1307 Bernardino Rd NW,2,1 Bath,815,1948,No Info,155000,Albuquerque
3,5215 Cimarron Rd NW,3,2 Baths,1600,1977,2 Car Garage,269900,Albuquerque
4,2804 Charleston St NE,4,3 Baths,2445,1963,2 Car Garage,450000,Albuquerque
...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3,4 Baths,1200,1933,Open Parking,829900,Washington
6242,1111 25th St NW #304,2,1 Bath,835,2005,1 Car Garage,549900,Washington
6243,922 24th St NW #5B,1,1 Bath,743,1962,1 Car Garage,319500,Washington
6244,1099 22nd St NW #401,3,3 Baths,2078,1985,2 Car Garage,1375000,Washington


In [38]:
# check if a certain string element is inside a dataframe
# recall that in dataframe column = key and results (entires) are values
if 'Washington' in df_combined.values:
    print('Element is in the dataframe')

Element is in the dataframe


In [39]:
# can cross-check and verify that 'Studio' is totally gone
if 'Studio' in df_combined.values:
    print('Element is in the dataframe')

### Bathrooms Data : Cut 'Baths' string element

In [40]:
# This cut will also encompass 'Bath'
df_combined['Bathrooms'] = df_combined['Bathrooms'].str.strip(' Baths')
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area(Sqft),Year Built,Parking,Price($),Location
0,9400 La Playa St NE,4,3,2000,1975,2 Car Garage,330000,Albuquerque
1,460 Wesley Ct SW,4,3,2212,2003,2 Car Garage,289900,Albuquerque
2,1307 Bernardino Rd NW,2,1,815,1948,No Info,155000,Albuquerque
3,5215 Cimarron Rd NW,3,2,1600,1977,2 Car Garage,269900,Albuquerque
4,2804 Charleston St NE,4,3,2445,1963,2 Car Garage,450000,Albuquerque
...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3,4,1200,1933,Open Parking,829900,Washington
6242,1111 25th St NW #304,2,1,835,2005,1 Car Garage,549900,Washington
6243,922 24th St NW #5B,1,1,743,1962,1 Car Garage,319500,Washington
6244,1099 22nd St NW #401,3,3,2078,1985,2 Car Garage,1375000,Washington


### Parking Data : yes/no

In [41]:
df_combined['Parking'].value_counts()

Garage               1623
No Info              1136
2 Car Garage         1020
Open Parking          805
1 Car Garage          487
3 Car Garage          333
Attached Garage       158
None                  142
1 Parking Spaces      134
4 Car Garage           80
1 Carport Spaces       69
1 Open Spaces          39
2 Open Spaces          37
2 Parking Spaces       35
2 Carport Spaces       28
Carport                23
0 Open Spaces          17
6 Car Garage           14
5 Car Garage           11
4 Parking Spaces        8
3 Open Spaces           6
4 Open Spaces           5
5 Open Spaces           5
8 Car Garage            4
3 Parking Spaces        4
3 Carport Spaces        3
8 Parking Spaces        3
9 Car Garage            2
6 Parking Spaces        2
0 Carport Spaces        1
5 Carport Spaces        1
7 Car Garage            1
4 Carport Spaces        1
11 Car Garage           1
10 Car Garage           1
3.5 Car Garage          1
20 Car Garage           1
5 Parking Spaces        1
9 Parking Sp

In [43]:
# nicely group types of parking opportunities
# groupings don't fully make sense (missing 'Parking Spaces' for 'yes') but go with it...
df_combined['Parking'] = df_combined['Parking'].apply(lambda x: 'yes' if 'Garage' in x or 'Car' in x or 'Open' in x or 'Carport' in x
                                                       else 'no')
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area(Sqft),Year Built,Parking,Price($),Location
0,9400 La Playa St NE,4,3,2000,1975,yes,330000,Albuquerque
1,460 Wesley Ct SW,4,3,2212,2003,yes,289900,Albuquerque
2,1307 Bernardino Rd NW,2,1,815,1948,no,155000,Albuquerque
3,5215 Cimarron Rd NW,3,2,1600,1977,yes,269900,Albuquerque
4,2804 Charleston St NE,4,3,2445,1963,yes,450000,Albuquerque
...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3,4,1200,1933,yes,829900,Washington
6242,1111 25th St NW #304,2,1,835,2005,yes,549900,Washington
6243,922 24th St NW #5B,1,1,743,1962,yes,319500,Washington
6244,1099 22nd St NW #401,3,3,2078,1985,yes,1375000,Washington


In [44]:
df_combined['Parking'].value_counts()

yes    4780
no     1466
Name: Parking, dtype: int64

###  Convert all numbers to integers

#### Bedrooms

In [49]:
# check if bedrooms column is a numeric element
is_numeric_dtype(df_combined['Bedrooms'])
# initially outputs False

True

In [47]:
# check if bedrooms column is a string element
is_string_dtype(df_combined['Bedrooms'])

True

In [48]:
# cast Bedroom data into numeric
df_combined['Bedrooms'] = df_combined['Bedrooms'].astype(int)

#### Bathrooms

In [50]:
df_combined['Bathrooms'] = df_combined['Bathrooms'].astype(int)

#### Area(Sqft)

In [51]:
df_combined['Area(Sqft)'] = df_combined['Area(Sqft)'].astype(int)

#### Year Built

In [54]:
# check if numeric
is_numeric_dtype(df_combined['Year Built'])
# initially outputs False

False

In [55]:
# try to change to numeric
df_combined['Year Built'] = df_combined['Year Built'].astype(int)

ValueError: invalid literal for int() with base 10: 'No Info'

In [56]:
# Find all entries where Year Built contains 'No Info'
df_combined[df_combined['Year Built'] == 'No Info']
# So do we delete these rows or find a way to keep?  Previously removed rows with missing values.


Unnamed: 0,Address,Bedrooms,Bathrooms,Area(Sqft),Year Built,Parking,Price($),Location
10,2304 General Marshall St NE,3,2,1877,No Info,yes,304900,Albuquerque
24,6408 Los Pueblos Pl NW,3,3,1930,No Info,yes,250000,Albuquerque
25,2128 Altura Verde Ln NE,3,3,2060,No Info,yes,299900,Albuquerque
33,9720 Stone St NW,3,3,2378,No Info,yes,314900,Albuquerque
54,1705 Los Jardines Pl NW,2,2,864,No Info,yes,159000,Albuquerque
...,...,...,...,...,...,...,...,...
5517,875 California St #402,2,2,1444,No Info,yes,2320000,San Francisco
5532,960 Market St #304,1,1,344,No Info,no,520000,San Francisco
5534,821 Folsom St #310,1,1,717,No Info,no,779000,San Francisco
5559,72 Townsend St #707,1,2,851,No Info,yes,1250000,San Francisco


In [58]:
# Another option: replace with a '0'
df_combined['Year Built'] = df_combined['Year Built'].apply(lambda x: 0 if 'No Info' in x else x)
# get error message: argument of type 'float' is not iterable

TypeError: argument of type 'float' is not iterable

In [65]:
# Let's try instead to first save results in a string
df_combined['Year Built'] = df_combined['Year Built'].astype(str)

In [66]:
# look at unique value counts
df_combined['Year Built'].value_counts()
# ahhh we have float-like values (althought these are strings)

2021.0    306
2021      175
1925      156
1900      153
1920      144
         ... 
1876        1
1875.0      1
1903        1
1893.0      1
1874        1
Name: Year Built, Length: 288, dtype: int64

In [68]:
# use lambda to replace (string) floats with empty string
df_combined['Year Built'] = df_combined['Year Built'].apply(lambda x: x.replace('.0', '') if '.0' in x else x)
df_combined['Year Built'].value_counts()

2021    481
1900    210
1920    191
1925    184
2006    157
       ... 
1809      1
1752      1
1876      1
1834      1
1874      1
Name: Year Built, Length: 157, dtype: int64

In [69]:
# NOW let's try replacing 'No Info' with a '0'
df_combined['Year Built'] = df_combined['Year Built'].apply(lambda x: 0 if 'No Info' in x else x)

In [72]:
# And now save as integer
df_combined['Year Built'] = df_combined['Year Built'].astype(int)

In [73]:
# Cross-check to verify that this coumn is now numeric
is_numeric_dtype(df_combined['Year Built'])

True

#### Price in $

In [74]:
is_numeric_dtype(df_combined['Price($)'])

False

In [75]:
# Try to save as integer
df_combined['Price($)'] = df_combined['Price($)'].astype(int)
# get error: invalid literal for int() with  base 10: '294900+' so some entries have a plus sign

ValueError: invalid literal for int() with base 10: '294900+'

In [76]:
# Let's trim off this plus sign
df_combined['Price($)'] = df_combined['Price($)'].apply(lambda x: x.replace('+', '') if '+' in x else x) 

In [77]:
# And try to cast into integer again
df_combined['Price($)'] = df_combined['Price($)'].astype(int)

In [78]:
# verify this is now a numeric data type
is_numeric_dtype(df_combined['Price($)'])

True

###  New Column 'price/sqft'

In [79]:
df_combined['price/sqft'] = df_combined['Price($)']/df_combined['Area(Sqft)']

In [80]:
df_combined
# we now have an extra column (price/sqft) but want to round to 2 decimal places

Unnamed: 0,Address,Bedrooms,Bathrooms,Area(Sqft),Year Built,Parking,Price($),Location,Year built,price/sqft
0,9400 La Playa St NE,4,3,2000,1975,yes,330000,Albuquerque,1975,165.000000
1,460 Wesley Ct SW,4,3,2212,2003,yes,289900,Albuquerque,2003,131.057866
2,1307 Bernardino Rd NW,2,1,815,1948,no,155000,Albuquerque,1948,190.184049
3,5215 Cimarron Rd NW,3,2,1600,1977,yes,269900,Albuquerque,1977,168.687500
4,2804 Charleston St NE,4,3,2445,1963,yes,450000,Albuquerque,1963,184.049080
...,...,...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3,4,1200,1933,yes,829900,Washington,1933,691.583333
6242,1111 25th St NW #304,2,1,835,2005,yes,549900,Washington,2005,658.562874
6243,922 24th St NW #5B,1,1,743,1962,yes,319500,Washington,1962,430.013459
6244,1099 22nd St NW #401,3,3,2078,1985,yes,1375000,Washington,1985,661.693936


In [82]:
df_combined['price/sqft'] = df_combined['price/sqft'].round(2)
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area(Sqft),Year Built,Parking,Price($),Location,Year built,price/sqft
0,9400 La Playa St NE,4,3,2000,1975,yes,330000,Albuquerque,1975,165.00
1,460 Wesley Ct SW,4,3,2212,2003,yes,289900,Albuquerque,2003,131.06
2,1307 Bernardino Rd NW,2,1,815,1948,no,155000,Albuquerque,1948,190.18
3,5215 Cimarron Rd NW,3,2,1600,1977,yes,269900,Albuquerque,1977,168.69
4,2804 Charleston St NE,4,3,2445,1963,yes,450000,Albuquerque,1963,184.05
...,...,...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3,4,1200,1933,yes,829900,Washington,1933,691.58
6242,1111 25th St NW #304,2,1,835,2005,yes,549900,Washington,2005,658.56
6243,922 24th St NW #5B,1,1,743,1962,yes,319500,Washington,1962,430.01
6244,1099 22nd St NW #401,3,3,2078,1985,yes,1375000,Washington,1985,661.69


### Check for outliers

In [83]:
df_combined['price/sqft'].describe()
# mix and max values seem pretty extreme

count      6246.000000
mean        616.918277
std        7205.666641
min           0.440000
25%         189.710000
50%         311.160000
75%         596.375000
max      560000.000000
Name: price/sqft, dtype: float64

In [84]:
# Locate entries where price/sqft is less than 10
df_combined.loc[df_combined['price/sqft'] < 10]
# dang.  cross-check an address to visually verify in online listing
# decide if wish to keep or remove

Unnamed: 0,Address,Bedrooms,Bathrooms,Area(Sqft),Year Built,Parking,Price($),Location,Year built,price/sqft
1144,1216 E Vermont St,3,3,4317,1880,no,1895,Indianapolis,1880,0.44
1470,3135 Ralston Ave,1,1,1214,1910,no,6000,Indianapolis,1910,4.94
3309,21-77 33rd St #4G,2,1,430000,1923,no,329000,New York,1923,0.77
3448,60 Knolls Cres #1J,3,1,237800,1953,yes,190162,New York,1953,0.8
3734,72-61 113th St #5B,1,1,137820,1951,yes,178000,New York,1951,1.29


In [85]:
# Locate entries where price/sqft equals 560000
df_combined.loc[df_combined['price/sqft'] == 560000]
# my stars.  can cross-check address online, note lists sqft = 1, so likely listing error

Unnamed: 0,Address,Bedrooms,Bathrooms,Area(Sqft),Year Built,Parking,Price($),Location,Year built,price/sqft
3564,3069 Heath Ave,3,2,1,1899,no,560000,New York,1899,560000.0


In [87]:
# drop this outlier
df_combined = df_combined.drop(3564)

KeyError: '[3564] not found in axis'

In [88]:
# and now there is 1 row less in dataframe
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area(Sqft),Year Built,Parking,Price($),Location,Year built,price/sqft
0,9400 La Playa St NE,4,3,2000,1975,yes,330000,Albuquerque,1975,165.00
1,460 Wesley Ct SW,4,3,2212,2003,yes,289900,Albuquerque,2003,131.06
2,1307 Bernardino Rd NW,2,1,815,1948,no,155000,Albuquerque,1948,190.18
3,5215 Cimarron Rd NW,3,2,1600,1977,yes,269900,Albuquerque,1977,168.69
4,2804 Charleston St NE,4,3,2445,1963,yes,450000,Albuquerque,1963,184.05
...,...,...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3,4,1200,1933,yes,829900,Washington,1933,691.58
6242,1111 25th St NW #304,2,1,835,2005,yes,549900,Washington,2005,658.56
6243,922 24th St NW #5B,1,1,743,1962,yes,319500,Washington,1962,430.01
6244,1099 22nd St NW #401,3,3,2078,1985,yes,1375000,Washington,1985,661.69


###  Save in Excel

In [89]:
df_combined.to_excel('cleaned_data.xlsx', index = False)