### Imports

In [13]:
import pandas as pd
import os
import seaborn as sns

### Concatenate 10 datasets

In [3]:
# Directory the notebook kernel is running in; the relative file names
# passed to pd.read_excel further down are resolved against it.
path = os.getcwd()

# List every entry in that directory. os.listdir returns names in arbitrary
# order and includes non-data entries (e.g. the notebook file itself).
files = os.listdir(path)
files

['.ipynb_checkpoints',
 'dataframe_albuquerque.xlsx',
 'dataframe_colorado.xlsx',
 'dataframe_indianapolis.xlsx',
 'dataframe_las_vegas.xlsx',
 'dataframe_miami.xlsx',
 'dataframe_new_york.xlsx',
 'dataframe_philadelphia.xlsx',
 'dataframe_san_diego.xlsx',
 'dataframe_san_francisco.xlsx',
 'dataframe_washington.xlsx',
 'RealEstateDataCleaning.ipynb']

In [4]:
# The ten per-city workbooks to combine, in the order they will be read.
files_needed = [
    f'dataframe_{city}.xlsx'
    for city in (
        'albuquerque',
        'colorado',
        'indianapolis',
        'las_vegas',
        'miami',
        'new_york',
        'philadelphia',
        'san_diego',
        'san_francisco',
        'washington',
    )
]

In [5]:
# sanity check: one workbook per city, ten in total, is expected
len(files_needed)

10

In [7]:
# Read 'Sheet1' of every workbook and stack the frames with a single concat call.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling it
# inside a loop re-copies all previously accumulated rows on every iteration.
# ignore_index is left at its default, so each file keeps its own row labels
# (labels therefore repeat across cities until the index is reset later on).
df_combined = pd.concat(
    [pd.read_excel(f, sheet_name = 'Sheet1') for f in files_needed]
)

In [8]:
# preview the combined frame (pandas shows only the first and last rows of a long frame)
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
0,9400 La Playa St NE,4 Beds,3 Baths,"2,000 sqft",1975,2 Car Garage,"$330,000",Albuquerque
1,460 Wesley Ct SW,4 Beds,3 Baths,"2,212 sqft",2003,2 Car Garage,"$289,900",Albuquerque
2,1307 Bernardino Rd NW,2 Beds,1 Bath,815 sqft,1948,No Info,"$155,000",Albuquerque
3,5215 Cimarron Rd NW,3 Beds,2 Baths,"1,600 sqft",1977,2 Car Garage,"$269,900",Albuquerque
4,2804 Charleston St NE,4 Beds,3 Baths,"2,445 sqft",1963,2 Car Garage,"$450,000",Albuquerque
...,...,...,...,...,...,...,...,...
855,1715 Euclid St NW,6 Beds,,,1906,Open Parking,"$1,900,000",Washington
856,2015 Q St NW,6 Beds,,"5,285 sqft",1925,2 Parking Spaces,"$2,750,000",Washington
857,922 24th St NW #5B,1 Bed,1 Bath,743 sqft,1962,1 Car Garage,"$319,500",Washington
858,1099 22nd St NW #401,3 Beds,3 Baths,"2,078 sqft",1985,2 Car Garage,"$1,375,000",Washington


# Data Cleaning Process

- Check for missing Data 

- Reset/Change Index number (e.g. row number)

- Check for duplicates

---

- Change Price Column

- Price Data : Cut "$" in Price

- Price Data : Cut "," in Price
---

- Change Area Column

- Area Data : Cut "sqft" string element

- Area Data : Cut "," string element
---

- Bedrooms Data : Cut 'Beds' string element

- Bedrooms Data: "Studio" = 1 room

- Bathrooms Data : Cut 'Baths' string element
---

- Parking Data : yes/no
---

- Convert all numbers to integers
---

- New Column "price/sqft"
- Check for Outliers
- Save to Excel


### Check for missing data

In [9]:
# Summarise dtypes and non-null counts per column to spot missing data.
df_combined.info()
# Address and Location are complete; every other column has gaps
# (Bedrooms, Bathrooms, Area, Year Built, Parking, and 3 rows without a Price).

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8755 entries, 0 to 859
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Address     8755 non-null   object
 1   Bedrooms    8255 non-null   object
 2   Bathrooms   8145 non-null   object
 3   Area        8020 non-null   object
 4   Year Built  8333 non-null   object
 5   Parking     8263 non-null   object
 6   Price       8752 non-null   object
 7   Location    8755 non-null   object
dtypes: object(8)
memory usage: 615.6+ KB


In [11]:
# Show every listing that has a gap in at least one of its columns.
df_combined[df_combined.isnull().any(axis = 'columns')]

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
370,Pajarito Mesa,,,,,,"$13,000",Albuquerque
376,1033 Forrester Ave NW,Studio,,600 sqft,1977,No Info,"$205,000",Albuquerque
400,1920 Broadway Blvd SE,2 Beds,1 Bath,,1962,No Info,"$245,000",Albuquerque
402,1990 1/2 Cherokee Rd NW,,,,,,"$175,000",Albuquerque
404,1301 Iron Ave SW,2 Beds,2 Baths,,1916,No Info,"$355,000",Albuquerque
...,...,...,...,...,...,...,...,...
798,235 Emerson St NW #101,1 Bed,1 Bath,,1929,Open Parking,"$209,900",Washington
817,2308 Tracy Pl NW,6 Beds,6 Baths,,1919,Garage,"$4,888,000",Washington
818,2017 Rear 2 St NE,,,,,,"$295,000",Washington
855,1715 Euclid St NW,6 Beds,,,1906,Open Parking,"$1,900,000",Washington


In [16]:
# Plot the missing-value mask as a heat map (row tick labels and colour bar hidden).
sns.heatmap(df_combined.isna(), yticklabels = False, cbar = False, cmap = 'BuGn')
# NOTE(review): only the Axes repr is displayed here instead of a figure.
# Presumably the inline matplotlib backend is not active in this environment --
# TODO confirm, e.g. by running `%matplotlib inline` first or by calling
# matplotlib.pyplot.show() after the plot.

<AxesSubplot:>

In [17]:
# Keep only the listings with no missing field at all.
# dropna() returns a new frame instead of modifying in place,
# so the result has to be assigned back to the same name.
df_combined = df_combined.dropna(axis = 'index', how = 'any')

In [18]:
# now all rows have full data: every column's non-null count should equal the number of entries
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7826 entries, 0 to 859
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Address     7826 non-null   object
 1   Bedrooms    7826 non-null   object
 2   Bathrooms   7826 non-null   object
 3   Area        7826 non-null   object
 4   Year Built  7826 non-null   object
 5   Parking     7826 non-null   object
 6   Price       7826 non-null   object
 7   Location    7826 non-null   object
dtypes: object(8)
memory usage: 550.3+ KB


###  Reset/Change Index

In [19]:
# Each source file carried its own 0-based row labels, so after combining the files
# (and dropping incomplete rows) the labels repeat and are not consecutive.
# Renumber the rows 0..n-1; drop = True discards the old labels instead of
# keeping them as an extra column.
df_combined = df_combined.reset_index(drop = True)

In [20]:
# now every row has its own unique, consecutive index label
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
0,9400 La Playa St NE,4 Beds,3 Baths,"2,000 sqft",1975,2 Car Garage,"$330,000",Albuquerque
1,460 Wesley Ct SW,4 Beds,3 Baths,"2,212 sqft",2003,2 Car Garage,"$289,900",Albuquerque
2,1307 Bernardino Rd NW,2 Beds,1 Bath,815 sqft,1948,No Info,"$155,000",Albuquerque
3,5215 Cimarron Rd NW,3 Beds,2 Baths,"1,600 sqft",1977,2 Car Garage,"$269,900",Albuquerque
4,2804 Charleston St NE,4 Beds,3 Baths,"2,445 sqft",1963,2 Car Garage,"$450,000",Albuquerque
...,...,...,...,...,...,...,...,...
7821,2232 Mount View Pl SE,3 Beds,4 Baths,"1,200 sqft",1933,Open Parking,"$829,900",Washington
7822,1111 25th St NW #304,2 Beds,1 Bath,835 sqft,2005,1 Car Garage,"$549,900",Washington
7823,922 24th St NW #5B,1 Bed,1 Bath,743 sqft,1962,1 Car Garage,"$319,500",Washington
7824,1099 22nd St NW #401,3 Beds,3 Baths,"2,078 sqft",1985,2 Car Garage,"$1,375,000",Washington


###  Check for Outliers and Duplicates

In [21]:
# Count duplicate rows: duplicated() flags a row when all of its column values
# match an earlier row (the first occurrence itself is not flagged).
df_combined.duplicated().sum()

1580

In [22]:
# List the repeated rows themselves (first occurrences are not included).
# An example address can additionally be cross-checked with a manual "find all" in the raw data file.
df_combined[df_combined.duplicated(keep = 'first')]

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
80,1829 Illinois St NE,5 Beds,2 Baths,"1,797 sqft",1954,1 Car Garage,"$225,000",Albuquerque
81,9109 Crestwood Ave NE,3 Beds,3 Baths,"2,183 sqft",1963,2 Car Garage,"$320,000",Albuquerque
82,3401 Cagua Dr NE,3 Beds,2 Baths,"1,831 sqft",1955,2 Carport Spaces,"$269,000",Albuquerque
83,1325 Hertz Dr SE,3 Beds,3 Baths,"3,335 sqft",1975,2 Car Garage,"$435,000",Albuquerque
84,1020 Indiana St SE,4 Beds,2 Baths,"1,950 sqft",1953,Garage,"$299,900",Albuquerque
...,...,...,...,...,...,...,...,...
7748,525 Water St SW #420,3 Beds,3 Baths,"1,724 sqft",2016,2 Car Garage,"$1,375,000",Washington
7749,1032 Lamont St NW #A,3 Beds,3 Baths,"1,464 sqft",1907,Open Parking,"$749,000",Washington
7750,2403 Savannah St SE,3 Beds,2 Baths,"1,405 sqft",1950,1 Open Spaces,"$449,900",Washington
7751,7508 Eastern Ave NW,3 Beds,3 Baths,"1,088 sqft",1948,Open Parking,"$599,000",Washington


In [23]:
# Remove the duplicate entries (1580 rows!), keeping the first occurrence of each.
# drop_duplicates() returns a new frame: run it without the assignment to preview
# the result without overwriting df_combined.
df_combined = df_combined.drop_duplicates(keep = 'first')

In [24]:
# inspect the de-duplicated frame; the index labels still have gaps where rows were removed
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
0,9400 La Playa St NE,4 Beds,3 Baths,"2,000 sqft",1975,2 Car Garage,"$330,000",Albuquerque
1,460 Wesley Ct SW,4 Beds,3 Baths,"2,212 sqft",2003,2 Car Garage,"$289,900",Albuquerque
2,1307 Bernardino Rd NW,2 Beds,1 Bath,815 sqft,1948,No Info,"$155,000",Albuquerque
3,5215 Cimarron Rd NW,3 Beds,2 Baths,"1,600 sqft",1977,2 Car Garage,"$269,900",Albuquerque
4,2804 Charleston St NE,4 Beds,3 Baths,"2,445 sqft",1963,2 Car Garage,"$450,000",Albuquerque
...,...,...,...,...,...,...,...,...
7821,2232 Mount View Pl SE,3 Beds,4 Baths,"1,200 sqft",1933,Open Parking,"$829,900",Washington
7822,1111 25th St NW #304,2 Beds,1 Bath,835 sqft,2005,1 Car Garage,"$549,900",Washington
7823,922 24th St NW #5B,1 Bed,1 Bath,743 sqft,1962,1 Car Garage,"$319,500",Washington
7824,1099 22nd St NW #401,3 Beds,3 Baths,"2,078 sqft",1985,2 Car Garage,"$1,375,000",Washington


In [26]:
# Removing duplicates left gaps in the index labels, so renumber the rows 0..n-1 again
# (drop = True discards the old labels rather than keeping them as a column).
df_combined = df_combined.reset_index(drop = True)

In [27]:
# confirm the index is consecutive again
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
0,9400 La Playa St NE,4 Beds,3 Baths,"2,000 sqft",1975,2 Car Garage,"$330,000",Albuquerque
1,460 Wesley Ct SW,4 Beds,3 Baths,"2,212 sqft",2003,2 Car Garage,"$289,900",Albuquerque
2,1307 Bernardino Rd NW,2 Beds,1 Bath,815 sqft,1948,No Info,"$155,000",Albuquerque
3,5215 Cimarron Rd NW,3 Beds,2 Baths,"1,600 sqft",1977,2 Car Garage,"$269,900",Albuquerque
4,2804 Charleston St NE,4 Beds,3 Baths,"2,445 sqft",1963,2 Car Garage,"$450,000",Albuquerque
...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3 Beds,4 Baths,"1,200 sqft",1933,Open Parking,"$829,900",Washington
6242,1111 25th St NW #304,2 Beds,1 Bath,835 sqft,2005,1 Car Garage,"$549,900",Washington
6243,922 24th St NW #5B,1 Bed,1 Bath,743 sqft,1962,1 Car Garage,"$319,500",Washington
6244,1099 22nd St NW #401,3 Beds,3 Baths,"2,078 sqft",1985,2 Car Garage,"$1,375,000",Washington


###  Change Price Column

In [28]:
# Move the currency unit into the column header; rename() returns a new frame,
# so it is assigned back to df_combined.
df_combined = df_combined.rename({"Price": "Price($)"}, axis = 'columns')
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price($),Location
0,9400 La Playa St NE,4 Beds,3 Baths,"2,000 sqft",1975,2 Car Garage,"$330,000",Albuquerque
1,460 Wesley Ct SW,4 Beds,3 Baths,"2,212 sqft",2003,2 Car Garage,"$289,900",Albuquerque
2,1307 Bernardino Rd NW,2 Beds,1 Bath,815 sqft,1948,No Info,"$155,000",Albuquerque
3,5215 Cimarron Rd NW,3 Beds,2 Baths,"1,600 sqft",1977,2 Car Garage,"$269,900",Albuquerque
4,2804 Charleston St NE,4 Beds,3 Baths,"2,445 sqft",1963,2 Car Garage,"$450,000",Albuquerque
...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3 Beds,4 Baths,"1,200 sqft",1933,Open Parking,"$829,900",Washington
6242,1111 25th St NW #304,2 Beds,1 Bath,835 sqft,2005,1 Car Garage,"$549,900",Washington
6243,922 24th St NW #5B,1 Bed,1 Bath,743 sqft,1962,1 Car Garage,"$319,500",Washington
6244,1099 22nd St NW #401,3 Beds,3 Baths,"2,078 sqft",1985,2 Car Garage,"$1,375,000",Washington


###  Price Data : Cut '$' in Price

In [29]:
# Assign back to the 'Price($)' column only, leaving the rest of the dataframe untouched.
# str.strip('$') removes '$' characters from both ends of each string.
df_combined['Price($)'] = df_combined['Price($)'].str.strip('$')

In [30]:
# inspect the Price($) column after removing the '$' sign
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price($),Location
0,9400 La Playa St NE,4 Beds,3 Baths,"2,000 sqft",1975,2 Car Garage,330000,Albuquerque
1,460 Wesley Ct SW,4 Beds,3 Baths,"2,212 sqft",2003,2 Car Garage,289900,Albuquerque
2,1307 Bernardino Rd NW,2 Beds,1 Bath,815 sqft,1948,No Info,155000,Albuquerque
3,5215 Cimarron Rd NW,3 Beds,2 Baths,"1,600 sqft",1977,2 Car Garage,269900,Albuquerque
4,2804 Charleston St NE,4 Beds,3 Baths,"2,445 sqft",1963,2 Car Garage,450000,Albuquerque
...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3 Beds,4 Baths,"1,200 sqft",1933,Open Parking,829900,Washington
6242,1111 25th St NW #304,2 Beds,1 Bath,835 sqft,2005,1 Car Garage,549900,Washington
6243,922 24th St NW #5B,1 Bed,1 Bath,743 sqft,1962,1 Car Garage,319500,Washington
6244,1099 22nd St NW #401,3 Beds,3 Baths,"2,078 sqft",1985,2 Car Garage,1375000,Washington


###  Price Data : Cut ',' in Price

In [31]:
# The thousands separators sit inside the number (e.g. "330,000"), so str.strip --
# which only trims the ends of a string -- cannot remove them; str.replace is needed.
# regex = False treats ',' as a literal on every pandas version (the default value
# of `regex` changed from True to False in pandas 2.0).
df_combined['Price($)'] = df_combined['Price($)'].str.replace(',', '', regex = False)
df_combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price($),Location
0,9400 La Playa St NE,4 Beds,3 Baths,"2,000 sqft",1975,2 Car Garage,330000,Albuquerque
1,460 Wesley Ct SW,4 Beds,3 Baths,"2,212 sqft",2003,2 Car Garage,289900,Albuquerque
2,1307 Bernardino Rd NW,2 Beds,1 Bath,815 sqft,1948,No Info,155000,Albuquerque
3,5215 Cimarron Rd NW,3 Beds,2 Baths,"1,600 sqft",1977,2 Car Garage,269900,Albuquerque
4,2804 Charleston St NE,4 Beds,3 Baths,"2,445 sqft",1963,2 Car Garage,450000,Albuquerque
...,...,...,...,...,...,...,...,...
6241,2232 Mount View Pl SE,3 Beds,4 Baths,"1,200 sqft",1933,Open Parking,829900,Washington
6242,1111 25th St NW #304,2 Beds,1 Bath,835 sqft,2005,1 Car Garage,549900,Washington
6243,922 24th St NW #5B,1 Bed,1 Bath,743 sqft,1962,1 Car Garage,319500,Washington
6244,1099 22nd St NW #401,3 Beds,3 Baths,"2,078 sqft",1985,2 Car Garage,1375000,Washington


### Change Area Column

###  Area Data : Cut 'sqft' string element

###  Area Data : Cut ',' string element

### Bedrooms Data : Cut 'Beds' string element

###  Bedrooms Data: "Studio" = 1 room

### Bathrooms Data : Cut 'Baths' string element

### Parking Data : yes/no

###  Convert all numbers to integers

#### Bedrooms

#### Bathrooms

#### Area(Sqft)

#### Year Built

#### Price in $

###  New Column 'price/sqft'

### Check for outliers

###  Save in Excel