### Import Libraries 


 - The os module allows us to interact with operating systems including changing working directory
 - The pandas module allows us to read, clean and preprocess the output CSV files that were web scraped
 - The glob module allows us to search file paths for files that match a specific pattern; in this example we look for the `.csv` pattern
 - The bs4 module allows us to pull data out of HTML documents
 

In [27]:
import glob as gb
import pandas as pd 
import os
from bs4 import BeautifulSoup

### Set Path
 - Set your path to the folder where the CSV outputs of your Zillow web scraping were exported, using the `os.chdir` method

In [3]:
# Folder that holds the raw CSV exports to be combined.
path = "C:\\Users\\padu\\Desktop\\Zillow\\Final\\InitialCraiglist"
# Make that folder the current working directory.
os.chdir(path)

### Get a list of all CSV outputs 
- Use the `glob` method to generate a list of all the output CSV files in your directory


In [4]:
# Collect every CSV export in the input folder.
# glob returns matches in arbitrary (OS-dependent) order, so the list is sorted
# to keep the concatenation order, and therefore the row order of the combined
# frame, reproducible between runs. os.path.join builds the pattern with the
# platform's own separator instead of mixing "\\" and "/".
All = sorted(gb.glob(os.path.join(path, "*.csv")))


### Concatenate CSV
- Combine all files into a single data frame by concatenating them. To do this:
    - loop through the list of CSV files in your path and read each one with the pandas.read_csv method. This lazily produces one data frame per file
    - afterward, use the pd.concat method to concatenate all your CSV files

In [5]:
# Read each CSV export lazily and stack them into a single frame;
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
Craiglist = pd.concat(map(pd.read_csv, All), ignore_index=True)

In [6]:
# Show the (rows, columns) shape of the combined frame.
Craiglist.shape

(217273, 13)

In [7]:
# Number of rows in the combined frame.
len(Craiglist)

217273

In [7]:
# List the column names of the combined frame.
Craiglist.columns

Index(['Post_datetime', 'Post Price', 'Post hood', 'Post title', 'PostURL',
       'address', 'overview', 'bedroom_baths', 'Title', 'Price', 'Other',
       'lat', 'lon'],
      dtype='object')

In [8]:
# Display the combined frame.
Craiglist

Unnamed: 0,Post_datetime,Post Price,Post hood,Post title,PostURL,address,overview,bedroom_baths,Title,Price,Other,lat,lon
0,2021-03-15 13:47,$829\nimage 1 of 7<>,,"<a class=""result-title hdrlnk"" data-id=""729170...",https://charlotte.craigslist.org/apa/d/charlot...,8061 Woodscape Dr.,"<section id=""postingbody"">\n<div class=""print-...",1BR / 1Ba,Great and spacious 1 bedroom 1 bath apartment!...,$829,"<p class=""attrgroup"">\n<span class=""shared-lin...",,
1,2021-03-15 13:47,"$1,152\nimage 1 of 8<>",,"<a class=""result-title hdrlnk"" data-id=""729178...",https://charlotte.craigslist.org/apa/d/concord...,7850 Waterway Drive NW,"<section id=""postingbody"">\n<div class=""print-...",1BR / 1Ba,"Large Walk-in Closets, BBQ/Picnic Area, Dog Park","$1,152","<p class=""attrgroup"">\n<span class=""shared-lin...",,
2,2021-03-15 13:45,"$1,284\nimage 1 of 9<>",,"<a class=""result-title hdrlnk"" data-id=""729177...",https://charlotte.craigslist.org/apa/d/charlot...,11505 Masterton Road,"<section id=""postingbody"">\n<div class=""print-...",2BR / 2Ba,"Playground, Tile Backsplash, Spa","$1,284","<p class=""attrgroup"">\n<span class=""shared-lin...",,
3,2021-03-15 13:45,"$1,364\nimage 1 of 10<>","<span class=""result-hood""> (Charlotte)</span>","<a class=""result-title hdrlnk"" data-id=""729178...",https://charlotte.craigslist.org/apa/d/charlot...,730 Hawthorne Ln,"<section id=""postingbody"">\n<div class=""print-...",1BR / 1Ba,"Key fob access, Uptown Charlotte views, 80 Wal...","$1,364","<p class=""attrgroup"">\n<span class=""shared-lin...",,
4,2021-03-15 13:43,"$1,295\nimage 1 of 8<>",,"<a class=""result-title hdrlnk"" data-id=""729171...",https://charlotte.craigslist.org/apa/d/charlot...,1825 Carrington Oaks Drive,"<section id=""postingbody"">\n<div class=""print-...",2BR / 2Ba,"Ceiling Fans and Track Lighting, 9' Ceilings, ...","$1,295","<p class=""attrgroup"">\n<span class=""shared-lin...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
217268,2022-01-26 10:15,"$1,795","<span class=""result-hood""> (Indian Trail )</s...",Love Where You Live!,https://charlotte.craigslist.org/apa/d/matthew...,321 E. Park Road,"<section id=""postingbody"">\n<div class=""print-...",3BR / 2Ba,Love Where You Live!,"$1,795","<p class=""attrgroup"">\n<span class=""shared-lin...",35.077413,-80.672131
217269,2022-01-26 10:07,"$2,475","<span class=""result-hood""> (4928 Old Pineville...","Brand New -Dual Sinks, Spacious Closest, Prep ...",https://charlotte.craigslist.org/apa/d/charlot...,4928 Old Pineville Rd,"<section id=""postingbody"">\n<div class=""print-...",3BR / 3.5Ba,"Brand New -Dual Sinks, Spacious Closest, Prep ...","$2,475","<p class=""attrgroup"">\n<span class=""shared-lin...",35.171400,-80.908400
217270,2022-01-26 10:06,"$1,670","<span class=""result-hood""> (517 Pink Moon Driv...",Largest Two Bedrooms in Rock Hill!!,https://charlotte.craigslist.org/apa/d/fort-mi...,517 Pink Moon Drive,"<section id=""postingbody"">\n<div class=""print-...",2BR / 2Ba,Largest Two Bedrooms in Rock Hill!!,"$1,670","<p class=""attrgroup"">\n<span class=""shared-lin...",34.973637,-80.978508
217271,2022-01-26 10:05,"$1,604","<span class=""result-hood""> (Plaza Midwood )</...",Spacious One Bedroom With Courtyard Views in P...,https://charlotte.craigslist.org/apa/d/charlot...,808 Hawthorne Ln,"<section id=""postingbody"">\n<div class=""print-...",1BR / 1Ba,Spacious One Bedroom With Courtyard Views in P...,"$1,604","<p class=""attrgroup"">\n<span class=""shared-lin...",35.219935,-80.818782


### Text Cleaning

   - In this step we undertake initial cleaning of the description text associated with each rental listing:
   
   - Because the description text is in html format, we use the html parser from beautifulsoup and the get_text() method to get the text
   
   - After extracting the text, we strip off extra blank lines and replace all standard contact-information boilerplate with blank text

In [10]:
# Plain-text description for every listing, in the same order as Craiglist.
results = []

# Boilerplate phrases to delete from each description. Order matters: the
# longer "Call:  show contact info" phrase must be removed BEFORE the bare
# "show contact info" phrase, otherwise the bare replacement eats the tail of
# the longer one and leaves a dangling "Call:" in the text. The truncated
# 'Call:  show contact inf' form from the original cleaning is kept as well.
boilerplate = (
    'QR Code Link to This Post',
    'Call:  show contact info',
    'Call:  show contact inf',
    'show contact info',
)

for row in Craiglist['overview']:

    # The description is stored as HTML; get_text() drops the markup.
    text = BeautifulSoup(row, 'html.parser').get_text().strip()

    for phrase in boilerplate:
        text = text.replace(phrase, '')

    # Removing boilerplate can expose new leading/trailing blank lines.
    results.append(text.strip())

In [13]:
# Convert each listing's 'Other' HTML fragment to plain text and trim the
# surrounding whitespace; one entry per row of Craiglist, in the same order.
otherinformation = [
    BeautifulSoup(markup, 'html.parser').get_text().strip()
    for markup in Craiglist['Other']
]

- Not all postings include the neighborhood information
- For those with neighborhood information, we strip the '()' from the neighborhood name; those that are null are kept null
    

In [14]:
# Cleaned neighborhood name for every listing, in the same order as Craiglist.
hood = []

for row in Craiglist['Post hood']:

    # Not every posting has neighborhood information. Missing values are not
    # strings (presumably NaN from read_csv), so they are passed through
    # unchanged. This replaces the earlier bare `except:`, which hid every
    # kind of error, not just the missing-value case.
    if not isinstance(row, str):
        hood.append(row)
        continue

    # The value looks like '<span class="result-hood"> (Plaza Midwood )</span>'.
    # Strip the markup, then trim spaces and parentheses from both ends in one
    # pass so that no trailing space is left behind before the ')'.
    hood.append(BeautifulSoup(row, 'html.parser').get_text().strip(' ()'))

In [26]:
# Preview the per-listing attribute lists (sqft is built in the In [25] cell below).
sqft

[['1BR / 1Ba', '650ft2', 'available mar 20'],
 ['1BR / 1Ba', '890ft2', 'available now'],
 ['2BR / 2Ba', '1242ft2'],
 ['1BR / 1Ba', '677ft2', 'available now'],
 ['2BR / 2Ba', '1177ft2'],
 ['2BR / 2Ba', '1242ft2'],
 ['2BR / 2Ba', '1309ft2'],
 ['2BR / 2Ba', '1242ft2'],
 ['1BR / 1Ba', '680ft2', 'available apr 23'],
 ['2BR / 2Ba', '1242ft2'],
 ['1BR / 1Ba', '685ft2', 'available now'],
 ['1BR / 1Ba', '850ft2'],
 ['1BR / 1Ba', '685ft2', 'available apr 17'],
 ['2BR / 2Ba', '934ft2', 'available may 7'],
 ['3BR / 2Ba', '1381ft2'],
 ['1BR / 1Ba', '664ft2', 'available jun 25'],
 ['3BR / 2Ba', '1600ft2', 'available now'],
 ['2BR / 2Ba', '1005ft2', 'available now'],
 ['2BR / 2Ba', '1309ft2'],
 ['1BR / 1Ba', '745ft2', 'available apr 15'],
 ['1BR / 1Ba', '680ft2', 'available apr 23'],
 ['0BR / 1Ba', '590ft2', 'available now'],
 ['1BR / 1Ba', '850ft2'],
 ['2BR / 1Ba', '1026ft2', 'available now'],
 ['1BR / 1Ba', '639ft2'],
 ['2BR / 2Ba', '1242ft2'],
 ['1BR / 1Ba', '704ft2', 'available now'],
 ['2BR / 2B

In [25]:
# The 'other information' text of each listing holds its attributes, one per
# line, for example ['1BR / 1Ba', '650ft2', 'available mar 20']:
# bedrooms/bathrooms, floor size and availability date.
# The attributes are separated by '\n', so we split on '\n' to get one list of
# attributes per listing.

sqft = [row.split("\n") for row in otherinformation]


# The floor size is the attribute that ends in 'ft2'. We select it by that
# suffix instead of by position, because position is unreliable:
#   ['1BR / 1Ba', '650ft2', 'available mar 20']  -> '650ft2'
#   ['2BR / 2Ba', '1242ft2']                     -> '1242ft2'
#   ['1BR / 1Ba', 'available now']               -> no floor size, use '0'
# Taking the second element blindly would store 'available now' as a floor
# size in the last case, and would lose the floor size of any listing with
# more than three attributes.
# Listings without a floor size get the string '0'.

floorsize = []

for row in sqft:
    size_item = next(
        (item for item in row if item.strip().endswith("ft2")),
        None,
    )
    floorsize.append(size_item if size_item is not None else str(0))

# Lastly, drop the 'ft2' unit so only the number remains (still as a string).
floorsizes = [text.replace("ft2", "").strip() for text in floorsize]


In [15]:
# Verify the length of the data: each cleaned list must have exactly one entry
# per row of the data frame, so all four numbers should be equal.

len(Craiglist), len(results), len(otherinformation),len(floorsizes)

(201023, 201023, 201023, 201023)

### Add the Clean data to the original data frame 

In [16]:


# Attach each cleaned list to the frame as a new column, in this order.
for column_name, values in (
    ('OverviewClean', results),
    ('Otherinformation', otherinformation),
    ('Neighborood', hood),
    ('floorsize', floorsizes),
):
    Craiglist[column_name] = values

In [17]:
# Confirm the four new columns were added.
Craiglist.columns

Index(['Post_datetime', 'Post Price', 'Post hood', 'Post title', 'PostURL',
       'address', 'overview', 'bedroom_baths', 'Title', 'Price', 'Other',
       'lat', 'lon', 'OverviewClean', 'Otherinformation', 'Neighborood',
       'floorsize'],
      dtype='object')

### Delete old columns in the dataframe 

In [18]:
# Columns to remove from the frame: the raw scraped fields plus the intermediate 'Otherinformation' text.
columns = [ 'Post_datetime','Post Price', 'Post hood', 'Post title', 'overview','Otherinformation','Other']

In [19]:
# Remove the listed columns; the result replaces the original frame.
Craiglist = Craiglist.drop(columns=columns)

### Check for duplicates
- Because we are web scraping daily, there are multiple duplicates in our data; we check for those using the `duplicated` method and the `sum` method
- We can identify the number of unique listings by subtracting the duplicated listings from all listings

In [20]:
# Count the rows that are exact repeats of an earlier row.
Craiglist.duplicated().sum()

156399

In [21]:
# Number of unique listings: rows that are NOT flagged as duplicates.
(~Craiglist.duplicated()).sum()

44624

### Generate Unique Listings
- Create a variable that stores the unique listings 

In [22]:
# Keep one copy of each fully identical row.
FinalCragilist = Craiglist.drop_duplicates()


In [25]:
# Number of unique listings kept.
len(FinalCragilist)

44624

In [None]:
# De-duplicate on the 'address' column only, keeping one row per distinct street address.
StreetAddress =  Craiglist.drop_duplicates(subset = "address")

In [None]:
# Number of distinct street addresses.
len(StreetAddress)

### Change directory for  Output file 
 -  Change the output directory for your final CSV
 -  Since the cleaning, including concatenating, will be done multiple times for any additional web scraping, you have to export the final clean file to a different folder

In [23]:
# Folder that receives the final cleaned CSV (separate from the input folder).
path = "C:\\Users\\padu\\Desktop\\Zillow\\Final\\FinalCraiglist"
# Switch the working directory so relative output paths resolve inside it.
os.chdir(path)

### Export the final output as a CSV file

In [32]:
# Recompute the de-duplicated listings from Craiglist right before exporting.
FinalCragilist = Craiglist.drop_duplicates()

In [35]:
# Write the unique listings to FinalCraiglistDate.csv (relative path, so it lands in the current working directory); index=False omits the row index.
FinalCragilist.to_csv("FinalCraiglistDate.csv", index=False)