## Importar librerias a utilizar

In [None]:
import pandas as pd

## Lectura de los datos

In [85]:
# Lectura de sitios dentro de Louisiana e Indiana (Google)
sitios= pd.read_parquet("../../Datasets/final-ds/sitios_combined_full") #Obtenido del archivo ETL/business_Google.ipynb

# Reviews Indiana
indiana= pd.read_parquet("../../Datasets/merge/ReviewIndiana")  # Obtenido del archivo ETL/estados

# Reviews Louisiana
louisiana= pd.read_parquet("../../Datasets/merge/ReviewLouisiana") # Obtenido del archivo ETL/estados

- Realizamos la union de los sitios con los estados que correspondan

In [86]:
indiana_sitios = indiana.merge(sitios,how="inner",on="gmap_id")
louisiana_sitios = louisiana.merge(sitios,how="inner",on="gmap_id")

- Observamos la cantidad de registros que quedaron en cada estado

In [87]:
louisiana_sitios.shape, indiana_sitios.shape

((609940, 12), (1126351, 12))

- Chequeo de valores nulos

In [74]:
louisiana_sitios.isna().sum()

user_id                0
rating                 0
text              280034
gmap_id                0
date                   0
name                   0
address             1260
latitude               0
longitude              0
category             202
avg_rating             0
num_of_reviews         0
dtype: int64

In [75]:
indiana_sitios.isna().sum()

user_id                0
rating                 0
text              494064
gmap_id                0
date                   0
name                   0
address             2735
latitude               0
longitude              0
category             468
avg_rating             0
num_of_reviews         0
dtype: int64

- Pasamos de `datetime` a `date`

In [88]:
# Louisiana
louisiana_sitios["date"] = louisiana_sitios["date"].dt.strftime("%Y-%m-%d").astype("date64[pyarrow]")

#Indiana
indiana_sitios["date"] = indiana_sitios["date"].dt.strftime("%Y-%m-%d").astype("date64[pyarrow]")

- Rellenamos los valores nulos de la columna `category` y la transformamos en string. Esto nos servira para el Analisis Exploratorio de Datos

In [89]:
louisiana_sitios.fillna({"category":"[]"},inplace=True)
indiana_sitios.fillna({"category":"[]"},inplace=True)

In [91]:
# Transformamos a string
louisiana_sitios["category"] = louisiana_sitios["category"].apply(lambda x: " ".join(x) if len(x)>0 else "")

In [92]:
indiana_sitios["category"] = indiana_sitios["category"].apply(lambda x: " ".join(x) if len(x)>0 else "")

- Realizamos un <i>merge</i> de las tablas para luego utilizar en la construccion de un `dashboard`

In [93]:
# Asignamos una columna extra "state" para diferenciar de que estado estamos hablando
louisiana_sitios["state"] = "Louisiana"
indiana_sitios["state"] = "Indiana"

In [94]:
google_states_reviews = pd.concat([indiana_sitios,louisiana_sitios])

- Funcion que filtra por categoria

In [127]:
def contiene(x : str):
    x = x.lower()
    if "hotel" in x:
        return "Hotel"
    if "restaurant" in x:
        return "Restaurant"
    if "bakery" in x or "cafe" in x or "café" in x:
        return "Bakery-Café"
    if "shopping" in x or "mall" in x:
        return "Shopping Mall"
    else:
        return "Others"


In [130]:
google_states_reviews["main_category"] = google_states_reviews["category"].apply(contiene)

- Observamos la distribucion de la columna `main_category`

In [131]:
google_states_reviews["main_category"].value_counts()

main_category
Others           1206845
Restaurant        463587
Shopping Mall      41170
Bakery-Café        22600
Hotel               2089
Name: count, dtype: int64

- Filtramos las ciudades

In [137]:
cities_dict = {
    'Louisiana': [
        "Abbeville", "Abita Springs", "Addis", "Albany", "Alexandria", "Ama", "Amelia",
        "Amite City", "Anacoco", "Angie", "Arabi", "Arcadia", "Arnaudville", "Ashland",
        "Athens", "Atlanta", "Avondale", "Baker", "Baldwin", "Ball", "Banks Springs",
        "Barataria", "Basile", "Baskin", "Bastrop", "Baton Rouge", "Bawcomville", "Bayou Blue",
        "Bayou Cane", "Bayou Country Club", "Bayou Gauche", "Bayou Goula", "Bayou L'Ourse",
        "Bayou Vista", "Belcher", "Belle Chasse", "Belle Rose", "Belmont", "Benton", "Bernice",
        "Berwick", "Bienville", "Blanchard", "Bogalusa", "Bonita", "Boothville", "Bordelonville",
        "Bossier City", "Bourg", "Boutte", "Boyce", "Branch", "Breaux Bridge", "Bridge City",
        "Broussard", "Brownfields", "Brownsville", "Brusly", "Bryceland", "Bunkie", "Buras",
        "Cade", "Calhoun", "Calvin", "Cameron", "Campti", "Cankton", "Carencro", "Carlyss",
        "Castor", "Catahoula", "Cecilia", "Center Point", "Central", "Chackbay", "Chalmette",
        "Charenton", "Chataignier", "Chatham", "Chauvin", "Cheneyville", "Choctaw", "Choudrant",
        "Church Point", "Claiborne", "Clarence", "Clarks", "Clayton", "Clinton", "Colfax",
        "Collinston", "Columbia", "Convent", "Converse", "Cotton Valley", "Cottonport",
        "Coushatta", "Covington", "Creola", "Crescent", "Crowley", "Cullen", "Cut Off",
        "Delcambre", "Delhi", "Delta", "Denham Springs", "DeQuincy", "DeRidder", "Des Allemands",
        "Destrehan", "Deville", "Dixie Inn", "Dodson", "Donaldsonville", "Downsville", "Doyline",
        "Dry Prong", "Dubach", "Dubberly", "Dulac", "Duson", "East Hodge", "Eastwood",
        "Eden Isle", "Edgard", "Edgefield", "Egan", "Elizabeth", "Elmwood", "Elton", "Empire",
        "Epps", "Erath", "Eros", "Erwinville", "Estelle", "Estherwood", "Eunice", "Evergreen",
        "Farmerville", "Fenton", "Ferriday", "Fifth Ward", "Fisher", "Florien", "Folsom",
        "Fordoche", "Forest", "Forest Hill", "Fort Jesup", "Fort Polk North", "Fort Polk South",
        "Franklin", "Franklinton", "French Settlement", "Frierson", "Galliano", "Gardere",
        "Garyville", "Georgetown", "Gibsland", "Gilbert", "Gilliam", "Gillis", "Glencoe",
        "Glenmora", "Gloster", "Golden Meadow", "Goldonna", "Gonzales", "Grambling", "Gramercy",
        "Grand Cane", "Grand Coteau", "Grand Isle", "Grand Point", "Gray", "Grayson", "Greenwood",
        "Gretna", "Grosse Tete", "Gueydan", "Hackberry", "Hahnville", "Hall Summit", "Hammond",
        "Harahan", "Harrisonburg", "Harvey", "Haughton", "Hayes", "Haynesville", "Heflin",
        "Henderson", "Hessmer", "Hester", "Hodge", "Homer", "Hornbeck", "Hosston", "Houma",
        "Ida", "Independence", "Inniswold", "Iota", "Iowa", "Jackson", "Jamestown",
        "Jean Lafitte", "Jeanerette", "Jefferson", "Jena", "Jennings", "Jonesboro", "Jonesville",
        "Jordan Hill", "Joyce", "Junction City", "Kaplan", "Keachi", "Kenner", "Kentwood",
        "Kilbourne", "Killian", "Killona", "Kinder", "Kraemer", "Krotz Springs", "Labadieville",
        "Lacassine", "Lacombe", "Lafayette", "Lafitte", "Lafourche Crossing", "Lake Arthur",
        "Lake Charles", "Lake Providence", "Lakeshore", "Lakeview", "Laplace", "Larose",
        "Lawtell", "Lecompte", "Leesville", "Lemannville", "Leonville", "Lillie", "Lisbon",
        "Livingston", "Livonia", "Lockport", "Lockport Heights", "Logansport", "Longstreet",
        "Longville", "Loreauville", "Lucky", "Luling", "Lutcher", "Lydia", "Madisonville",
        "Mamou", "Mandeville", "Mangham", "Mansfield", "Mansura", "Many", "Maringouin",
        "Marion", "Marksville", "Marrero", "Martin", "Mathews", "Maurice", "McNary", "Melville",
        "Mer Rouge", "Meraux", "Mermentau", "Merrydale", "Merryville", "Metairie", "Midway",
        "Milton", "Minden", "Minorca", "Monroe", "Montegut", "Monterey", "Montgomery",
        "Monticello", "Montpelier", "Montz", "Moonshine", "Mooringsport", "Moreauville",
        "Morgan City", "Morganza", "Morse", "Moss Bluff", "Mound", "Mount Lebanon",
        "Napoleonville", "Natalbany", "Natchez", "Natchitoches", "Nueva Iberia", "New Llano",
        "Nueva Orleans", "New Roads", "New Sarpy", "Newellton", "Noble", "Norco", "North Hodge",
        "North Vacherie", "Norwood", "Oak Grove", "Oak Hills Place", "Oak Ridge", "Oakdale",
        "Oberlin", "Oil City", "Old Jefferson", "Olla", "Opelousas", "Oretta", "Ossun",
        "Paincourtville", "Palmetto", "Paradis", "Parks", "Patterson", "Paulina", "Pearl River",
        "Pierre Part", "Pine Prairie", "Pineville", "Pioneer", "Pitkin", "Plain Dealing",
        "Plaquemine", "Plaucheville", "Pleasant Hill", "Pleasure Bend", "Point Place",
        "Pointe a la Hache", "Pollock", "Ponchatoula", "Port Allen", "Port Barre", "Port Sulphur",
        "Port Vincent", "Powhatan", "Poydras", "Prairieville", "Presquille", "Prien", "Prospect",
        "Provencal", "Quitman", "Raceland", "Rayne", "Rayville", "Red Chute", "Reddell",
        "Reeves", "Reserve", "Richmond", "Richwood", "Ridgecrest", "Ringgold", "River Ridge",
        "Roanoke", "Robeline", "Rock Hill", "Rodessa", "Romeville", "Rosedale", "Roseland",
        "Rosepine", "Ruston", "Saline", "Sarepta", "Schriever", "Scott", "Shenandoah",
        "Shongaloo", "Shreveport", "Sibley", "Sicily Island", "Sikes", "Simmesport", "Simpson",
        "Simsboro", "Singer", "Siracusaville", "Slaughter", "Slidell", "Sorrel", "Sorrento",
        "South Mansfield", "South Vacherie", "Spearsville", "Spokane", "Springfield",
        "Springhill", "St. Francisville", "St. Gabriel", "St. James", "St. Joseph",
        "St. Martinville", "St. Maurice", "St. Rose", "Stanley", "Starks", "Start",
        "Sterlington", "Stonewall", "Sugartown", "Sulphur", "Sun", "Sunset", "Supreme",
        "Swartz", "Taft", "Tallulah", "Tangipahoa", "Terrytown", "Thibodaux", "Tickfaw",
        "Timberlane", "Triumph", "Tullos", "Turkey Creek", "Union", "Urania", "Vacherie",
        "Vidalia", "Ville Platte", "Vinton", "Vivian", "Waggaman", "Walnut Hill", "Warren",
        "Washington", "West Baton Rouge", "West Monroe", "Westwego", "White Castle",
        "Winnfield", "Woodworth", "Woodville", "Zachary", "Zachary", "Zwolle"
    ],
    'Indiana' : [
        "Aberdeen", "Advance", "Akron", "Álamo", "Albany", "Albion", "Alexandria",
        "Alfordsville", "Alton", "Altona", "Ambia", "Amboy", "Americus", "Amo",
        "Anderson", "Andrews", "Angola", "Arcadia", "Argos", "Arlington", "Ashley",
        "Atlanta", "Attica", "Auburn", "Aurora", "Austin", "Avilla", "Avoca", "Avon",
        "Bainbridge", "Bargersville", "Bass Lake", "Batesville", "Battle Ground",
        "Bedford", "Beech Grove", "Berne", "Bethany", "Beverly Shores", "Bicknell",
        "Birdseye", "Blanford", "Bloomfield", "Bloomingdale", "Bloomington", "Blountsville",
        "Bluffton", "Boonville", "Borden", "Boston", "Boswell", "Bourbon", "Brazil",
        "Bremen", "Bright", "Bristol", "Brook", "Brooklyn", "Brooksburg", "Brookston",
        "Brookville", "Brownsburg", "Brownstown", "Bruceville", "Bryant", "Buck Creek",
        "Buffalo", "Bunker Hill", "Burket", "Burlington", "Burnettsville", "Burns City",
        "Burns Harbor", "Butler", "Butlerville", "Cádiz", "Cambridge City", "Camden",
        "Campbellsburg", "Canaan", "Cannelburg", "Cannelton", "Carbon", "Carlisle",
        "Carmel", "Carthage", "Cayuga", "Cedar Grove", "Cedar Lake", "Center Point",
        "Centerville", "Chalmers", "Chandler", "Charlestown", "Chesterfield", "Chesterton",
        "Chrisney", "Churubusco", "Cicero", "Clarks Hill", "Clarksburg", "Clarksville",
        "Clay City", "Claypool", "Clayton", "Clear Lake", "Clermont", "Clifford", "Clinton",
        "Cloverdale", "Coalmont", "Coatesville", "Colburn", "Colfax", "Collegeville",
        "Columbia City", "Columbus", "Connersville", "Converse", "Cordry Sweetwater Lakes",
        "Corunna", "Corydon", "Country Club Heights", "Country Squire Lakes", "Covington",
        "Crandall", "Crane", "Crawfordsville", "Cromwell", "Crothersville", "Crown Point",
        "Crows Nest", "Culver", "Cumberland", "Cynthiana", "Dale", "Daleville", "Dana",
        "Danville", "Darlington", "Darmstadt", "Dayton", "De Motte", "Decatur", "Decker",
        "Delphi", "Denver", "Deputy", "Dillsboro", "Dover Hill", "Dresser", "Dublin",
        "Dubois", "Dugger", "Dune Acres", "Dunkirk", "Dunlap", "Dunreith", "Dupont",
        "Dyer", "Earl Park", "East Chicago", "East Enterprise", "East Germantown", "Eaton",
        "Economy", "Edgewood", "Edinburgh", "Edwardsport", "Elberfeld", "Elizabeth",
        "Elizabethtown", "Elkhart", "Ellettsville", "Elnora", "Elwood", "Emison", "English",
        "Etna Green", "Evansville", "Fairland", "Fairmount", "Fairview Park", "Farmersburg",
        "Farmland", "Ferdinand", "Fillmore", "Fish Lake", "Fishers", "Flora", "Florence",
        "Fontanet", "Fort Branch", "Fort Wayne", "Fortville", "Fountain City", "Fowler",
        "Fowlerton", "Francesville", "Francisco", "Frankfort", "Franklin", "Frankton",
        "Fredericksburg", "Freelandville", "Freetown", "Fremont", "French Lick", "Fulton",
        "Galena", "Galveston", "Garrett", "Gary", "Gas City", "Gaston", "Geneva",
        "Gentryville", "Georgetown", "Glenwood", "Goodland", "Goshen", "Gosport", "Grabill",
        "Grandview", "Granger", "Greencastle", "Greendale", "Greenfield", "Greens Fork",
        "Greensboro", "Greensburg", "Greentown", "Greenville", "Greenwood", "Griffin",
        "Griffith", "Grissom AFB", "Hagerstown", "Hamilton", "Hamlet", "Hammond", "Hanna",
        "Hanover", "Hardinsburg", "Harlan", "Harmony", "Harrodsburg", "Hartford City",
        "Hartsville", "Hatfield", "Haubstadt", "Hayden", "Hazleton", "Hebron", "Henryville",
        "Herbst", "Heritage Lake", "Hidden Valley", "Highland", "Hillsboro", "Hoagland",
        "Hobart", "Holland", "Holton", "Homecroft", "Hope", "Howe", "Hudson", "Hudson Lake",
        "Huntertown", "Huntingburg", "Huntington", "Hymera", "Idaville", "Indian Heights",
        "Indian Village", "Indianapolis", "Ingalls", "Jalapa", "Jamestown", "Jasonville",
        "Jasper", "Jeffersonville", "Jonesboro", "Jonesville", "Kempton", "Kendallville",
        "Kennard", "Kent", "Kentland", "Kewanna", "Kimmell", "Kingman", "Kingsbury",
        "Kingsford Heights", "Kirklin", "Knightstown", "Knightsville", "Knox", "Kokomo",
        "Koontz Lake", "Kouts", "La Crosse", "La Fontaine", "La Paz", "La Porte", "Laconia",
        "Ladoga", "Lafayette", "Lagrange", "Lagro", "Lake Dalecarlia", "Lake Holiday",
        "Lake Santee", "Lake Station", "Lake Village", "Lakes of the Four Seasons",
        "Laketon", "Lakeville", "Landess", "Lanesville", "Lapel", "Larwill", "Laurel",
        "Lawrence", "Lawrenceburg", "Leavenworth", "Lebanon", "Leesburg", "Leo-Cedarville",
        "Lewisville", "Liberty", "Ligonier", "Linden", "Linton", "Little York", "Livonia",
        "Lizton", "Logansport", "Long Beach", "Loogootee", "Losantville", "Lowell", "Lynn",
        "Lynnville", "Lyons", "Mackey", "Macy", "Madison", "Manilla", "Marengo", "Marion",
        "Markle", "Markleville", "Marshall", "Martinsville", "Matthews", "Mauckport",
        "McCordsville", "Mecca", "Medaryville", "Medora", "Mellott", "Melody Hill",
        "Memphis", "Mentone", "Meridian Hills", "Merom", "Merrillville", "Metamora", "Mexico",
        "Michiana Shores", "Michigan City", "Michigantown", "Middlebury", "Middletown",
        "Mier", "Milan", "Milford", "Millersburg", "Millhousen", "Milltown", "Milroy",
        "Milton", "Mishawaka", "Mitchell", "Modoc", "Monon", "Monroe", "Monroe City",
        "Monroeville", "Monrovia", "Monterey", "Montezuma", "Montgomery", "Monticello",
        "Montmorenci", "Montpelier", "Mooreland", "Moores Hill", "Mooresville",
        "Morgantown", "Morocco", "Morristown", "Mount Auburn", "Mount Ayr", "Mount Carmel",
        "Mount Etna", "Mount Summit", "Mount Vernon", "Mulberry", "Muncie", "Munster",
        "Napoleon", "Nappanee", "Nashville", "New Albany", "New Amsterdam", "New Carlisle",
        "New Castle", "New Chicago", "New Goshen", "New Harmony", "New Haven", "New Market",
        "New Middletown", "New Palestine", "New Paris", "New Pekin", "New Point", "New Richmond",
        "New Ross", "New Salisbury", "New Trenton", "New Washington", "New Whiteland",
        "Newberry", "Newburgh", "Newport", "Niles", "North Anderson", "North Judson",
        "North Liberty", "North Manchester", "North Salem", "North Vernon", "Northwood",
        "Norris", "Norway", "Nappanee", "Nashville", "Oaktown", "Oden", "Odell", "Odon",
        "Olin", "Onward", "Oolitic", "Orleans", "Ossian", "Otterbein", "Owen", "Parker",
        "Parke County", "Pekin", "Pendleton", "Peru", "Petersburg", "Philo", "Pike",
        "Plainfield", "Plymouth", "Poland", "Portage", "Porter", "Poseyville", "Princeton",
        "Rensselaer", "Richmond", "Rising Sun", "Rochester", "Rockville", "Rome City",
        "Rushville", "Sagamore", "Saint Bernice", "Saint John", "Saint Leon", "Saint Meinrad",
        "Saint Paul", "Saint Peter", "Saint Thomas", "Salem", "Schererville", "Schneider",
        "Scipio", "Shelbyville", "Sheridan", "Shirley", "Shoals", "Silverville", "Simmons",
        "Sittingbourne", "South Bend", "Southport", "South Whitley", "Southwood", "Speed",
        "Spencer", "Spooner", "Spring Grove", "Spring Hill", "Springville", "St. Joe",
        "St. Paul", "St. Peter", "St. Thomas", "Stevenson", "Stilesville", "Stone City",
        "Sullivan", "Sunman", "Switz City", "Tell City", "Terre Haute", "Thorntown", "Tipton",
        "Topeka", "Trafford", "Trenton", "Union City", "Uniondale", "Upland", "Vallonia",
        "Valparaiso", "Van Buren", "Vevay", "Vincennes", "Wabash", "Wakarusa", "Walton",
        "Warren", "Warsaw", "Washington", "Waterloo", "Waveland", "Wawaka", "West Baden Springs",
        "Westfield", "West Harrison", "West Lafayette", "Westville", "Whiteland", "Wickliffe",
        "Winamac", "Winchester", "Winfield", "Woodburn", "Zanesville", "Zionsville"
]

}

In [138]:
def find_city(address):
    if pd.isna(address):
        return None  # Retorna None si la dirección es NaN
    for state, cities in cities_dict.items():
        for city in cities:
            if city in address:
                return city
    return None

# Función para asignar la ciudad a la columna 'city'
def assign_city(df):
    df['city'] = df['address'].apply(find_city)
    return df

google_states_reviews = assign_city(google_states_reviews)

- Visualizamos el dataset

In [139]:
google_states_reviews.sample()

Unnamed: 0,user_id,rating,text,gmap_id,date,name,address,latitude,longitude,category,avg_rating,num_of_reviews,state,main_category,city
34669,1.129518e+20,2,"First and foremost, I will start this by sayin...",0x8626b74c5dd894ef:0x9e7bcc77baffcaed,2013-06-10,Phone Clinic,"Phone Clinic, 17278 Airline Hwy, Prairieville,...",30.324303,-90.979335,Electronics repair shop Cell phone accessory s...,4.1,93,Louisiana,Others,Prairieville


- Exportamos a parquet los archivos de los estados separados para luego analizarlos en eL <I>EDA</I>

In [None]:
indiana_sitios.to_parquet("../../Datasets/merge/indiana_sitios.parquet", engine="pyarrow")
louisiana_sitios.to_parquet("../../Datasets/merge/louisiana_sitios.parquet", engine="pyarrow")

- Exportamos los archivos unidos

In [142]:
# Exportamos
google_states_reviews.to_parquet("../../Datasets/merge/google_states_reviews")