# Removal Step

In [13]:
import pandas as pd

leases_df = pd.read_csv("../data/Leases.csv")
orig_leases_df = pd.read_csv("../data/Leases.csv")

# Drop na rows for company name
# leases_df = pd.DataFrame(leases_df, columns=["company_name"]).dropna()

# Omit rows with na company_name
# leases_df = leases_df[leases_df["company_name"].notna()]

# Remove columns with all na entries
remove_features = [
    "direct_availability_proportion",
    "direct_internal_class_rent",
    "direct_overall_rent",
    "sublet_available_space",
    "sublet_availability_proportion",
    "sublet_internal_class_rent",
    "sublet_overall_rent",
    "company_name",
    "internal_industry",
    "quarter",
    "transaction_type",
    "market"
]
for feature in remove_features:
    leases_df = leases_df[leases_df[feature].notna()]


leases_by_rba = leases_df["RBA"]
leases_by_rba.head()

leases_by_rba.to_csv("../data/leases_by_rba.csv")

# Drop RBA and space_type
leases_df = leases_df.drop(columns=[
    "RBA", "space_type", "leasing", "monthsigned", "building_name", "building_id", "region", "city"
], axis=1)

leases_df.to_csv("../data/cleaned_leases.csv", index=False)

In [14]:
# Get Q3 rows
# leases_df = leases_df[leases_df["quarter"] == "Q3"]

# Get only construction industries
# leases_df = leases_df[leases_df["internal_industry"].str.contains("Construction", na=False)]

# Get Texas only
# leases_df = leases_df[leases_df["state"] == 'TX']

# Indicator variables for market
market_variables = leases_df["market"].unique()
for var in market_variables:
    leases_df[var] = (leases_df["market"] == var).astype(int)

# Indicator variables fro construction industry
leases_df["is_construction"] = leases_df["internal_industry"].str.contains("Construction").astype(int)
leases_df.to_csv("../data/cleaned_leases.csv", index=False)

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
leases_df = pd.read_csv("../data/cleaned_leases.csv")

# Select features and target variable
features = leases_df.select_dtypes(include=['number']).columns.tolist()
# Remove target if it is in the features
TARGET = "is_construction"
if TARGET in features:
    features.remove(TARGET)
target = leases_df[TARGET]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(leases_df[features], target, test_size=0.2, random_state=42)

# Create a Random Forest Regressor model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model to the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Feature Importance
feature_importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance_df)

Mean Squared Error: 0.05919095580678314

Feature Importance:
                           Feature  Importance
2                         leasedSF    0.262631
3                         costarID    0.221372
1                              zip    0.149757
14      sublet_internal_class_rent    0.035307
5          availability_proportion    0.033218
15             sublet_overall_rent    0.033031
13  sublet_availability_proportion    0.030878
10      direct_internal_class_rent    0.029693
9   direct_availability_proportion    0.029396
12          sublet_available_space    0.029285
6              internal_class_rent    0.027226
8           direct_available_space    0.026782
11             direct_overall_rent    0.023836
4                  available_space    0.022331
7                     overall_rent    0.022251
0                             year    0.011271
23                         Houston    0.001888
32                       San Diego    0.001234
16                         Atlanta    0.001198

# View Data

In [145]:
from IPython.display import display, HTML

# Convert DataFrame to HTML and display
# display(HTML(leases_df.to_html()))

In [None]:
import pandas as pd
import plotly.express as px

# Load the dataset
leases_df = pd.read_csv("../data/cleaned_leases.csv")

# Filter for Texas and construction companies
texas_construction = leases_df[(leases_df['state'] == 'TX') & (leases_df['is_construction'] == 1)]

# Create a scatter plot of addresses
fig = px.choropleth(texas_construction,
                     locations='state',
                     locationmode="USA-states",
                     text='address',
                     title='Construction Company Addresses in Texas',
                     scope="usa",
                     color="zip")

fig.update_geos(fitbounds="locations", visible=False)
fig.show()

In [40]:
import pandas as pd
import plotly.express as px
import geopandas as gpd

df = pd.DataFrame({
    'city': ['Houston', 'Dallas', 'San Antonio', 'Austin', 'Fort Worth'],
    'population': [2325502, 1343573, 1547253, 964254, 893446],
    'latitude': [29.7604, 32.7767, 29.4241, 30.2672, 32.7552],
    'longitude': [-95.3698, -96.7970, -98.4936, -97.7431, -97.3308]
})

# Load the Texas shapefile
texas = gpd.read_file('Texas_State_Boundary/State_Boundary.shp')

# Create the base map of Texas
fig = px.choropleth_mapbox(
    texas,
    geojson=texas.geometry,
    locations=texas.index,
    color='value',  # You can use a column from the shapefile if needed
    color_continuous_scale='Viridis_r',
    mapbox_style='carto-positron',
    zoom=5,
    center={'lat': 31.9686, 'lon': -99.9018},
    opacity=0.5,
    labels={'value': 'Value'}
)

# Add the city markers
fig.add_scattermapbox(
    lat=df['latitude'],
    lon=df['longitude'],
    mode='markers',
    marker=dict(size=10, color='red'),
    text=df['city'] + ': ' + df['population'].astype(str),
    hoverinfo='text'
)

# Update layout
fig.update_layout(
    title_text='Population of Major Cities in Texas',
    margin={'r': 0, 't': 50, 'l': 0, 'b': 0},
    mapbox=dict(
        style='carto-positron',
        center=dict(lat=31.9686, lon=-99.9018),
        zoom=5
    )
)

# Show the map
fig.show()


*choropleth_mapbox* is deprecated! Use *choropleth_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



ValueError: Value of 'color' is not the name of a column in 'data_frame'. Expected one of ['FIPS', 'CREATE_USE', 'CMNT', 'geometry'] but received: value