# Descriptive analysis: Dashboard

### Import libraries

In [37]:
from utils import *

from time import strftime, gmtime
from datetime import datetime
import os
from hdfs import InsecureClient

import re
import os
from time import strftime, gmtime
from datetime import datetime
import shutil

import findspark
import warnings
from pyspark.sql.functions import split, col, avg, count
from pyspark.ml.feature import FeatureHasher

import pyspark as py
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

### Read login information

In [38]:
# IPython magic: runs utils.py inside the notebook namespace.
# Presumably this is where log_config and logging_creation are defined -- TODO confirm.
%run utils.py

# Object used for logging by later cells (log.info is called on it);
# "My.log" is the argument handed to log_config.
log = log_config("My.log")
# Mapping returned by logging_creation; only its "user_name" and "host"
# entries are read here.
logging_info = logging_creation()

# Both values are later passed to the HDFS client and to the download helper.
user_name = logging_info["user_name"]
host = logging_info["host"]

### Download data from formatted zone

In [39]:
# IPython magic: runs data_SelectorsAndFormatters.py inside the notebook namespace.
# Presumably this defines formatted_data_selector, which is called further
# down in this notebook -- TODO confirm.
%run data_SelectorsAndFormatters.py

# Name of the local folder used for the input files
# (presumably where the downloaded datasets end up -- TODO confirm).
inp_path = "model_temp"

# Catalogue of the datasets handled by this notebook, keyed by dataset name.
# Every entry has a human-readable "Description"; all entries except "hotels"
# also carry a list of "keywords".
master_files = dict(
    hotels=dict(
        Description="Information about hotels in neighbourhoods",
    ),
    renda_familiar=dict(
        keywords=["renda_familiar"],
        Description=(
            "Data from Open Barcelona with incomes of families. "
            "Can be joined with 'idealista' file using a lookup file"
        ),
    ),
    idealista=dict(
        keywords=["idealista"],
        Description=(
            "Appartments from idealista. "
            "Can be joined with 'renta familiar' file using a lookup file"
        ),
    ),
    lookup_renta_idealista=dict(
        keywords=["extended"],
        Description="Lookup datable to join 'Idealista' and 'renda familiar'",
    ),
)

In [None]:
# Fetch the last version of every dataset listed in master_files.
# The retrieval itself is delegated to formatted_data_selector, which
# receives the logger, the connection settings and the dataset name.
for key_file in master_files:
    formatted_data_selector(log, user_name, host, key_file)

### Load dataframes

In [108]:
# Start (or reuse) the local Spark session used by the rest of the notebook.
import os

import findspark

# Prefer the SPARK_HOME environment variable so the notebook also runs on
# machines other than the original author's; fall back to the original
# hard-coded installation path when the variable is unset or empty.
findspark.init(os.environ.get("SPARK_HOME") or "/Users/yyf/Documents/Spark")

# Kept after findspark.init, as in the original cell.
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Single local master; the application is named "Dashboard".
conf = SparkConf().setMaster("local").setAppName("Dashboard")
spark = SparkSession.builder.config(conf=conf).getOrCreate()

### Preprocessing

In [109]:
# Load every dataset found in the input directory into `dfs`, a dict of
# Spark DataFrames keyed by the dataset's short name.
inp_path = "model_temp"  # make sure to define inp_path
file_names = os.listdir(inp_path)
dfs = {}

for file in file_names:
    # Entry names are expected to contain at least two dashes; the short name
    # is everything after the second dash, without the ".parquet" marker.
    name_parts = file.split("-", maxsplit=2)
    if len(name_parts) < 3:
        # Entries that do not follow the naming pattern (e.g. hidden or
        # system files) used to abort the whole cell with an IndexError;
        # skip them and leave a trace in the log instead.
        log.info(f"Skipping '{file}' in '{inp_path}': unexpected file name")
        continue

    full_in_path = os.path.join(inp_path, file)
    short_name = name_parts[2].replace(".parquet", "")
    dfs[short_name] = spark.read.option("multiline", "true").format("parquet").load(full_in_path)

In [110]:
# Columns of interest for each dataset, keyed by the dataset's short name.
selected_fields = {
    "renda_familiar": [
        "Nom_Districte",
        "Codi_Districte",
        "Índex RFD Barcelona = 100",
    ],
    "idealista": [
        "district",
        "priceByArea",
        "price",
    ],
    "lookup_renta_idealista": [
        "district",
        "district_id",
    ],
}

In [111]:
from pyspark.sql.functions import col, split

# Discard rows that have a null in any of the columns of interest.
# Only datasets that were actually loaded into `dfs` are touched.
for dataset_name, required_columns in selected_fields.items():
    if dataset_name in dfs:
        dfs[dataset_name] = dfs[dataset_name].na.drop(subset=required_columns)

# Strip the surrounding double quotes from two 'renda_familiar' columns by
# splitting on '"' and keeping the element at index 1.
renda_df = dfs["renda_familiar"]
for quoted_column in ("Nom_Districte", "Índex RFD Barcelona = 100"):
    unquoted_value = split(col(quoted_column), '"')[1]
    renda_df = renda_df.withColumn(quoted_column, unquoted_value)
dfs["renda_familiar"] = renda_df

### Upload the preprocessed dataframe to the exploitation zone

In [119]:
from pyspark.sql.functions import col

# Build the dashboard table: idealista listings matched to the family-income
# data of the district with the same name (inner join, so listings without a
# matching district name are left out).
idealista_df = dfs["idealista"]
renda_df = dfs["renda_familiar"]
same_district = col("district") == col("Nom_Districte")

# (source column, output column) pairs, in output order.
dashboard_aliases = (
    ("district", "district_idealista"),
    ("priceByArea", "priceByArea_idealista"),
    ("price", "price_idealista"),
    ("Nom_Districte", "Nom_Districte_renda"),
    ("Codi_Districte", "Codi_Districte_renda"),
    ("Índex RFD Barcelona = 100", "Index_RFD_renda"),
)
dashboard_columns = [
    col(source_name).alias(output_name)
    for source_name, output_name in dashboard_aliases
]

joined_df = idealista_df.join(renda_df, same_district, "inner")
dfs["dashboard"] = joined_df.select(*dashboard_columns)

In [120]:
# Name of the parquet output; used for the local write and as the last
# component of the HDFS destination path.
model_name = "dashboard.parquet"

# HDFS directory that receives the upload. Note there is no leading slash;
# how the client resolves such a path is not visible here -- TODO confirm.
explotation_zone_path = "user/bdm/Model_explotation_zone_dashboard" 

In [121]:
# Write the dashboard table to a local parquet dataset and upload it to HDFS.
hdfs_client = InsecureClient(host, user=user_name)
out_full_path = f"{explotation_zone_path}/{model_name}"

# Overwrite any previous local export so the cell can be re-run; without an
# explicit save mode Spark refuses to write to a path that already exists.
dfs["dashboard"].write.mode("overwrite").parquet(model_name)

# The remote copy is replaced as well (overwrite=True).
hdfs_client.upload(out_full_path, model_name, overwrite=True)

# Report the model that was actually uploaded, not a leftover loop variable
# from the cell that loaded the input files.
log.info(f"Model {model_name} uploaded correctly at '{out_full_path}' path")

### Export to CSV to build the dashboard in Tableau

In [None]:
import os

output_path = "csv"

# DataFrames to export and the columns kept for each one, listed in the
# order in which the CSV outputs are written.
csv_exports = (
    ("renda_familiar", ("Nom_Districte", "Codi_Districte", "Índex RFD Barcelona = 100")),
    ("idealista", ("district", "priceByArea", "price")),
    ("lookup_renta_idealista", ("district", "district_id")),
)

# Each export goes to "<output_path>/<dataset name>", with a header row and
# replacing any previous output at that location.
for dataset_name, column_names in csv_exports:
    target_dir = os.path.join(output_path, dataset_name)
    csv_writer = dfs[dataset_name].select(*column_names).write
    csv_writer.option("header", True).mode("overwrite").csv(target_dir)