# Data Retrieval

## Configuration

In [None]:
# Libraries

import pandas as pd

## Global

In [None]:
# Constants

from constants import *

## Retrieval

In [None]:
# Functions

def analyse(data):
	"""Build a per-column summary of a DataFrame.

	Args:
		data: DataFrame to inspect.

	Returns:
		A DataFrame indexed by the columns of ``data`` with three columns:
		"NA" (number of missing values), "Uniques" (number of distinct
		non-null values) and "types" (dtype of the column).
	"""
	summary = {
		"NA": data.isna().sum(),
		"Uniques": data.nunique(),
		"types": data.dtypes,
	}
	return pd.DataFrame(summary)

def clean(data):
	"""Clean one raw chunk and keep only the rows of interest.

	Steps, in order:
	1. Missing values in DATA_ADRESSE_SUFFIXE_COLUMN are replaced by "".
	2. Rows with a missing value in any of DATA_IMPORTANT_COLUMNS are dropped.
	3. DATA_DATE_COLUMN is parsed to datetime (year-first) and its year and
	   month are stored in DATA_YEAR_COLUMN and DATA_MONTH_COLUMN.
	4. Only rows whose DATA_NATURE_MUTATION_COLUMN equals
	   DATA_NATURE_MUTATION_VENTE_VALUE and whose DATA_TYPE_LOCAL_COLUMN is in
	   DATA_TYPE_LOCAL_VALUE are kept.

	Args:
		data: DataFrame holding the raw columns named above. It is not
			modified; all work happens on copies.

	Returns:
		A new DataFrame with the cleaned and filtered rows.
	"""
	# Data Cleaning
	# Copy first so the caller's frame is never written to.
	data = data.copy()
	data[DATA_ADRESSE_SUFFIXE_COLUMN] = data[DATA_ADRESSE_SUFFIXE_COLUMN].fillna("")
	# The result of dropna() can be tracked by pandas as a slice of its parent;
	# the explicit copy keeps the column assignments below free of
	# SettingWithCopyWarning.
	data = data.dropna(subset=DATA_IMPORTANT_COLUMNS).copy()

	# Data Transformation
	# yearfirst is a boolean flag (the string "True" only worked by truthiness).
	dates = pd.to_datetime(data[DATA_DATE_COLUMN], yearfirst=True)
	data[DATA_DATE_COLUMN] = dates
	data[DATA_YEAR_COLUMN] = dates.dt.year
	data[DATA_MONTH_COLUMN] = dates.dt.month

	# Data Selection
	is_sale = data[DATA_NATURE_MUTATION_COLUMN] == DATA_NATURE_MUTATION_VENTE_VALUE
	has_kept_type = data[DATA_TYPE_LOCAL_COLUMN].isin(DATA_TYPE_LOCAL_VALUE)

	return data[is_sale & has_kept_type]

def sample(data, class_columns=None, class_size=None, random_state=42):
	"""Draw at most ``class_size`` rows from every class of ``data``.

	A class is one distinct combination of the values in ``class_columns``
	(values are compared through their string form, joined with "_").
	Classes with fewer than ``class_size`` rows are returned in full.

	Args:
		data: DataFrame to sample from. It is not modified.
		class_columns: List of columns defining a class. Defaults to
			DATA_CLASS_COLUMNS.
		class_size: Maximum number of rows kept per class. Defaults to
			DATA_CLASS_SIZE.
		random_state: Seed passed to ``DataFrame.sample`` for every class.

	Returns:
		A DataFrame with the same columns as ``data`` holding the sampled rows,
		grouped class by class.
	"""
	if class_columns is None:
		class_columns = DATA_CLASS_COLUMNS
	if class_size is None:
		class_size = DATA_CLASS_SIZE

	# The class key lives in its own Series instead of a temporary column:
	# the caller's frame is left untouched and nothing has to be dropped from
	# the result, whether or not groupby.apply hands grouping columns to the
	# function.
	class_keys = data[class_columns].astype(str).agg("_".join, axis=1)

	return data.groupby(class_keys, group_keys=False).apply(
		lambda rows: rows.sample(min(len(rows), class_size), random_state=random_state)
	)

def merge(data_chunks, clean_func, columns, tolerance=5):
	"""Accumulate cleaned, class-sampled rows from an iterable of chunks.

	Every chunk is cleaned with ``clean_func``, put in front of the rows kept
	so far, and the whole is re-sampled with ``sample``. Reading stops when the
	chunks are exhausted, when at least DATA_SIZE rows are held (the result is
	not truncated to DATA_SIZE), or when the ``tolerance`` budget is used up.

	Args:
		data_chunks: Iterable of DataFrames (e.g. a chunked CSV reader).
		clean_func: Callable taking one chunk and returning the cleaned chunk.
		columns: Columns of the empty DataFrame used when ``data_chunks``
			yields nothing.
		tolerance: Number of chunks allowed to leave the row count unchanged
			before giving up.

	Returns:
		A DataFrame without DATA_NATURE_MUTATION_COLUMN, sorted by
		DATA_DATE_COLUMN, with a fresh 0..n-1 index.
	"""
	# No empty seed frame: concatenating with an empty, column-only DataFrame
	# is deprecated in recent pandas and can degrade the resulting dtypes.
	data = None
	size = 0

	for data_chunk in data_chunks:
		data_chunk = clean_func(data_chunk)

		if data is None or data.empty:
			data = data_chunk
		elif not data_chunk.empty:
			data = pd.concat([data_chunk, data])
		data = sample(data)

		if data.shape[0] >= DATA_SIZE:
			break
		if data.shape[0] == size:
			# This chunk did not change the row count: spend one unit of tolerance.
			tolerance -= 1
			if tolerance <= 0:
				break

		size = data.shape[0]

	if data is None:
		# No chunk at all: fall back to an empty frame with the expected columns.
		data = pd.DataFrame(columns=columns)

	data = data.drop([DATA_NATURE_MUTATION_COLUMN], axis=1)
	data = data.sort_values(DATA_DATE_COLUMN).reset_index(drop=True)

	return data

#### Real Estate Data

In [None]:
# Parameters

# Columns to load from the raw file (passed as `usecols` to read_csv below)
columns = DATA_COLUMNS

In [None]:
# Retrieval

# With chunksize set, read_csv returns a reader that keeps the file open; the
# context manager closes it even when merge() stops before the last chunk.
with pd.read_csv(DATA_FILE_PATH, sep=DATA_SEP_CHAR, chunksize=DATA_CHUNK_SIZE, usecols=columns) as data_chunks:
	# Pass the same `columns` used for loading so both stay in sync.
	data = merge(data_chunks, clean, columns)

In [None]:
# Analysis

# Per-column summary of the merged data: missing values, distinct values, dtypes
analyse(data)

In [None]:
# Visualization

# Bare expression: shown as the cell output when run in a notebook
data

In [None]:
# Rows per class: non-null DATA_CODE_COMMUNE_COLUMN values counted for each combination of DATA_CLASS_COLUMNS
data.groupby(DATA_CLASS_COLUMNS)[DATA_CODE_COMMUNE_COLUMN].count()

In [None]:
# Save

# Write only the DATA_MAIN_CLASSES columns to DATA_CLEAN_FILE_PATH, without the index
data[DATA_MAIN_CLASSES].to_csv(DATA_CLEAN_FILE_PATH, index=False)