In [1]:
import time
import numpy as np

import pandas as pd
import geopandas as gpd

import seaborn as sns
import matplotlib.pyplot as plt

from shapely import wkt

# Lift the column cap so wide frames are displayed with every column visible.
pd.options.display.max_columns = None

In [None]:
# Placeholder cell retired: it called pd.read_csv('') with an empty path, which always
# raises and would stop a Restart & Run All. The transactions are loaded in the next cell.

In [33]:
# Raw inputs: the transactions table (CSV) and the land-area layer (GeoJSON).
transactions_csv = './data/raw/dubai_re_transactions.csv'
land_areas_geojson = './data/raw/dubai_land_areas.geojson'

df = pd.read_csv(transactions_csv)
area = gpd.read_file(land_areas_geojson)

In [37]:
# Summarise the frame: per-column dtype and non-null count, plus row/column totals.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1397742 entries, 0 to 1397741
Data columns (total 27 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   transaction_id        1397742 non-null  object        
 1   procedure_id          1397742 non-null  int64         
 2   trans_group_id        1397742 non-null  int64         
 3   trans_group           1397742 non-null  object        
 4   procedure_name        1397742 non-null  object        
 5   instance_date         1397738 non-null  datetime64[ns]
 6   property_type_id      1397742 non-null  int64         
 7   property_type         1397742 non-null  object        
 8   property_sub_type_id  1099616 non-null  float64       
 9   property_sub_type     1099616 non-null  object        
 10  property_usage        1397742 non-null  object        
 11  reg_type_id           1397742 non-null  int64         
 12  reg_type              1397742 non-null  ob

### 1. Removing unnecessary columns & cleaning

In [35]:
# Drop every column whose name ends in `_ar` and strip the `_en` suffix from the others
# (presumably the Arabic / English variants of the same fields — confirm against the raw schema).
# Rebinding `df` instead of mutating in place keeps each step a plain expression.
df = df.drop(columns=[col for col in df.columns if col.endswith('_ar')])
df = df.rename(columns=lambda name: name[:-3] if name.endswith('_en') else name)

# Remove the rent / party-count columns. errors='ignore' makes the cell safe to re-run:
# without it a second execution raises KeyError because the columns are already gone.
df = df.drop(
    columns=[
        'rent_value',
        'meter_rent_price',
        'no_of_parties_role_1',
        'no_of_parties_role_2',
        'no_of_parties_role_3',
    ],
    errors='ignore',
)

In [36]:
# Parse the dates day-first; anything that cannot be parsed becomes NaT (errors='coerce').
raw_dates = df['instance_date']
df['instance_date'] = pd.to_datetime(raw_dates, errors='coerce', dayfirst=True)

In [35]:
# Persist the cleaned frame. index=False keeps the RangeIndex out of the file, so reloading
# it does not produce a stray `Unnamed: 0` column; it also matches the later re-save of this
# same file, which already omits the index.
df.to_csv('./data/dubai_re_transactions_adjusted.csv', index=False)

### 2. Finding the Area Polygons

In [38]:
def _normalise_name(names):
    """Lower-case a string Series, then trim leading/trailing whitespace."""
    return names.str.lower().str.strip()


# Bring both name columns to the same form so they can be compared value-for-value.
df['area_name'] = _normalise_name(df['area_name'])
area['CNAME_E'] = _normalise_name(area['CNAME_E'])

In [39]:
# Keep only the polygons whose name occurs among the transactions' area names.
has_transactions = area['CNAME_E'].isin(df['area_name'])
area = area.loc[has_transactions]

In [130]:
# Write the filtered polygons out as GeoJSON.
filtered_area_path = './data/dubai_transactions_land_area.geojson'
area.to_file(filtered_area_path, driver='GeoJSON')

### 2. Making Columns Convenient to Work With 

In [14]:
# Reload the cleaned transactions that step 1 saved to this path.
adjusted_csv = './data/dubai_re_transactions_adjusted.csv'
df = pd.read_csv(adjusted_csv)

In [11]:
# Express `actual_worth` in millions and expose it as `actual_worth_millions`.
# The guard makes the cell idempotent: the frame is read from and later saved back to the
# same CSV, so on a re-run `actual_worth` is already gone and the unconditional version
# raised KeyError. Renaming first (rather than assign + drop) keeps the column's position.
if 'actual_worth' in df.columns:
    df = df.rename(columns={'actual_worth': 'actual_worth_millions'})
    df['actual_worth_millions'] = df['actual_worth_millions'] / 1_000_000

In [13]:
# Save the frame back without the index. `index` is a boolean flag, so spell it as
# False rather than relying on the truthiness of 0.
df.to_csv('./data/dubai_re_transactions_adjusted.csv', index=False)

### 3. Extracting Valuable Aggregation

In [22]:
# NOTE(review): this reads from ./data/processed/, but the cells above save the adjusted
# CSV directly under ./data/ — confirm which location is current.
# NOTE(review): the aggregation below groups on a `geometry` column that no visible cell
# adds, so this file presumably comes from a step not shown here — TODO confirm.
df = pd.read_csv('./data/processed/dubai_re_transactions_adjusted.csv')

In [31]:
def _wkt_to_geometry(value):
    """Parse a WKT string into a shapely geometry; any non-string value maps to None."""
    if isinstance(value, str):
        return wkt.loads(value)
    return None


# Sum `actual_worth_millions` for every (area_id, geometry) pair.
worth_by_area = df.groupby(["area_id", "geometry"])["actual_worth_millions"].sum()
total_worth_per_area = worth_by_area.reset_index()

# Turn the WKT text into geometry objects and wrap the table as a GeoDataFrame in EPSG:4326.
total_worth_per_area['geometry'] = total_worth_per_area['geometry'].apply(_wkt_to_geometry)
total_worth_per_area = gpd.GeoDataFrame(total_worth_per_area, geometry='geometry', crs='EPSG:4326')

In [32]:
# Write the per-area totals as GeoJSON. The driver is named explicitly, as in the earlier
# polygon export, so the output format does not depend on file-extension inference.
total_worth_per_area.to_file('./data/processed/total_worth_per_area.geojson', driver='GeoJSON')