In [1]:
import pandas as pd

# Load and process data for prediction

treino_files = [
    "validation/2024-05-11/treino-2024-05-11_08.json",
    "validation/2024-05-13/treino-2024-05-13_13.json",
    "validation/2024-05-11/treino-2024-05-11_18.json",
    "validation/2024-05-11/treino-2024-05-11_23.json"
]

pretest_files = [
    "validation/2024-05-11/2024-05-11_07.json",
    "validation/2024-05-13/2024-05-13_12.json",
    "validation/2024-05-11/2024-05-11_17.json",
    "validation/2024-05-11/2024-05-11_22.json"
]


pretest = pd.read_json(pretest_files[1], encoding='latin-1')
treino = pd.read_json(treino_files[1], encoding='latin-1')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
pretest['latitude'] = pretest['latitude'].str.replace(',', '.').astype(float)
pretest['longitude'] = pretest['longitude'].str.replace(',', '.').astype(float)
pretest['linha'] = pretest['linha'].astype(str)

valid_linhas = [
    '483', '864', '639', '3', '309', '774', '629', '371', '397', '100', '838', 
    '315', '624', '388', '918', '665', '328', '497', '878', '355', '138', '606', 
    '457', '550', '803', '917', '638', '2336', '399', '298', '867', '553', '565', 
    '422', '756', '186012003', '292', '554', '634', '232', '415', '2803', '324', 
    '852', '557', '759', '343', '779', '905', '108'
]

df_pretest = pretest[pretest['linha'].isin(valid_linhas)]

In [3]:
df_last_two = df_pretest.groupby(['ordem', 'linha']).tail(2).reset_index(drop=True)

counts = df_last_two.groupby(['ordem', 'linha']).size()
to_duplicate = counts[counts == 1].index

duplicated_rows = df_last_two.set_index(['ordem', 'linha']).loc[to_duplicate].reset_index()
df_last_two = pd.concat([df_last_two, duplicated_rows]).sort_values(['ordem', 'linha'])


df_last_two = df_last_two[['ordem','linha','latitude','longitude','datahoraservidor']]
df_last_two.head(10)

Unnamed: 0,ordem,linha,latitude,longitude,datahoraservidor
1152,A29021,292,-22.88398,-43.23668,1715612389000
2356,A29021,292,-22.88665,-43.23409,1715612420000
807,A29027,292,-22.90184,-43.18099,1715612358000
1911,A29027,292,-22.90184,-43.18099,1715612389000
1151,A29032,292,-22.9068,-43.18103,1715612389000
2355,A29032,292,-22.90643,-43.17876,1715612420000
1153,A29046,292,-22.90647,-43.1791,1715612389000
2363,A29046,292,-22.90644,-43.1783,1715612420000
535,A29065,457,-22.90759,-43.27196,1715612358000
1576,A29065,457,-22.90631,-43.27079,1715612389000


In [4]:
# Load and process data for training
treino['linha'] = treino['linha'].astype(str)

df_treino = treino[treino['linha'].isin(valid_linhas)]

# Join the two dataframes
join_df = pd.merge(df_treino, df_last_two, on=['ordem','linha'], how='inner')
join_df

Unnamed: 0,id,ordem,linha,datahora,latitude,longitude,datahoraservidor
0,1,B51503,917,1715612396000,-22.88219,-43.32685,1715612380000
1,1,B51503,917,1715612396000,-22.88206,-43.32547,1715612411000
2,2,B51558,624,1715612395000,-22.89998,-43.22958,1715612380000
3,2,B51558,624,1715612395000,-22.90037,-43.23099,1715612411000
4,3,B51573,917,1715612393000,-22.88306,-43.29619,1715612380000
...,...,...,...,...,...,...,...
310871,156640,B31083,483,1715615993000,-22.89362,-43.21548,1715612394000
310872,156641,B31089,483,1715615994000,-22.91135,-43.16892,1715612364000
310873,156641,B31089,483,1715615994000,-22.90865,-43.16830,1715612394000
310874,156642,B31130,483,1715615995000,-22.92101,-43.17155,1715612364000


In [5]:
from sqlalchemy import create_engine
from sqlalchemy import text
import os

database_url = os.getenv("DATABASE_URL")
engine = create_engine(database_url, client_encoding='latin-1')  

In [12]:
def execute_query(connection, linha, lat1, lon1, lat2, lon2, date):
    query = """
    WITH initial_similar_points AS 
        SELECT time_ranking,
               ordem,
               linha,
               x,
               y,
               datahoraservidor
        FROM vw_buses_order
        WHERE linha = :linha
        AND x = width_bucket(:lon1, -43.726090, -42.951470, 1587)
        AND y = width_bucket(:lat1, -23.170790, -22.546410, 1389)
        AND (
                (datahoraservidor >= TO_TIMESTAMP(:actual_date) - interval '7 day' - interval '2 hour'  
                AND datahoraservidor < TO_TIMESTAMP(:actual_date) - interval '7 day' + interval '2 hour') 
                OR 
                (datahoraservidor >= TO_TIMESTAMP(:actual_date) - interval '14 day' - interval '2 hour'  
                AND datahoraservidor < TO_TIMESTAMP(:actual_date) - interval '14 day' + interval '2 hour')
                OR 
                (datahoraservidor >= TO_TIMESTAMP(:actual_date) - interval '21 day' - interval '2 hour'  
                AND datahoraservidor < TO_TIMESTAMP(:actual_date) - interval '21 day' + interval '2 hour')
            )
        AND time_ranking > 1
        LIMIT 10
    ), anterior_points AS (
        SELECT DISTINCT ON (time_ranking, ordem, linha) 
            time_ranking,
            ordem,
            linha,
            x,
            y,
            datahoraservidor
        FROM vw_buses_order
        WHERE (ordem, linha, time_ranking) IN (
            SELECT ordem, linha, time_ranking - 1
            FROM initial_similar_points
            )
    ), direction_points AS (
         SELECT 
            sp.*,
            ((ap.x - sp.x) * (:lon2 - :lon1) + (ap.y - sp.y) * (:lat2 - :lat1)) AS dot_product
        FROM initial_similar_points sp
        INNER JOIN anterior_points ap
            ON sp.ordem = ap.ordem
            AND sp.linha = ap.linha
        WHERE ((ap.x - sp.x) * (:lon2 - :lon1) + (ap.y - sp.y) * (:lat2 - :lat1)) >= 0
    ), first_future_points AS (
        SELECT 
            vo.x,
            vo.y,
            ROW_NUMBER() OVER (PARTITION BY vo.ordem, vo.linha ORDER BY vo.datahoraservidor) AS rn
        FROM vw_buses_order vo
        INNER JOIN direction_points dp
            ON vo.ordem = dp.ordem
            AND vo.linha = dp.linha
        WHERE vo.datahoraservidor > dp.datahoraservidor + interval '25 minutes'
        AND vo.datahoraservidor < dp.datahoraservidor + interval '35 minutes'
    ), selected_future_points AS (
        SELECT x,y
        FROM first_future_points
        WHERE rn = 1
    )
    SELECT 
        ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY x)) AS median_x,
        ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY y)) AS median_y
    FROM selected_future_points;
    """
    
    params = {
        'linha': linha,
        'lat1': lat1,
        'lon1': lon1,
        'lat2': lat2,
        'lon2': lon2,
        'actual_date': str(date)
    }
    
    
    result = connection.execute(text(query), params)
    row = result.fetchone()
        
    return row[0], row[1]

In [15]:
median_x_list = []
median_y_list = []

with engine.connect() as connection:
    print(len(join_df)//1000)
    for i in range(0, len(join_df)//1000- 1, 2):
        row1 = join_df.iloc[i + 1]
        row2 = join_df.iloc[i]

        #print(i)
        median_x, median_y = execute_query(
            connection,
            row1['linha'], 
            row1['latitude'], 
            row1['longitude'], 
            row2['latitude'], 
            row2['longitude'], 
            row1['datahora']/1000 # Convert to seconds
        )

        median_x_list.extend([median_x, median_x])
        median_y_list.extend([median_y, median_y])

join_df['median_x'] = median_x_list
join_df['median_y'] = median_y_list

310


ValueError: Length of values (310) does not match length of index (310876)

In [8]:
median_x_list

[941.0,
 941.0,
 891.0,
 891.0,
 964.0,
 964.0,
 900.0,
 900.0,
 960.0,
 960.0,
 817.0,
 817.0,
 1100.0,
 1100.0,
 756.0,
 756.0,
 1056.0,
 1056.0,
 779.0,
 779.0,
 1056.0,
 1056.0,
 879.0,
 879.0,
 1057.0,
 1057.0,
 None,
 None,
 825.0,
 825.0,
 794.0,
 794.0,
 91.0,
 91.0,
 794.0,
 794.0,
 755.0,
 755.0,
 794.0,
 794.0,
 91.0,
 91.0,
 794.0,
 794.0,
 794.0,
 794.0,
 748.0,
 748.0,
 424.0,
 424.0,
 244.0,
 244.0,
 624.0,
 624.0,
 748.0,
 748.0,
 794.0,
 794.0,
 748.0,
 748.0,
 483.0,
 483.0,
 565.0,
 565.0,
 748.0,
 748.0,
 1107.0,
 1107.0,
 480.0,
 480.0,
 815.0,
 815.0,
 871.0,
 871.0,
 872.0,
 872.0,
 1126.0,
 1126.0,
 None,
 None,
 1119.0,
 1119.0,
 1006.0,
 1006.0,
 1114.0,
 1114.0,
 1057.0,
 1057.0,
 1104.0,
 1104.0,
 1028.0,
 1028.0,
 1119.0,
 1119.0,
 1117.0,
 1117.0,
 1057.0,
 1057.0,
 1011.0,
 1011.0,
 902.0,
 902.0,
 986.0,
 986.0,
 85.0,
 85.0,
 130.0,
 130.0,
 117.0,
 117.0,
 1115.0,
 1115.0,
 None,
 None,
 1143.0,
 1143.0,
 991.0,
 991.0,
 1075.0,
 1075.0,
 1112.0,
 1112

In [14]:
median_x_list

[941.0,
 941.0,
 891.0,
 891.0,
 964.0,
 964.0,
 900.0,
 900.0,
 960.0,
 960.0,
 817.0,
 817.0,
 1100.0,
 1100.0,
 756.0,
 756.0,
 1056.0,
 1056.0,
 779.0,
 779.0,
 1056.0,
 1056.0,
 879.0,
 879.0,
 1057.0,
 1057.0,
 None,
 None,
 825.0,
 825.0,
 794.0,
 794.0,
 91.0,
 91.0,
 794.0,
 794.0,
 755.0,
 755.0,
 794.0,
 794.0,
 91.0,
 91.0,
 794.0,
 794.0,
 794.0,
 794.0,
 748.0,
 748.0,
 424.0,
 424.0,
 244.0,
 244.0,
 624.0,
 624.0,
 748.0,
 748.0,
 794.0,
 794.0,
 748.0,
 748.0,
 483.0,
 483.0,
 565.0,
 565.0,
 748.0,
 748.0,
 1107.0,
 1107.0,
 480.0,
 480.0,
 815.0,
 815.0,
 871.0,
 871.0,
 872.0,
 872.0,
 1126.0,
 1126.0,
 None,
 None,
 1119.0,
 1119.0,
 1006.0,
 1006.0,
 1114.0,
 1114.0,
 1057.0,
 1057.0,
 1104.0,
 1104.0,
 1028.0,
 1028.0,
 1119.0,
 1119.0,
 1117.0,
 1117.0,
 1057.0,
 1057.0,
 1011.0,
 1011.0,
 902.0,
 902.0,
 986.0,
 986.0,
 85.0,
 85.0,
 130.0,
 130.0,
 117.0,
 117.0,
 1115.0,
 1115.0,
 None,
 None,
 1143.0,
 1143.0,
 991.0,
 991.0,
 1075.0,
 1075.0,
 1112.0,
 1112

In [None]:
join_df.isnull().sum()

id                0
ordem             0
linha             0
datahora          0
latitude          0
longitude         0
median_x     310876
median_y     310876
dtype: int64