In [2]:
%load_ext autoreload
%autoreload 2

import os
import sys
from joblib import dump, load

module_path = os.path.abspath(os.path.join('..', 'code'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
import utils

## Procesado de datos Oferta de cursos

In [4]:
df = utils.load_data('../data/courses.csv')

In [5]:
def initial_filter(df):
    df = df[df['Lugar']=='CAMPUS PRINCIPAL']
    # Mantener solo los edificos usados para la investigación
    df = df[df['Edificio'].str.contains(r'.*SD.*|.*\.centro.*|.*ML.*|.*LL.*|.*(O).*|.*Bloque C.*|.*Aulas.*|.*(GA).*', na=False)]
    # Eliminar filas duplicadas
    df.drop_duplicates(inplace=True)
    
    return df

In [6]:
def further_processing(df):
    # Mantener solo las columnas necesarias
    df = df[['Curso','NRC','Periodo','Edificio','Salón','Días','Horas', 'Inscritos']]
    # Eliminar cursos sin inscritos
    df = df[df['Inscritos']>0]
    return df

In [7]:
def save_file(processed_df, path):
    processed_df.to_csv(path, index=False)

In [8]:
def process_file(input_path, output_path):
    df = utils.load_data(input_path)
    df = initial_filter(df)
    df = further_processing(df)
    save_file(df, output_path)

In [9]:
dump(process_file, '../models/process_file.joblib')

['../models/process_file.joblib']

In [10]:
process_file_loaded = load('../models/process_file.joblib')

process_file_loaded('../data/courses.csv', '../data/processed_courses.csv')



  df = df[df['Edificio'].str.contains(r'.*SD.*|.*\.centro.*|.*ML.*|.*LL.*|.*(O).*|.*Bloque C.*|.*Aulas.*|.*(GA).*', na=False)]


### Usando los datos de talanqueras

In [11]:
import pandas as pd
import numpy as np

In [12]:
courses = pd.read_csv('../data/processed_courses.csv')
talanqueras = pd.read_csv('../data/output.csv')

In [5]:
courses.shape

(6935, 8)

In [13]:
talanqueras['Edificio'] = talanqueras['Puerta'].str.split('-', expand=True)[0]
talanqueras['Edificio'] = talanqueras['Edificio'].str.replace(r'[0-9]', '', regex=True)

courses['Salón'] = courses['Salón'].str.replace('.','')
courses['Edificio'] = courses['Salón'].str.split('_', expand=True)[0]

In [4]:
courses.sample(10)

Unnamed: 0,Curso,NRC,Periodo,Edificio,Salón,Días,Horas,Inscritos
0,ADMI-1101 FUNDAMENTOS DE ADMINISTRACION Y GERE...,39342,202420,O,O_103,L,0800-0920,81
4550,LENG-1155 ENGLISH 05 (CICLO 1 DE 8 SEMANAS),27923,202420,AU,AU_209,M,1230-1350,19
5102,LENG-1201 FRANCES 1,68824,202420,O,O_304,I,0930-1050,20
465,"ARQT-2103 UNID.NÚC.PROY,DE LA CONFIGURAC.DE LA...",52165,202420,C,C_305,J,1000-1250,11
69,ADMI-1412 CONTABILIDAD FINANCIERA,72601,202420,ML,ML_515,J,0930-1050,45
655,ARQT-2447 PRACTICAS PROYECTUALES Y MUJERES (CI...,70170,202420,C,C_408,I,1100-1250,14
4185,ISIS-1226 DISEÑO Y PROGRAMACIÓN O.O.,48019,202420,LL,LL_101,M,0800-0920,26
3862,IMEC-3345 SISTEMAS CONVERSION DE ENERGIA,72676,202420,ML,ML_615,J,1230-1350,60
1341,CISO-4842 SEMINARIO SOCIOLOGÍA RURAL,70796,202420,RGD,RGD_312-13,S,0800-1050,9
5463,MADM-4151 EMPRENDIMIENTO E INNOVACION (CICLO 2...,71420,202420,SD,SD_702,S,0800-1050,30


In [6]:
def count_entradas(edificio):
    return talanqueras['Edificio'].str.contains(edificio).sum()

In [7]:
edif = pd.DataFrame()
edif['Edificio'] = courses['Edificio'].unique()
for edificio in edif['Edificio']:
    edif.loc[edif['Edificio']==edificio, 'Entradas'] = count_entradas(edificio)
edif.head()  #  Conteo de entradas por edificio

Unnamed: 0,Edificio,Entradas
0,O,0.0
1,RGD,10088.0
2,ML,35062.0
3,AU,0.0
4,SD,35198.0


In [8]:
# df para contar entradas por rangos de horas
hours_count = pd.DataFrame()
hours_count['Edificio'] = courses['Edificio']
hours_count['Rango de Horas'] = talanqueras['FechaHora'].str.split(' ', expand=True)[1].str.split(':', expand=True)[0].astype(float) 

In [9]:
hours_count.head()

Unnamed: 0,Edificio,Rango de Horas
0,O,7.0
1,O,6.0
2,O,7.0
3,O,12.0
4,O,15.0


In [44]:
import numpy as np

def create_ranges(df):
    earliest = float(hours_count['Rango de Horas'].min())
    latest = float(hours_count['Rango de Horas'].max())
    df['Rango de Horas'] = pd.cut(df['Rango de Horas'], bins=np.arange(earliest, latest, 0.5), right=False)
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

create_ranges(hours_count)

Unnamed: 0,Edificio,Rango de Horas
0,RGD,"[7.0, 7.5)"
1,RGD,"[6.0, 6.5)"
2,ML,"[7.0, 7.5)"
3,ML,"[12.0, 12.5)"
4,RGD,"[15.0, 15.5)"
...,...,...
86,ML,"[4.0, 4.5)"
87,LL,"[2.0, 2.5)"
88,RGD,"[2.0, 2.5)"
89,LL,"[1.0, 1.5)"


In [45]:
#count entries by hour
hours_count['Entradas'] = 0
for i in range(len(hours_count)):
    edificio = hours_count['Edificio'].iloc[i]
    left = hours_count['Rango de Horas'].iloc[i].left
    right = hours_count['Rango de Horas'].iloc[i].right
    for j in range(len(talanqueras)):
        hora = int(talanqueras['FechaHora'].iloc[j].split(' ')[1].split(':')[0])
        building = talanqueras['Edificio'].iloc[j]
        if (hora == left or hora == right) and building == edificio:
            hours_count['Entradas'].iloc[i] += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hours_count['Entradas'].iloc[i] += 1


In [46]:
#Ordenar por edificio y rango de horas
hours_count = hours_count.sort_values(by=["Edificio", "Rango de Horas"])

In [47]:
hours_count

Unnamed: 0,Edificio,Rango de Horas,Entradas
90,LL,"[0.0, 0.5)",22
89,LL,"[1.0, 1.5)",34
87,LL,"[2.0, 2.5)",27
68,LL,"[3.0, 3.5)",26
83,LL,"[4.0, 4.5)",25
...,...,...,...
35,SD,"[18.0, 18.5)",1043
50,SD,"[19.0, 19.5)",1087
30,SD,"[20.0, 20.5)",957
16,SD,"[21.0, 21.5)",681


In [48]:
# Exportamos los 2 df a csv
edif.to_csv('../data/entradas_edificio.csv', index=False)
hours_count.to_csv('../data/entradas_edificio_hora.csv', index=False)

## Agrupacion de datos usando oferta de cursos

In [14]:
new = courses.copy()

In [15]:
new

Unnamed: 0,Curso,NRC,Periodo,Edificio,Salón,Días,Horas,Inscritos
0,ADMI-1101 FUNDAMENTOS DE ADMINISTRACION Y GERE...,39342,202420,O,O_103,L,0800-0920,81
1,ADMI-1101 FUNDAMENTOS DE ADMINISTRACION Y GERE...,39342,202420,O,O_103,I,0800-0920,81
2,ADMI-1101 FUNDAMENTOS DE ADMINISTRACION Y GERE...,39342,202420,O,O_103,V,0800-0920,81
3,ADMI-1101 FUNDAMENTOS DE ADMINISTRACION Y GERE...,42986,202420,O,O_103,I,0930-1050,86
4,ADMI-1101 FUNDAMENTOS DE ADMINISTRACION Y GERE...,42986,202420,O,O_103,V,0930-1050,86
...,...,...,...,...,...,...,...,...
6930,SPUB-4545 FUNDAMENTOS DEL DERECHO A LA SALUD,66159,202420,RGD,RGD_311,V,1400-1650,11
6931,SPUB-4545 FUNDAMENTOS DEL DERECHO A LA SALUD,66159,202420,RGD,RGD_311,V,1400-1650,11
6932,SPUB-4545 FUNDAMENTOS DEL DERECHO A LA SALUD,66159,202420,RGD,RGD_311,V,1400-1650,11
6933,SPUB-4545 FUNDAMENTOS DEL DERECHO A LA SALUD,66159,202420,RGD,RGD_311,V,1400-1650,11


In [16]:
new.drop(columns=['Periodo','Salón'], inplace=True)

In [17]:
#Se filtran los cursos que tienen clase los miercoles (I)
new = new[new['Días'].str.contains('I')]

In [18]:
new.shape

(1364, 6)

In [19]:
new.sort_values(by=['Edificio','Horas'], inplace=True)
new.drop_duplicates( keep='first', inplace=True)
new.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new.sort_values(by=['Edificio','Horas'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new.drop_duplicates( keep='first', inplace=True)


In [20]:
def convert_to_range(horas):
    # Split the start and end times
    start, end = horas.split("-")

    start_hour = int(start[:2])  # First two digits (hours)
    end_hour = int(end[:2])      # First two digits (hours)
    if end[2:] != '00':
        end_hour += 1

    return[start_hour, end_hour]

# Apply the conversion function
new["Rango de Horas"] = new["Horas"].apply(convert_to_range)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new["Rango de Horas"] = new["Horas"].apply(convert_to_range)


In [21]:
new.drop(columns=['Horas'], inplace=True)
new['Rango de Horas'] = new['Rango de Horas'].apply(tuple)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new.drop(columns=['Horas'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['Rango de Horas'] = new['Rango de Horas'].apply(tuple)


In [22]:
#Nos da el numero de estudiantes inscritos en un curso por edificio
inscritos = new.groupby('Edificio', as_index=False)['Inscritos'].sum()

In [27]:
new

Unnamed: 0,Curso,NRC,Edificio,Días,Inscritos,Rango de Horas,Index,_6
0,DERE-2206 PROPIEDAD Y DERECHOS REALES,20376,AU,I,37,"(6, 8)",,
1,ISIS-1404 TI EN LAS ORGANIZACIONES (TIPO E),12125,AU,I,22,"(6, 8)",,
2,ISIS-2503 ARQUITECTURA Y DISEÑO DE SOFTWARE,63026,AU,I,31,"(6, 8)",,
3,LENG-1157 ENGLISH 07 - SPEAKING 1 (CICLO 1 DE ...,27929,AU,I,21,"(6, 8)",,
4,LENG-1157 ENGLISH 07 - SPEAKING 1 (CICLO 1 DE ...,27931,AU,I,19,"(6, 8)",,
...,...,...,...,...,...,...,...,...
6483,MIIA-4100 MODELOS DE ANÁLISIS ESTADÍSTICO,71564,SD,I,41,"(19.5, 20.0)",1243.0,"(18, 22)"
6484,MIIA-4100 MODELOS DE ANÁLISIS ESTADÍSTICO,71564,SD,I,41,"(20.0, 20.5)",1243.0,"(18, 22)"
6485,MIIA-4100 MODELOS DE ANÁLISIS ESTADÍSTICO,71564,SD,I,41,"(20.5, 21.0)",1243.0,"(18, 22)"
6486,MIIA-4100 MODELOS DE ANÁLISIS ESTADÍSTICO,71564,SD,I,41,"(21.0, 21.5)",1243.0,"(18, 22)"


In [24]:
# Vamos a generar todos los rangos de horas posibles por periodos de media hora
all_ranges = []
earliest = new['Rango de Horas'].apply(lambda x: x[0]).min()
latest = new['Rango de Horas'].apply(lambda x: x[1]).max()
current = earliest
while current < latest:
    all_ranges.append((current, current + 0.5))
    current += 0.5

In [25]:
def count_missing_ranges(start, end, ranges):
    # Count the number of missing ranges
    count = 0
    start_index = 0
    end_index = 0
    for r in ranges:
        if r[0] == start:
            start_index = ranges.index(r)
        if r[1] == end:
            end_index = ranges.index(r)
    count = end_index - start_index + 1
    return (count, start_index, end_index)

def create_row(df, row, start_index, end_index, ranges):
    # Create new rows for the missing ranges and add them to a list
    new_rows = []
    for i in range(start_index, end_index + 1):
        new_row = row.copy()
        new_row['Rango de Horas'] = ranges[i]
        new_row['Inscritos'] = row["Inscritos"]  
        new_rows.append(new_row)  # Append the row to the list
    
    # Concatenate the new rows to the DataFrame at once
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
    return df
    

In [26]:
#Se ajustan los rangos de horas con respecto a los que se acaban de generar
for row in new.itertuples():
    start = row[6][0]  # Extract start time from 'Rango de Horas'
    end = row[6][1]    # Extract end time from 'Rango de Horas'
    
    count, start_index, end_index = count_missing_ranges(start, end, all_ranges)
    # If there are missing ranges, add new rows
    if count > 1:
        new = create_row(new, row._asdict(), start_index, end_index, all_ranges)

In [28]:
#Se borran las columnas que no se necesitan y se filtran los cursos que tienen un rango de horas por cada media hora
new.drop(columns=['Index','_6'], inplace=True)
filtered_df = new[new['Rango de Horas'].apply(lambda x: isinstance(x[0], float) and isinstance(x[1], float))]

In [29]:
filtered_df

Unnamed: 0,Curso,NRC,Edificio,Días,Inscritos,Rango de Horas
1245,DERE-2206 PROPIEDAD Y DERECHOS REALES,20376,AU,I,37,"(6.5, 7.0)"
1246,DERE-2206 PROPIEDAD Y DERECHOS REALES,20376,AU,I,37,"(7.0, 7.5)"
1247,DERE-2206 PROPIEDAD Y DERECHOS REALES,20376,AU,I,37,"(7.5, 8.0)"
1249,ISIS-1404 TI EN LAS ORGANIZACIONES (TIPO E),12125,AU,I,22,"(6.5, 7.0)"
1250,ISIS-1404 TI EN LAS ORGANIZACIONES (TIPO E),12125,AU,I,22,"(7.0, 7.5)"
...,...,...,...,...,...,...
6483,MIIA-4100 MODELOS DE ANÁLISIS ESTADÍSTICO,71564,SD,I,41,"(19.5, 20.0)"
6484,MIIA-4100 MODELOS DE ANÁLISIS ESTADÍSTICO,71564,SD,I,41,"(20.0, 20.5)"
6485,MIIA-4100 MODELOS DE ANÁLISIS ESTADÍSTICO,71564,SD,I,41,"(20.5, 21.0)"
6486,MIIA-4100 MODELOS DE ANÁLISIS ESTADÍSTICO,71564,SD,I,41,"(21.0, 21.5)"


In [30]:
filtered_df.to_csv('../data/entradas_oferta.csv', index=False)

In [33]:
filtered_df = filtered_df.groupby(['Edificio', 'Rango de Horas'], as_index=False)['Inscritos'].sum()

In [35]:
filtered_df.to_csv('../data/entradas_edificio_hora_completos.csv', index=False)

## Para llenar los rangos faltantes

In [84]:
inscritos.to_csv('../data/inscritos_edificio.csv')

In [63]:
import pandas as pd
import numpy as np


df = pd.read_csv('../data/entradas_edificio_hora.csv')

# Convert "Rango de Horas" to numeric ranges for easier processing
def parse_range(range_str):
    range_str = range_str.replace("(", "[").replace(")", "]").strip()
    start, end = map(float, range_str.strip("[]").split(", "))
    return start, end

df[["Start", "End"]] = df["Rango de Horas"].apply(parse_range).apply(pd.Series)

# Initialize an empty DataFrame to store results
all_ranges = []

# Process each unique "Edificio" separately
for edificio in df["Edificio"].unique():
    # Filter data for the current building
    edificio_df = df[df["Edificio"] == edificio]

    # Generate a complete range of intervals for this building
    start_min = edificio_df["Start"].min()
    end_max = edificio_df["End"].max()
    interval = 0.5
    complete_ranges = pd.DataFrame({
        "Start": np.arange(start_min, end_max, interval),
        "End": np.arange(start_min + interval, end_max + interval, interval),
        "Edificio": edificio  # Add the building identifier
    })

    # Merge with the filtered DataFrame
    merged = complete_ranges.merge(edificio_df, on=["Start", "End"], how="left")

    # Drop the redundant "Edificio_x" or "Edificio_y" columns and keep the correct one
    merged["Edificio"] = merged["Edificio_x"].fillna(merged["Edificio_y"])  # Select correct column
    merged.drop(columns=["Edificio_x", "Edificio_y"], inplace=True)  # Drop the redundant columns

    # Fill missing "Entradas" with 0
    merged["Entradas"].fillna(0, inplace=True)

    # Add to the results
    all_ranges.append(merged)

# Combine results from all buildings
final_df = pd.concat(all_ranges, ignore_index=True)

# Recreate "Rango de Horas" column for consistency
final_df["Rango de Horas"] = "[" + final_df["Start"].astype(str) + ", " + final_df["End"].astype(str) + "]"

# Drop temporary columns
final_df = final_df[["Edificio", "Rango de Horas", "Entradas"]]

# Sort and display the result
final_df = final_df.sort_values(by=["Edificio", "Rango de Horas"])
final_df.reset_index(drop=True, inplace=True)

In [94]:
for edificio in final_df['Edificio'].unique():
    # Filter rows where 'Edificio' matches and 'Entradas' is 0
    condition = (final_df['Edificio'] == edificio) & (final_df['Entradas'] == 0)
    
    #sacamos el indice que contiene un 0
    index = final_df.loc[condition].index
    
    for idx in index:
        if idx > 0 and idx < len(final_df) - 1:
            anterior = final_df.loc[idx - 1, 'Entradas']
            posterior = final_df.loc[idx + 1, 'Entradas']
            # Compute the mean for the current 'Edificio', ignoring zeros
            mean_value = np.mean([anterior, posterior])
            # Assign the mean value to rows where 'Entradas' is 0
            final_df.loc[idx, 'Entradas'] = mean_value


In [98]:
final_df

Unnamed: 0,Edificio,Rango de Horas,Entradas
0,LL,"[0.0, 0.5]",22.0
1,LL,"[0.5, 1.0]",0.0
2,LL,"[1.0, 1.5]",34.0
3,LL,"[1.5, 2.0]",0.0
4,LL,"[10.0, 10.5]",669.0
...,...,...,...
175,SD,"[7.5, 8.0]",0.0
176,SD,"[8.0, 8.5]",2327.0
177,SD,"[8.5, 9.0]",0.0
178,SD,"[9.0, 9.5]",2442.0


In [96]:
final_df.to_csv('../data/entradas_edificio_hora_completos.csv', index=False)