In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
from joblib import dump, load

module_path = os.path.abspath(os.path.join('..', 'code'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import utils
import numpy as np

## Procesado de datos Oferta de cursos

In [38]:
df = utils.load_data('../data/courses.csv')

In [108]:
def initial_filter(df):
    df = df[df['Lugar']=='CAMPUS PRINCIPAL']
    # Agregar columna Edificio a los cursos de deporte que no lo tengan
    df['Edificio'] = df.apply(lambda x:'.Edif. Gata Golosa(GA)' if  x['Curso'].startswith('DEPO') else x['Edificio'],axis=1)
    df['Salón'] = df.apply(lambda x:'GA' if  x['Curso'].startswith('DEPO') else x['Salón'],axis=1)
    # Eliminar filas duplicadas
    df.drop_duplicates(inplace=True)
    
    return df

In [5]:
def further_processing(df):
    # Mantener solo las columnas necesarias
    df = df[['Curso','NRC','Periodo','Edificio','Salón','Días','Horas', 'Inscritos']]
    # Eliminar cursos sin inscritos
    df = df[df['Inscritos']>0]
    return df

In [6]:
def save_file(processed_df, path):
    processed_df.to_csv(path, index=False)

In [30]:
def process_file(input_path, output_path):
    df = utils.load_data(input_path)
    df = initial_filter(df)
    df = further_processing(df)
    save_file(df, output_path)

In [109]:
dump(process_file, '../models/process_file.joblib')

['../models/process_file.joblib']

In [110]:
process_file_loaded = load('../models/process_file.joblib')

process_file_loaded('../data/courses.csv', '../data/processed_courses.csv')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Edificio'] = df.apply(lambda x:'.Edif. Gata Golosa(GA)' if  x['Curso'].startswith('DEPO') else x['Edificio'],axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Salón'] = df.apply(lambda x:'GA' if  x['Curso'].startswith('DEPO') else x['Salón'],axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


### Usando los datos de talanqueras

In [10]:
import pandas as pd

In [111]:
courses = pd.read_csv('../data/processed_courses.csv')
talanqueras = pd.read_csv('../data/output.csv')

In [5]:
courses.shape

(6935, 8)

In [112]:
talanqueras['Edificio'] = talanqueras['Puerta'].str.split('-', expand=True)[0]
talanqueras['Edificio'] = talanqueras['Edificio'].str.replace(r'[0-9]', '', regex=True)

courses['Salón'] = courses['Salón'].str.replace('.','')
courses['Edificio'] = courses['Salón'].str.split('_', expand=True)[0]

In [47]:
courses.sample(10)

Unnamed: 0,Curso,NRC,Periodo,Edificio,Salón,Días,Horas,Inscritos
2672,"EGOB-3523 POBREZA, DESIGUALDAD Y POLÍTICAS PÚB...",71057,202420,RGD,RGD_308-9,I,0930-1050,33
4796,LENG-1155 ENGLISH 05 (CICLO 1 DE 8 SEMANAS),65894,202420,LL,LL_201,I,0930-1050,21
6686,MGPU-4111 GESTIÓN DE LA CONTRATACIÓN (CICLO 2 ...,35694,202420,RGD,RGD_112-13,V,1430-1820,7
1410,CPER-2201 NARRATIVA SONORA 1 (CICLO 1 DE 8 SEM...,48418,202420,C,C_211,L,1200-1350,19
2408,ECON-3113 MICROECONOMÍA 3 CON TALLER,55279,202420,SD,SD_801,L,0930-1050,4
4488,ISIS-3007 PROYECTO DE GRADO,54565,202420,O,O_104,L,1400-1520,3
4363,ISIS-1226 DISEÑO Y PROGRAMACIÓN O.O.,55645,202420,LL,LL_202,I,0800-0920,31
5224,LENG-1161 ENGLISH 10A SPEAKING 2 FOR ACADEMIC ...,30768,202420,SD,SD_307,I,0800-0920,18
2043,DERE-3834 CONCILIACIÓN,24033,202420,RGD,RGD_106-7,V,1400-1520,67
6653,MGPD-4454 ECONOMÍA PARA LA GERENCIA DEL DESARR...,71557,202420,SD,SD_703,J,1100-1350,32


In [6]:
def count_entradas(edificio):
    return talanqueras['Edificio'].str.contains(edificio).sum()

In [7]:
edif = pd.DataFrame()
edif['Edificio'] = courses['Edificio'].unique()
for edificio in edif['Edificio']:
    edif.loc[edif['Edificio']==edificio, 'Entradas'] = count_entradas(edificio)
edif.head()  #  Conteo de entradas por edificio

Unnamed: 0,Edificio,Entradas
0,O,0.0
1,RGD,10088.0
2,ML,35062.0
3,AU,0.0
4,SD,35198.0


In [8]:
# df para contar entradas por rangos de horas
hours_count = pd.DataFrame()
hours_count['Edificio'] = courses['Edificio']
hours_count['Rango de Horas'] = talanqueras['FechaHora'].str.split(' ', expand=True)[1].str.split(':', expand=True)[0].astype(float) 

In [9]:
hours_count.head()

Unnamed: 0,Edificio,Rango de Horas
0,O,7.0
1,O,6.0
2,O,7.0
3,O,12.0
4,O,15.0


In [44]:
import numpy as np

def create_ranges(df):
    earliest = float(hours_count['Rango de Horas'].min())
    latest = float(hours_count['Rango de Horas'].max())
    df['Rango de Horas'] = pd.cut(df['Rango de Horas'], bins=np.arange(earliest, latest, 0.5), right=False)
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

create_ranges(hours_count)

Unnamed: 0,Edificio,Rango de Horas
0,RGD,"[7.0, 7.5)"
1,RGD,"[6.0, 6.5)"
2,ML,"[7.0, 7.5)"
3,ML,"[12.0, 12.5)"
4,RGD,"[15.0, 15.5)"
...,...,...
86,ML,"[4.0, 4.5)"
87,LL,"[2.0, 2.5)"
88,RGD,"[2.0, 2.5)"
89,LL,"[1.0, 1.5)"


In [45]:
#count entries by hour
hours_count['Entradas'] = 0
for i in range(len(hours_count)):
    edificio = hours_count['Edificio'].iloc[i]
    left = hours_count['Rango de Horas'].iloc[i].left
    right = hours_count['Rango de Horas'].iloc[i].right
    for j in range(len(talanqueras)):
        hora = int(talanqueras['FechaHora'].iloc[j].split(' ')[1].split(':')[0])
        building = talanqueras['Edificio'].iloc[j]
        if (hora == left or hora == right) and building == edificio:
            hours_count['Entradas'].iloc[i] += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hours_count['Entradas'].iloc[i] += 1


In [46]:
#Ordenar por edificio y rango de horas
hours_count = hours_count.sort_values(by=["Edificio", "Rango de Horas"])

In [47]:
hours_count

Unnamed: 0,Edificio,Rango de Horas,Entradas
90,LL,"[0.0, 0.5)",22
89,LL,"[1.0, 1.5)",34
87,LL,"[2.0, 2.5)",27
68,LL,"[3.0, 3.5)",26
83,LL,"[4.0, 4.5)",25
...,...,...,...
35,SD,"[18.0, 18.5)",1043
50,SD,"[19.0, 19.5)",1087
30,SD,"[20.0, 20.5)",957
16,SD,"[21.0, 21.5)",681


In [48]:
# Exportamos los 2 df a csv
edif.to_csv('../data/entradas_edificio.csv', index=False)
hours_count.to_csv('../data/entradas_edificio_hora.csv', index=False)

## Agrupacion de datos usando oferta de cursos

In [137]:
new = courses.copy()

In [114]:
new.sample(10)

Unnamed: 0,Curso,NRC,Periodo,Edificio,Salón,Días,Horas,Inscritos
184,ADMI-2301 FUNDAMENTOS MERCADEO,39782,202420,SD,SD_715,L,0800-0920,52
2894,DERE-2601 DERECHO INTERNACIONAL,55707,202420,O,O_402,M,0930-1050,48
10013,QUIM-1512 FISICOQUIMICA II,71856,202420,RGD,RGD_203,V,0930-1050,23
5419,IELE-1400 FUNDAMENTOS DE REDES,27889,202420,LL,LL_206,V,0930-1050,19
5770,IIND-3113 SIMULACION EVENTOS DISCRETOS,38953,202420,R,R_209,J,1400-1520,35
284,ADMI-2606 PROY. APLICADO ORGANIZACIONES (CICLO...,72799,202420,AU,AU_306,V,0800-0920,25
8063,MATE-1201 PRECÁLCULO,66931,202420,O,O_405,M,1100-1220,29
3588,ECON-2112 MICROECONOMÍA 2 CON TALLER,55297,202420,ML,ML_617,M,0930-1050,7
9709,MUSI-2351 COMPOSICIÓN PARA CINE EXPERIMENTAL,73502,202420,,,,,1
5694,IIND-2202 FUNDAMENTOS DE PRODUCCION,10500,202420,ML,ML_512,M,1400-1520,44


In [138]:
new.drop(columns=['Periodo','Salón'], inplace=True)
new.dropna(inplace=True)

In [139]:
#Se filtran los cursos que tienen clase los miercoles (I)
new = new[new['Días'].str.contains('I')]

In [140]:
new.shape

(1837, 6)

In [141]:
new.sort_values(by=['Edificio','Horas'], inplace=True)
new.drop_duplicates( keep='first', inplace=True)
new.reset_index(drop=True, inplace=True)

In [142]:
def convert_to_range(horas):
    horas = str(horas)
    # Split the start and end times
    start, end = horas.split("-")

    start_hour = int(start[:2])  # First two digits (hours)
    end_hour = int(end[:2])      # First two digits (hours)
    if end[2:] != '00':
        end_hour += 1

    return[start_hour, end_hour]

# Apply the conversion function
new["Rango de Horas"] = new["Horas"].apply(convert_to_range)

In [143]:
new.drop(columns=['Horas'], inplace=True)
new['Rango de Horas'] = new['Rango de Horas'].apply(tuple)

In [144]:
#Nos da el numero de estudiantes inscritos en un curso por edificio
inscritos = new.groupby('Edificio', as_index=False)['Inscritos'].sum()

In [145]:
new

Unnamed: 0,Curso,NRC,Edificio,Días,Inscritos,Rango de Horas
0,DERE-2207 ARGUMENTACION EN PROCESOS CIVILES,68762,,I,20,"(9, 11)"
1,DERE-2206 PROPIEDAD Y DERECHOS REALES,20376,AU,I,37,"(6, 8)"
2,ISIS-1404 TI EN LAS ORGANIZACIONES (TIPO E),12125,AU,I,22,"(6, 8)"
3,ISIS-2503 ARQUITECTURA Y DISEÑO DE SOFTWARE,63026,AU,I,31,"(6, 8)"
4,LENG-1157 ENGLISH 07 - SPEAKING 1 (CICLO 1 DE ...,27929,AU,I,21,"(6, 8)"
...,...,...,...,...,...,...
1673,CPER-2401 NARRATIVA CORPORAL 1 (CICLO 1 DE 8 S...,57272,Z,I,10,"(12, 14)"
1674,IBIO-1110 PROGRAMA DE ACOMPAÑAMIENTO,42304,Z,I,25,"(12, 14)"
1675,GEOC-3606 GEODINÁMICA,62646,Z,I,12,"(14, 16)"
1676,ICYA-1125L LABORATORIO DE GEOMÁTICA,13126,Z,I,25,"(15, 17)"


In [146]:
# Vamos a generar todos los rangos de horas posibles por periodos de media hora
all_ranges = []
earliest = new['Rango de Horas'].apply(lambda x: x[0]).min()
latest = new['Rango de Horas'].apply(lambda x: x[1]).max()
current = earliest
while current < latest:
    all_ranges.append((current, current + 0.5))
    current += 0.5

In [147]:
def count_missing_ranges(start, end, ranges):
    # Count the number of missing ranges
    count = 0
    start_index = 0
    end_index = 0
    for r in ranges:
        if r[0] == start:
            start_index = ranges.index(r)
        if r[1] == end:
            end_index = ranges.index(r)
    count = end_index - start_index + 1
    return (count, start_index, end_index)

def create_row(df, row, start_index, end_index, ranges):
    # Create new rows for the missing ranges and add them to a list
    new_rows = []
    for i in range(start_index, end_index + 1):
        new_row = row.copy()
        new_row['Rango de Horas'] = ranges[i]
        new_row['Inscritos'] = row["Inscritos"]  
        new_rows.append(new_row)  # Append the row to the list
    
    # Concatenate the new rows to the DataFrame at once
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
    return df
    

In [148]:
#Se ajustan los rangos de horas con respecto a los que se acaban de generar
for row in new.itertuples():
    start = row[6][0]  # Extract start time from 'Rango de Horas'
    end = row[6][1]    # Extract end time from 'Rango de Horas'
    
    count, start_index, end_index = count_missing_ranges(start, end, all_ranges)
    # If there are missing ranges, add new rows
    if count > 1:
        new = create_row(new, row._asdict(), start_index, end_index, all_ranges)

In [149]:
new.groupby('Edificio', as_index=False)['Inscritos'].sum()

Unnamed: 0,Edificio,Inscritos
0,,100
1,AU,15839
2,B,10003
3,C,21729
4,CJ,315
5,G,1355
6,GA,2676
7,IP,10
8,J,1770
9,K2,293


In [154]:
#Se borran las columnas que no se necesitan y se filtran los cursos que tienen un rango de horas por cada media hora
#new.drop(columns=['Index','_6'], inplace=True)
filtered_df = new[new['Rango de Horas'].apply(lambda x: isinstance(x[0], float) and isinstance(x[1], float))]
lista_edif = ['SD','RGD','ML','LL','O','C','AU','GA','TX']
filtered_df = filtered_df[filtered_df['Edificio'].isin(lista_edif)]
filtered_df.reset_index(drop=True, inplace=True)

In [155]:
filtered_df

Unnamed: 0,Curso,NRC,Edificio,Días,Inscritos,Rango de Horas,Index,_6
0,DERE-2206 PROPIEDAD Y DERECHOS REALES,20376,AU,I,37,"(6.5, 7.0)",1.0,"(6, 8)"
1,DERE-2206 PROPIEDAD Y DERECHOS REALES,20376,AU,I,37,"(7.0, 7.5)",1.0,"(6, 8)"
2,DERE-2206 PROPIEDAD Y DERECHOS REALES,20376,AU,I,37,"(7.5, 8.0)",1.0,"(6, 8)"
3,ISIS-1404 TI EN LAS ORGANIZACIONES (TIPO E),12125,AU,I,22,"(6.5, 7.0)",2.0,"(6, 8)"
4,ISIS-1404 TI EN LAS ORGANIZACIONES (TIPO E),12125,AU,I,22,"(7.0, 7.5)",2.0,"(6, 8)"
...,...,...,...,...,...,...,...,...
5377,MART-4004 CONCEPTOS Y HERRAMIENTAS DE ARTES PL...,71430,TX,I,5,"(18.5, 19.0)",1578.0,"(18, 21)"
5378,MART-4004 CONCEPTOS Y HERRAMIENTAS DE ARTES PL...,71430,TX,I,5,"(19.0, 19.5)",1578.0,"(18, 21)"
5379,MART-4004 CONCEPTOS Y HERRAMIENTAS DE ARTES PL...,71430,TX,I,5,"(19.5, 20.0)",1578.0,"(18, 21)"
5380,MART-4004 CONCEPTOS Y HERRAMIENTAS DE ARTES PL...,71430,TX,I,5,"(20.0, 20.5)",1578.0,"(18, 21)"


In [157]:
filtered_df.to_csv('../data/entradas_oferta.csv', index=False)

In [158]:
filtered_df = filtered_df.groupby(['Edificio', 'Rango de Horas'], as_index=False)['Inscritos'].sum()

In [159]:
filtered_df.to_csv('../data/entradas_edificio_hora_completos.csv', index=False)

## Para llenar los rangos faltantes

In [84]:
inscritos.to_csv('../data/inscritos_edificio.csv')

In [63]:
import pandas as pd
import numpy as np


df = pd.read_csv('../data/entradas_edificio_hora.csv')

# Convert "Rango de Horas" to numeric ranges for easier processing
def parse_range(range_str):
    range_str = range_str.replace("(", "[").replace(")", "]").strip()
    start, end = map(float, range_str.strip("[]").split(", "))
    return start, end

df[["Start", "End"]] = df["Rango de Horas"].apply(parse_range).apply(pd.Series)

# Initialize an empty DataFrame to store results
all_ranges = []

# Process each unique "Edificio" separately
for edificio in df["Edificio"].unique():
    # Filter data for the current building
    edificio_df = df[df["Edificio"] == edificio]

    # Generate a complete range of intervals for this building
    start_min = edificio_df["Start"].min()
    end_max = edificio_df["End"].max()
    interval = 0.5
    complete_ranges = pd.DataFrame({
        "Start": np.arange(start_min, end_max, interval),
        "End": np.arange(start_min + interval, end_max + interval, interval),
        "Edificio": edificio  # Add the building identifier
    })

    # Merge with the filtered DataFrame
    merged = complete_ranges.merge(edificio_df, on=["Start", "End"], how="left")

    # Drop the redundant "Edificio_x" or "Edificio_y" columns and keep the correct one
    merged["Edificio"] = merged["Edificio_x"].fillna(merged["Edificio_y"])  # Select correct column
    merged.drop(columns=["Edificio_x", "Edificio_y"], inplace=True)  # Drop the redundant columns

    # Fill missing "Entradas" with 0
    merged["Entradas"].fillna(0, inplace=True)

    # Add to the results
    all_ranges.append(merged)

# Combine results from all buildings
final_df = pd.concat(all_ranges, ignore_index=True)

# Recreate "Rango de Horas" column for consistency
final_df["Rango de Horas"] = "[" + final_df["Start"].astype(str) + ", " + final_df["End"].astype(str) + "]"

# Drop temporary columns
final_df = final_df[["Edificio", "Rango de Horas", "Entradas"]]

# Sort and display the result
final_df = final_df.sort_values(by=["Edificio", "Rango de Horas"])
final_df.reset_index(drop=True, inplace=True)

In [94]:
for edificio in final_df['Edificio'].unique():
    # Filter rows where 'Edificio' matches and 'Entradas' is 0
    condition = (final_df['Edificio'] == edificio) & (final_df['Entradas'] == 0)
    
    #sacamos el indice que contiene un 0
    index = final_df.loc[condition].index
    
    for idx in index:
        if idx > 0 and idx < len(final_df) - 1:
            anterior = final_df.loc[idx - 1, 'Entradas']
            posterior = final_df.loc[idx + 1, 'Entradas']
            # Compute the mean for the current 'Edificio', ignoring zeros
            mean_value = np.mean([anterior, posterior])
            # Assign the mean value to rows where 'Entradas' is 0
            final_df.loc[idx, 'Entradas'] = mean_value


In [98]:
final_df

Unnamed: 0,Edificio,Rango de Horas,Entradas
0,LL,"[0.0, 0.5]",22.0
1,LL,"[0.5, 1.0]",0.0
2,LL,"[1.0, 1.5]",34.0
3,LL,"[1.5, 2.0]",0.0
4,LL,"[10.0, 10.5]",669.0
...,...,...,...
175,SD,"[7.5, 8.0]",0.0
176,SD,"[8.0, 8.5]",2327.0
177,SD,"[8.5, 9.0]",0.0
178,SD,"[9.0, 9.5]",2442.0


In [96]:
final_df.to_csv('../data/entradas_edificio_hora_completos.csv', index=False)