In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
from joblib import dump, load

module_path = os.path.abspath(os.path.join('..', 'code'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import utils

## Procesado de datos Oferta de cursos

In [3]:
df = utils.load_data('../data/courses.csv')

In [4]:
def initial_filter(df):
    df = df[df['Lugar']=='CAMPUS PRINCIPAL']
    # Mantener solo los edificos usados para la investigación
    df = df[df['Edificio'].str.contains(r'.*SD.*|.*\.centro.*|.*ML.*|.*LL.*', na=False)]
    # Eliminar filas duplicadas
    df.drop_duplicates(inplace=True)
    
    return df

In [5]:
def further_processing(df):
    # Mantener solo las columnas necesarias
    df = df[['Curso','NRC','Periodo','Edificio','Salón','Días','Horas', 'Inscritos']]
    # Eliminar cursos sin inscritos
    df = df[df['Inscritos']>0]
    return df

In [6]:
def save_file(processed_df, path):
    processed_df.to_csv(path, index=False)

In [7]:
def process_file(input_path, output_path):
    df = utils.load_data(input_path)
    df = initial_filter(df)
    df = further_processing(df)
    save_file(df, output_path)

In [8]:
dump(process_file, '../models/process_file.joblib')

['../models/process_file.joblib']

In [9]:
process_file_loaded = load('../models/process_file.joblib')

process_file_loaded('../data/courses.csv', '../data/processed_courses.csv')



### Usando los datos de talanqueras

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
courses = pd.read_csv('../data/processed_courses.csv')
talanqueras = pd.read_csv('../data/output.csv')

In [24]:
courses.shape

(4457, 8)

In [4]:
talanqueras['Edificio'] = talanqueras['Puerta'].str.split('-', expand=True)[0]
talanqueras['Edificio'] = talanqueras['Edificio'].str.replace(r'[0-9]', '', regex=True)
talanqueras = talanqueras.drop(talanqueras[talanqueras['Edificio'] == 'CD'].index)

courses['Salón'] = courses['Salón'].str.replace('.','')
courses['Edificio'] = courses['Salón'].str.split('_', expand=True)[0]

In [5]:
courses.sample(10)

Unnamed: 0,Curso,NRC,Periodo,Edificio,Salón,Días,Horas,Inscritos
2794,LENG-1154 ENGLISH 04 (CICLO 2 DE 8 SEMANAS),30633,202420,LL,LL_105,J,1530-1650,8
200,ADMI-3502 ESTRATEGIA,28691,202420,ML,ML_515,M,0930-1050,37
917,DERE-3601 DERECHO INTERNACIONAL 2,63003,202420,ML,ML_615,L,1100-1220,50
4206,MMBA-4300 MERCADEO ESTRATÉGICO (CICLO 2 DE 8 S...,72888,202420,SD,SD_715,V,1800-2050,36
4363,PSIG-2201 NEUROANATOMÍA FUNCIONAL,65686,202420,ML,ML_615,M,0800-0920,76
3001,LENG-1157 ENGLISH 07 - SPEAKING 1 (CICLO 2 DE ...,65918,202420,LL,LL_203,V,1100-1220,21
447,CBCO-1113 ARTE EN COLOMBIA (CICLO 2 DE 8 SEMANAS),63279,202420,RGD,RGD_112-13,J,1400-1550,51
2622,ISIS-2603 DESARROLLO DE SW EN EQUIPO,11183,202420,SD,SD_403,I,0800-0920,23
3292,LENG-1701 PORTUGUES 1,63147,202420,LL,LL_108,L,0800-0920,22
330,CBCA-1022 CIUDAD Y TÉCNICA (CICLO 1 DE 8 SEMANAS),71894,202420,RGD,RGD_05,M,0700-0850,37


In [6]:
def count_entradas(edificio):
    return talanqueras['Edificio'].str.contains(edificio).sum()

In [7]:
edif = pd.DataFrame()
edif['Edificio'] = courses['Edificio'].unique()
for edificio in edif['Edificio']:
    edif.loc[edif['Edificio']==edificio, 'Entradas'] = count_entradas(edificio)
edif.head()  #  Conteo de entradas por edificio

Unnamed: 0,Edificio,Entradas
0,RGD,10088.0
1,ML,35062.0
2,SD,35198.0
3,LL,10014.0


In [None]:
# df para contar entradas por rangos de horas
hours_count = pd.DataFrame()
hours_count['Edificio'] = courses['Edificio']
hours_count['Rango de Horas'] = talanqueras['FechaHora'].str.split(' ', expand=True)[1].str.split(':', expand=True)[0].astype(int) 

In [9]:
hours_count.head()

Unnamed: 0,Edificio,Rango de Horas
0,RGD,
1,RGD,6.0
2,ML,7.0
3,ML,12.0
4,RGD,15.0


In [10]:
def create_ranges(df):
    earliest = int(hours_count['Rango de Horas'].min())
    latest = int(hours_count['Rango de Horas'].max())
    df['Rango de Horas'] = pd.cut(df['Rango de Horas'], bins=range(earliest, latest, 1), right=False)
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

create_ranges(hours_count)

Unnamed: 0,Edificio,Rango de Horas
0,RGD,"[6, 7)"
1,ML,"[7, 8)"
2,ML,"[12, 13)"
3,RGD,"[15, 16)"
4,ML,"[15, 16)"
...,...,...
82,ML,"[4, 5)"
83,RGD,"[2, 3)"
84,LL,"[1, 2)"
85,LL,"[2, 3)"


In [12]:
#count entries by hour
hours_count['Entradas'] = 0
for i in range(len(hours_count)):
    edificio = hours_count['Edificio'].iloc[i]
    left = hours_count['Rango de Horas'].iloc[i].left
    right = hours_count['Rango de Horas'].iloc[i].right
    for j in range(len(talanqueras)):
        hora = int(talanqueras['FechaHora'].iloc[j].split(' ')[1].split(':')[0])
        building = talanqueras['Edificio'].iloc[j]
        if (hora == left or hora == right) and building == edificio:
            hours_count['Entradas'].iloc[i] += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hours_count['Entradas'].iloc[i] += 1


In [None]:
#Ordenar por edificio y rango de horas
hours_count = hours_count.sort_values(by=["Edificio", "Rango de Horas"])

In [22]:
hours_count

Unnamed: 0,Edificio,Rango de Horas,Entradas
86,LL,"[0, 1)",56
84,LL,"[1, 2)",61
85,LL,"[2, 3)",53
64,LL,"[3, 4)",51
78,LL,"[4, 5)",697
...,...,...,...
12,SD,"[17, 18)",2035
31,SD,"[18, 19)",2130
45,SD,"[19, 20)",2044
26,SD,"[20, 21)",1638


In [23]:
# Exportamos los 2 df a csv
edif.to_csv('../data/entradas_edificio.csv', index=False)
hours_count.to_csv('../data/entradas_edificio_hora.csv', index=False)

## Merge de los datos

In [25]:
print(courses['Horas'].str.split('-', expand=True)[0].astype(int).min())

630


In [38]:
new = courses.copy()

In [61]:
new

Unnamed: 0,Edificio,Inscritos
0,LL,30
1,LL,33
2,LL,28
3,LL,31
4,LL,15
...,...,...
209,SD,37
210,SD,20
211,SD,43
212,SD,17


In [40]:
new.drop(columns=['Curso','NRC','Periodo','Salón', 'Días'], inplace=True)

In [45]:
new.sort_values(by=['Edificio','Horas'], inplace=True)
new.drop_duplicates(subset=['Edificio','Horas'], keep='first', inplace=True)
new.reset_index(drop=True, inplace=True)

In [None]:
def convert_to_range(horas):
    # Split the start and end times
    start, end = horas.split("-")

    start_hour = int(start[:2])  # First two digits (hours)
    end_hour = int(end[:2])      # First two digits (hours)
    if end[2:] != '00':
        end_hour += 1

    return[start_hour, end_hour]

# Apply the conversion function
new["Rango de Horas"] = new["Horas"].apply(convert_to_range)

In [62]:
new.groupby('Edificio').count()*new['Inscritos'].sum()

Unnamed: 0_level_0,Inscritos
Edificio,Unnamed: 1_level_1
LL,256168
ML,374880
RGD,331144
SD,374880


In [60]:
new.drop(columns=['Horas','Rango de Horas'], inplace=True)