### Importing packages

In [149]:
# The script starts by importing the required packages, including pandas, numpy, datetime, and matplotlib.pyplot.
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt


### Loading the data

In [150]:
# Read the 2018-2021 contracts CSV into the working DataFrame `data_IRC`.
data_IRC = pd.read_csv(filepath_or_buffer="Contratos_IRC_2018-2021.csv")


In [151]:
# NOTE(review): disabled missing-data matrix plot. `msno` is not imported by the
# import cell of this notebook, so these lines cannot run as-is if uncommented.
#msno.matrix(data_IRC)
#plt.figure(figsize = (15,15))
#plt.show()

### Dropping irrelevant columns

In [152]:
# Remove the columns that are not used in the analysis. According to the
# authors' note, each of them either has more than 70% missing values or
# carries no relevant information.
columns_to_drop = [
    'Participantes',
    'Ausencia.Competencia',
    'Diferencia.creación.contrato',
    'Reciente.creación',
    'exclusivo_mipymes_no_respetado',
    'Fecha.de.fallo',
    'Plazos.Cortos',
    'Rebasa.monto',
    'IR.sin.documento',
    'LP.sin.documento',
    'Nombre.de.la.UC',
    'Número.del.procedimiento',
    'Código.del.contrato',
    'Título.del.contrato',
    'Descripción.del.contrato',
    'Dirección.del.anuncio',
    'Contrato.marco',
    'Fecha.de.publicación',
    'Fecha.de.apertura',
]
data_IRC = data_IRC.drop(columns=columns_to_drop)





### Handling missing values

In [None]:
# Discard every contract that lacks a start date or an end date; only rows
# with both dates present are kept.
data_IRC = data_IRC.dropna(
    subset=['Fecha.de.inicio.del.contrato', 'Fecha.de.fin.del.contrato']
)

### Renaming columns

In [153]:
# Short English names for the institution acronym, supplier, year and amount
# columns; every other column keeps its original name.
new_column_names = {
    'Siglas.de.la.Institución': 'DepID',
    'Proveedor.o.contratista': 'ProvContID',
    'Año': 'Year',
    'Importe.pesos': 'Spending',
}
data_IRC = data_IRC.rename(new_column_names, axis='columns')

### Dropping duplicates

In [154]:
# Keep a single copy of every fully identical row.
data_IRC = data_IRC.drop_duplicates()

# Show all columns when a DataFrame is displayed.
pd.options.display.max_columns = None

### Mapping values

In [155]:
# Recode the verbose Spanish category labels of five columns into short codes.
# Each dictionary maps "label found in the CSV" -> "code used from here on".
#
# Caveat: Series.map() turns every label that is NOT a key of its dictionary
# into NaN, so a label missing here silently loses its category (and later
# gets no dummy column set).

# Contract type.
value_mapping1 = {
    'Adquisiciones': 'ADQ',
    # Upper-case variant of the same label; without this entry map() would
    # turn it into NaN and the contract would end up with no contract type.
    'ADQUISICIONES': 'ADQ',
    'Servicios': 'S',
    'Obra Pública': 'OP',
    'Servicios Relacionados con la OP': 'SLAOP',
    'Arrendamientos': 'AR',
}

# Procedure type.
value_mapping2 = {
    'ADJUDICACIÓN DIRECTA': 'AD',
    'LICITACIÓN PÚBLICA': 'LP',
    'INVITACIÓN A CUANDO MENOS 3 PERSONAS': 'I3P',
    'OTRAS CONTRATACIONES': 'OTHER',
    'CONTRATO ENTRE ENTES PUBLICOS': 'CEEP',
    'PROYECTO DE CONVOCATORIA': 'PC',
}

# Form of participation.
value_mapping3 = {
    'Electrónica': 'ELE',
    'Presencial': 'PRE',
    'Mixta': 'MIX',
}

# Company size stratum.
value_mapping4 = {
    'Micro': 'MIC',
    'Pequeña': 'PEQ',
    'No MIPYME': 'NOMIPYME',
    'Mediana': 'MED',
}

# Character of the procedure.
value_mapping5 = {
    'Nacional': 'N',
    'Internacional bajo TLC': 'ITLC',
    'Internacional': 'I',
    'Otro': 'OTHER',
}

# Apply each dictionary to its column.
for column_name, code_map in (
    ('Tipo.de.contratación', value_mapping1),
    ('Tipo.de.procedimiento', value_mapping2),
    ('Forma.de.participación', value_mapping3),
    ('Estratificación.de.la.empresa', value_mapping4),
    ('Carácter.del.procedimiento', value_mapping5),
):
    data_IRC[column_name] = data_IRC[column_name].map(code_map)

In [156]:
# "Status" is the binary target: 1 when the row has RFC.69bis == 1 or
# Proveedor_Sancionado == 1, otherwise 0 (missing values compare as False).
# Computed with vectorised comparisons instead of a row-by-row apply(), which
# gives the same values without calling a Python lambda once per contract.
is_flagged = data_IRC['RFC.69bis'].eq(1) | data_IRC['Proveedor_Sancionado'].eq(1)
data_IRC['Status'] = is_flagged.astype('int64')


### Creating copies of the DataFrame

In [157]:
# Two independent copies of the cleaned frame: `data_prepro` for the plain
# pre-processed dataset and `data_redf` for the dataset with red-flag features.
data_prepro, data_redf = data_IRC.copy(), data_IRC.copy()


### Pre-processing

In [158]:
# Pre-processing of `data_prepro`:
#   1. one-hot encode the categorical columns,
#   2. give missing red-flag indicators their own category (2),
#   3. turn three identifier/text columns into "value present" flags.

# 1. One-hot encoding. Keys are the columns to encode, values the prefix used
#    for the generated dummy columns (e.g. 'Tipo.de.procedimiento' -> 'PT_AD').
dummy_prefixes = {
    'Carácter.del.procedimiento': 'PC',
    'Tipo.de.contratación': 'CT',
    'Tipo.de.procedimiento': 'PT',
    'Forma.de.participación': 'PF',
    'Estratificación.de.la.empresa': 'S',
    'Year': 'Year',
}
data_prepro = pd.get_dummies(
    data=data_prepro,
    columns=list(dummy_prefixes),
    prefix=dummy_prefixes,
)

# No value clean-up is done on 'Tipo.de.contratación' at this point: the
# column has just been consumed by get_dummies(), so a replace() keyed on it
# would not touch any data and would only copy the whole frame.

# 2. Missing indicators are filled with the number 2. A numeric 2 (not the
#    string '2') keeps these columns numeric instead of mixing floats and text.
for flag_column in ("Sin.justificación", "AD.sin.contrato", "Publicación.Tardía"):
    data_prepro[flag_column] = data_prepro[flag_column].fillna(2)

# 3. Presence flags: 0 where the original value is missing, 1 otherwise.
for presence_column in ('Fundamento.legal', 'Folio.en.el.RUPC', 'RFC'):
    data_prepro[presence_column] = np.where(data_prepro[presence_column].isna(), 0, 1)



In [159]:
# Restrict `data_prepro` to the modelling features, in a fixed column order:
# the contract-level columns first, then each family of dummy columns.
contract_columns = [
    'Fundamento.legal', 'Compra.consolidada', 'Folio.en.el.RUPC',
    'RFC', 'RFC.verificado.en.el.SAT', 'Información_Scrapper',
    'exclusivo_mipymes', 'testigo_social', 'archivo_fallo',
    'archivo_apertura', 'archivo_junta', 'archivo_convocatoria',
    'archivo_contrato', 'Spending', 'Publicación.EDCA',
    'Sin.justificación', 'Publicación.Tardía', 'AD.sin.contrato',
    'Link.funcional', 'Status',
]
procedure_character_dummies = ['PC_I', 'PC_ITLC', 'PC_N', 'PC_OTHER']
contract_type_dummies = ['CT_ADQ', 'CT_AR', 'CT_OP', 'CT_S', 'CT_SLAOP']
procedure_type_dummies = ['PT_AD', 'PT_CEEP', 'PT_I3P', 'PT_LP', 'PT_OTHER', 'PT_PC']
participation_dummies = ['PF_ELE', 'PF_MIX', 'PF_PRE']
company_size_dummies = ['S_MED', 'S_MIC', 'S_NOMIPYME', 'S_PEQ']
year_dummies = ['Year_2018.0', 'Year_2019.0', 'Year_2020.0', 'Year_2021.0']

features = (
    contract_columns
    + procedure_character_dummies
    + contract_type_dummies
    + procedure_type_dummies
    + participation_dummies
    + company_size_dummies
    + year_dummies
)

data_prepro = data_prepro.loc[:, features]

# Red Flags

In [160]:
# Parse the contract start and end dates (day/month/year strings) as datetimes.
start_col = 'Fecha.de.inicio.del.contrato'
end_col = 'Fecha.de.fin.del.contrato'

data_redf[start_col] = pd.to_datetime(data_redf[start_col], format='%d/%m/%Y')
# ISO calendar week in which the contract starts.
data_redf['BeginningWeek'] = data_redf[start_col].dt.isocalendar().week
data_redf[end_col] = pd.to_datetime(data_redf[end_col], format='%d/%m/%Y')

# Contract length in whole weeks (floor of days / 7); a length of exactly
# 0 weeks is recorded as 1.
contract_days = (data_redf[end_col] - data_redf[start_col]).dt.days
data_redf['EBWeeks'] = (contract_days // 7).replace(0, 1)

# Fraction of direct-award contracts per supplier - RAD

# T.AD: number of contracts of procedure type 'AD' for each supplier.
# `totnum_cont_bytype` has one row per supplier and one column per procedure
# type, with 0 where a supplier has no contract of that type.
totnum_cont_bytype = (
    data_redf
    .groupby('ProvContID')['Tipo.de.procedimiento']
    .value_counts()
    .unstack(fill_value=0)
)
totnum_AD = totnum_cont_bytype['AD'].rename('T.AD').reset_index()

# T.Cont: total number of contracts for each supplier.
# groupby().size().reset_index(name=...) names the count column explicitly,
# so this does not depend on how a given pandas version labels the output of
# value_counts().reset_index() (the label changed to 'count' in pandas 2.0).
totnum_cont_perDep = data_redf.groupby('ProvContID').size().reset_index(name='T.Cont')
totnum_AD = totnum_AD.merge(totnum_cont_perDep, how='left', on='ProvContID')

# RAD = T.AD / T.Cont.
totnum_AD['RAD'] = totnum_AD['T.AD'] / totnum_AD['T.Cont']

# Attach T.AD, T.Cont and RAD to every contract of the supplier.
data_redf = pd.merge(data_redf, totnum_AD, how='left', on=['ProvContID'])


# Exists / does not exist in RUPC
# RUPC is 0 when the contract has no 'Folio.en.el.RUPC' value and 1 otherwise.
# The flag is derived directly from the column. A previous left merge with the
# de-duplicated ('Folio.en.el.RUPC', 'ProvContID') pairs was dropped: both of
# its columns were merge keys and every pair came from `data_redf` itself, so
# it added neither rows nor columns.
data_redf['RUPC'] = np.where(data_redf['Folio.en.el.RUPC'].isnull(), 0, 1)

# T.Cont.Max
# Step 1: number of contracts for every (institution, supplier) pair.
# Step 2: for each supplier, the largest of those counts, i.e. the number of
#         contracts it holds with the institution it contracts with the most.
# NOTE(review): the feature is described as "maximum number of contracts
# awarded by a buyer to a supplier", while the maximum here is taken per
# supplier ('ProvContID'), not per institution - confirm this is intended.
df_counts = (
    data_redf
    .groupby(['Institución', 'ProvContID'])
    .size()
    .rename('num_contracts')
    .reset_index()
)
max_counts = (
    df_counts
    .groupby('ProvContID')['num_contracts']
    .max()
    .rename('T.Cont.Max')
    .reset_index()
)

# Attach T.Cont.Max to every contract of the supplier.
data_redf = pd.merge(data_redf, max_counts, how='left', on='ProvContID')

# T.Spending.Max
# Largest single 'Spending' value among the contracts of each
# (institution, supplier) pair.
pair_keys = ['Institución', 'ProvContID']
max_amount = (
    data_redf
    .groupby(pair_keys)['Spending']
    .max()
    .rename('T.Spending.Max')
    .reset_index()
)

# Attach T.Spending.Max to every contract of the same pair.
data_redf = pd.merge(data_redf, max_amount, how='left', on=pair_keys)

# T.Spending
# Total 'Spending' over all contracts of each supplier. The result is a
# one-column frame indexed by 'ProvContID'.
Spend_by_buyer = (
    data_redf
    .groupby('ProvContID')['Spending']
    .sum()
    .to_frame(name='T.Spending')
)

# Attach T.Spending to every contract of the supplier.
data_redf = pd.merge(data_redf, Spend_by_buyer, how='left', on='ProvContID')

# ActiveWeeks
# Sum of contract lengths in weeks ('EBWeeks') for each
# (supplier, department, year) combination.
active_keys = ['ProvContID', 'DepID', 'Year']
Active_Weeks = (
    data_redf
    .groupby(active_keys)['EBWeeks']
    .sum()
    .rename('ActiveWeeks')
    .reset_index()
)

# Attach ActiveWeeks to every contract of the same combination.
data_redf = data_redf.merge(Active_Weeks, on=active_keys, how='left')



In [162]:

# Encoding of `data_redf`:
#   1. one-hot encode the categorical columns ('Year' is encoded later, after
#      the per-year red flags have been computed),
#   2. give missing red-flag indicators their own category (2),
#   3. turn three identifier/text columns into "value present" flags.

# 1. One-hot encoding. Keys are the columns to encode, values the prefix used
#    for the generated dummy columns (e.g. 'Tipo.de.procedimiento' -> 'PT_AD').
dummy_prefixes = {
    'Carácter.del.procedimiento': 'PC',
    'Tipo.de.contratación': 'CT',
    'Tipo.de.procedimiento': 'PT',
    'Forma.de.participación': 'PF',
    'Estratificación.de.la.empresa': 'S',
}
data_redf = pd.get_dummies(
    data=data_redf,
    columns=list(dummy_prefixes),
    prefix=dummy_prefixes,
)

# No value clean-up is done on 'Tipo.de.contratación' at this point: the
# column has just been consumed by get_dummies(), so a replace() keyed on it
# would not touch any data and would only copy the whole frame.

# 2. Missing indicators are filled with the number 2. A numeric 2 (not the
#    string '2') keeps these columns numeric instead of mixing floats and text.
for flag_column in ("AD.sin.contrato", "Sin.justificación", "Publicación.Tardía"):
    data_redf[flag_column] = data_redf[flag_column].fillna(2)

# 3. Presence flags: 0 where the original value is missing, 1 otherwise.
for presence_column in ('Fundamento.legal', 'Folio.en.el.RUPC', 'RFC'):
    data_redf[presence_column] = np.where(data_redf[presence_column].isna(), 0, 1)


# New RedFlags


•		Percentage of split contracts per year

In [163]:
# Perc_Split_Contracts: for each (supplier, department, year), the share of
# its contracts that are "split", i.e. direct-award (AD) or restricted
# invitation (I3P) contracts that start in the same week as at least one
# other such contract of the same supplier and department in that year.
group_keys = ['ProvContID', 'DepID', 'Year']

# Non-competitive contracts only: Direct Award and Restricted Invitation.
data_RPS_without_PTLP = data_redf[(data_redf["PT_AD"] == 1) | (data_redf["PT_I3P"] == 1)]

# Number of those contracts per supplier, department, year and starting week.
data_RPS_without_PTLP_sameWandY = (
    data_RPS_without_PTLP
    .groupby(group_keys + ['BeginningWeek'])
    .size()
    .reset_index(name="Same_BegWeekandYear_Count")
)

# Keep the weeks with more than one contract: these are the split contracts.
data_RPS_without_PTLP_sameWandY = data_RPS_without_PTLP_sameWandY[
    data_RPS_without_PTLP_sameWandY["Same_BegWeekandYear_Count"] > 1
]

# Add up the split contracts of ALL such weeks in the year. Keeping just one
# week per (supplier, department, year) would ignore every other week in which
# contracts were split and understate the percentage.
split_contracts_perY = (
    data_RPS_without_PTLP_sameWandY
    .groupby(group_keys)["Same_BegWeekandYear_Count"]
    .sum()
    .reset_index(name="Split_Contracts_Count")
)

# Total number of contracts (all procedure types) per supplier, department and
# year. Only 'Status' is counted; counting every column is not needed.
ProvContID_tot_perY = (
    data_redf
    .groupby(group_keys)['Status']
    .count()
    .reset_index(name='Tot_ProvContID_per_Year')
)

# Right merge: keep every (supplier, department, year), including those with
# no split contracts at all.
merge1 = pd.merge(split_contracts_perY, ProvContID_tot_perY, on=group_keys, how='right')

# Share of split contracts; combinations without split contracts get 0.
# The result is assigned back rather than filled in place on a column
# selection, which does not update the frame under pandas Copy-on-Write.
merge1['Perc_Split_Contracts'] = (
    merge1['Split_Contracts_Count'] / merge1['Tot_ProvContID_per_Year']
).fillna(0)

# One row per (supplier, department, year) with the new feature only.
merge1 = merge1[group_keys + ['Perc_Split_Contracts']]

# Attach the feature to every contract of the same combination.
data_redf = pd.merge(data_redf, merge1, on=group_keys, how='left')
data_redf.shape


(667316, 63)


•	Percentage of non-open procedures by government agency

In [164]:
# F_NonOpen_DepID: share of Direct Award contracts among all contracts of a
# department in a year.
dep_year_keys = ['DepID', 'Year']

# Number of Direct Award contracts (PT_AD == 1) per department and year.
direct_awards = data_redf.loc[data_redf["PT_AD"] == 1]
data_PT_AD_1 = (
    direct_awards
    .groupby(dep_year_keys)['PT_AD']
    .count()
    .reset_index(name='Total_AD_DepID')
)

# Total number of contracts per department and year.
DepID_tot = (
    data_redf
    .groupby(dep_year_keys)['Status']
    .count()
    .reset_index(name='Tot_DepID_per_Year')
)

# Ratio of the two counts, kept together with the keys only.
DepID_AD = data_PT_AD_1.merge(DepID_tot, on=dep_year_keys, how='left')
DepID_AD['F_NonOpen_DepID'] = DepID_AD['Total_AD_DepID'].div(DepID_AD['Tot_DepID_per_Year'])
DepID_AD = DepID_AD[dep_year_keys + ['F_NonOpen_DepID']]

# Attach the ratio to every contract; rows without a match (for example
# departments with no Direct Award that year) get 0.
data_redf = data_redf.merge(DepID_AD, on=dep_year_keys, how='left')
data_redf["F_NonOpen_DepID"] = data_redf["F_NonOpen_DepID"].fillna(0)

data_redf.shape

(667316, 64)

•	Percentage of non-open procedures by supplier

In [165]:
# F_NonOpen_ProvCont: share of Direct Award contracts among all contracts of a
# supplier in a year (same construction as the per-department ratio, keyed by
# 'ProvContID' instead of 'DepID').

# Direct Award contracts only (PT_AD == 1).
data_PT_AD_1 = data_redf[data_redf["PT_AD"] == 1]

# Number of Direct Award contracts per supplier and year.
ProcContID_PT_AD_Tot = (
    data_PT_AD_1
    .groupby(['ProvContID', 'Year'])['PT_AD']
    .count()
    .reset_index(name='Total_AD_ProvContID')
)

# Total number of contracts per supplier and year. Only the 'Status' column is
# counted: counting every column of the frame and then discarding all but one
# is unnecessary work on a frame of this width.
ProvContID_tot = (
    data_redf
    .groupby(['ProvContID', 'Year'])['Status']
    .count()
    .reset_index(name='Num_ProvContID_per_Year')
)

# Ratio of the two counts, kept together with the keys only.
data = pd.merge(ProcContID_PT_AD_Tot, ProvContID_tot, on=['ProvContID', 'Year'], how='left')
data["F_NonOpen_ProvCont"] = data["Total_AD_ProvContID"] / data["Num_ProvContID_per_Year"]
data = data[['ProvContID', 'Year', 'F_NonOpen_ProvCont']]

# Attach the ratio to every contract; rows without a match (for example
# suppliers with no Direct Award that year) get 0.
data_redf = pd.merge(data_redf, data, on=['ProvContID', 'Year'], how='left')
data_redf["F_NonOpen_ProvCont"] = data_redf["F_NonOpen_ProvCont"].fillna(0)

data_redf.shape


(667316, 65)

•	Frequency of contracts won: 

In [166]:
# F_of_Contr_Won_IMCO: number of contracts a supplier has with a department in
# a year, expressed as a percentage of the highest such number reached by any
# supplier of that department in that year.

# Number of contracts per department, year and supplier.
ProvContID_perYear = (
    data_redf
    .groupby(['DepID', 'Year', 'ProvContID'])['ProvContID']
    .count()
    .reset_index(name='Total_ProvContID')
)

# Largest per-supplier count within each department and year.
# Uses the groupby .max() method; passing the Python builtin `max` to .agg()
# is deprecated in recent pandas versions.
Max_ProvContID_perYear = (
    ProvContID_perYear
    .groupby(['DepID', 'Year'])['Total_ProvContID']
    .max()
    .reset_index(name='Max_Total_ProvContID')
)

# Put each supplier's count next to the maximum of its department and year.
data2 = pd.merge(ProvContID_perYear, Max_ProvContID_perYear, on=['DepID', 'Year'], how='left')

# Standardised frequency, in percent of the maximum.
data2["F_of_Contr_Won_IMCO"] = (data2["Total_ProvContID"] * 100) / data2["Max_Total_ProvContID"]

# Keep the keys and the new feature only.
data2 = data2[['DepID', 'Year', 'ProvContID', 'F_of_Contr_Won_IMCO']]

# Attach the feature to every contract; rows without a match get 0.
data_redf = pd.merge(data_redf, data2, on=['DepID', 'Year', 'ProvContID'], how='left')
data_redf["F_of_Contr_Won_IMCO"] = data_redf["F_of_Contr_Won_IMCO"].fillna(0)

data_redf.shape


(667316, 66)

•	Amount contracted by the supplier

In [167]:
# Am_Cont_by_Sup: total amount a supplier is contracted for by a department in
# a year, as a percentage of the largest such total among the suppliers of
# that department in that year.
supplier_keys = ['DepID', 'Year', 'ProvContID']

# Total spending per department, year and supplier.
Sum_Spend = (
    data_redf
    .groupby(supplier_keys)['Spending']
    .sum()
    .rename('Spending')
    .reset_index()
)

# Largest per-supplier total within each department and year.
Sum_Spend_max = (
    Sum_Spend
    .groupby(['DepID', 'Year'])['Spending']
    .max()
    .rename('Max_Spending')
    .reset_index()
)

# Standardised amount for supplier i in year t, awarded by department d.
data3 = Sum_Spend.merge(Sum_Spend_max, on=['DepID', 'Year'], how='left')
data3["Am_Cont_by_Sup"] = data3["Spending"].mul(100) / data3["Max_Spending"]
data3 = data3[supplier_keys + ['Am_Cont_by_Sup']]

# Attach the feature to every contract of the same department, year, supplier.
data_redf = data_redf.merge(data3, on=supplier_keys, how='left')

data_redf.shape

(667316, 67)

•   Contracts per active weeks

In [168]:
# CPW (contracts per week): T.Cont divided by ActiveWeeks, row by row.
data_redf['CPW'] = data_redf['T.Cont'].div(data_redf['ActiveWeeks'])
data_redf.shape

(667316, 68)

•   Spending per active weeks

In [169]:
# SPW (spending per week): T.Spending divided by ActiveWeeks, row by row.
data_redf['SPW'] = data_redf['T.Spending'].div(data_redf['ActiveWeeks'])
data_redf.shape

(667316, 69)

In [170]:
# Replace 'Year' with one dummy column per year value, prefixed with 'Year'.
data_redf = pd.get_dummies(data=data_redf, columns=['Year'], prefix='Year')


In [171]:
# Restrict `data_redf` to the modelling features, in a fixed column order:
# contract-level columns, supplier aggregates, dummy columns, the new red
# flags and finally the year dummies.
contract_columns = [
    'Fundamento.legal', 'Compra.consolidada', 'Folio.en.el.RUPC',
    'RFC.verificado.en.el.SAT', 'exclusivo_mipymes', 'testigo_social',
    'archivo_fallo', 'archivo_apertura', 'archivo_junta',
    'archivo_convocatoria', 'archivo_contrato', 'Spending',
    'Publicación.EDCA', 'Sin.justificación', 'Publicación.Tardía',
    'AD.sin.contrato', 'Link.funcional', 'Status',
]
aggregate_columns = [
    'EBWeeks', 'T.AD', 'T.Cont', 'RAD', 'RUPC', 'T.Cont.Max',
    'T.Spending.Max', 'T.Spending', 'ActiveWeeks',
]
procedure_character_dummies = ['PC_I', 'PC_ITLC', 'PC_N', 'PC_OTHER']
contract_type_dummies = ['CT_ADQ', 'CT_AR', 'CT_OP', 'CT_S', 'CT_SLAOP']
procedure_type_dummies = ['PT_AD', 'PT_CEEP', 'PT_I3P', 'PT_LP', 'PT_OTHER', 'PT_PC']
participation_dummies = ['PF_ELE', 'PF_MIX', 'PF_PRE']
company_size_dummies = ['S_MED', 'S_MIC', 'S_NOMIPYME', 'S_PEQ']
red_flag_columns = [
    'Perc_Split_Contracts', 'F_NonOpen_DepID', 'F_NonOpen_ProvCont',
    'F_of_Contr_Won_IMCO', 'Am_Cont_by_Sup', 'CPW', 'SPW',
]
year_dummies = ['Year_2018.0', 'Year_2019.0', 'Year_2020.0', 'Year_2021.0']

features = (
    contract_columns
    + aggregate_columns
    + procedure_character_dummies
    + contract_type_dummies
    + procedure_type_dummies
    + participation_dummies
    + company_size_dummies
    + red_flag_columns
    + year_dummies
)

data_redf = data_redf.loc[:, features]


In [172]:
# Show (rows, columns) of both final frames, `data_prepro` first.
for final_frame in (data_prepro, data_redf):
    print(final_frame.shape)

(667316, 46)
(667316, 60)


In [173]:
# Show the class balance of the 'Status' target in both final frames.
for final_frame in (data_prepro, data_redf):
    print(final_frame['Status'].value_counts())

Status
0    666189
1      1127
Name: count, dtype: int64
Status
0    666189
1      1127
Name: count, dtype: int64


In [None]:
# Write both final frames to CSV (the index is written as the first column).
csv_outputs = (
    ("IMCO_PreRed.csv", data_redf),
    ("IMCO_N.csv", data_prepro),
)
for csv_name, final_frame in csv_outputs:
    final_frame.to_csv(csv_name)