# Dataframe preparation

Once we have downloaded all the datasets we will be using in our project, the first thing we should do is preparing the dataframe we will be working on.

In [1]:
# Importing modules
import pandas as pd
import numpy as np

The downloaded files are in a txt format, with the state name and the corresponding year of the dataset.

In [2]:
# Let's create dataframes from those txt files
pa18 = pd.read_csv('./NBIDATA/PA18.txt')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
pa18.shape

(22737, 137)

In [None]:
# CHECKED that the number of bridges is the same as in the table provided in the FHWA website

In [43]:
states = ['PA']
#,'OH']

years = ['1992','1993','1994','1995','1996','1997','1998','1999',
        '2000','2001','2002','2003','2004','2005','2006','2007','2008','2009',
        '10','11','2012','13','14','15','16','17','18']

In [66]:
link_data = []

for state in states:
    for year in years:
        # The 2016 dataset comes with a uppercase extension
        if year == '16':
            link ='./NBIDATA/'+state+year+'.TXT'
        else:
            link ='./NBIDATA/'+state+year+'.txt'
        link_data.append(link)


In [67]:
link_data

['./NBIDATA/PA1992.txt',
 './NBIDATA/PA1993.txt',
 './NBIDATA/PA1994.txt',
 './NBIDATA/PA1995.txt',
 './NBIDATA/PA1996.txt',
 './NBIDATA/PA1997.txt',
 './NBIDATA/PA1998.txt',
 './NBIDATA/PA1999.txt',
 './NBIDATA/PA2000.txt',
 './NBIDATA/PA2001.txt',
 './NBIDATA/PA2002.txt',
 './NBIDATA/PA2003.txt',
 './NBIDATA/PA2004.txt',
 './NBIDATA/PA2005.txt',
 './NBIDATA/PA2006.txt',
 './NBIDATA/PA2007.txt',
 './NBIDATA/PA2008.txt',
 './NBIDATA/PA2009.txt',
 './NBIDATA/PA10.txt',
 './NBIDATA/PA11.txt',
 './NBIDATA/PA2012.txt',
 './NBIDATA/PA13.txt',
 './NBIDATA/PA14.txt',
 './NBIDATA/PA15.txt',
 './NBIDATA/PA16.TXT',
 './NBIDATA/PA17.txt',
 './NBIDATA/PA18.txt']

In [7]:
# We are only interested in extracting the columns from the older datasets that contain the condition ratings and the structure number
list(pa18.columns)

['STATE_CODE_001',
 'STRUCTURE_NUMBER_008',
 'RECORD_TYPE_005A',
 'ROUTE_PREFIX_005B',
 'SERVICE_LEVEL_005C',
 'ROUTE_NUMBER_005D',
 'DIRECTION_005E',
 'HIGHWAY_DISTRICT_002',
 'COUNTY_CODE_003',
 'PLACE_CODE_004',
 'FEATURES_DESC_006A',
 'CRITICAL_FACILITY_006B',
 'FACILITY_CARRIED_007',
 'LOCATION_009',
 'MIN_VERT_CLR_010',
 'KILOPOINT_011',
 'BASE_HWY_NETWORK_012',
 'LRS_INV_ROUTE_013A',
 'SUBROUTE_NO_013B',
 'LAT_016',
 'LONG_017',
 'DETOUR_KILOS_019',
 'TOLL_020',
 'MAINTENANCE_021',
 'OWNER_022',
 'FUNCTIONAL_CLASS_026',
 'YEAR_BUILT_027',
 'TRAFFIC_LANES_ON_028A',
 'TRAFFIC_LANES_UND_028B',
 'ADT_029',
 'YEAR_ADT_030',
 'DESIGN_LOAD_031',
 'APPR_WIDTH_MT_032',
 'MEDIAN_CODE_033',
 'DEGREES_SKEW_034',
 'STRUCTURE_FLARED_035',
 'RAILINGS_036A',
 'TRANSITIONS_036B',
 'APPR_RAIL_036C',
 'APPR_RAIL_END_036D',
 'HISTORY_037',
 'NAVIGATION_038',
 'NAV_VERT_CLR_MT_039',
 'NAV_HORR_CLR_MT_040',
 'OPEN_CLOSED_POSTED_041',
 'SERVICE_ON_042A',
 'SERVICE_UND_042B',
 'STRUCTURE_KIND_043A',
 '

In [8]:
rating_cols = ['STRUCTURE_NUMBER_008','DECK_COND_058','SUPERSTRUCTURE_COND_059','SUBSTRUCTURE_COND_060']

We will have to do the same data preparation for all the datasets we have downloaded from 1992 to 2018 and from PA and OH.
The steps are: 
    - Read only the rating condition and structure number columns 
    - Check missing values and how to deal with them
    - Create a new variable "Total condition rating" that considers the three rating conditions
    
What we want to study is the evolution of this new variable per year on each bridge:
    - Create the ratio per year of this new variable
    - Calculate the mean of this ratio per bridge

#### Read only the rating condition and structure number columns

In [9]:
pa92 = pd.read_csv(link_data[0], header=0, usecols = rating_cols)

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
pa92.shape

(29767, 4)

In [11]:
pa92.head()

Unnamed: 0,STRUCTURE_NUMBER_008,DECK_COND_058,SUPERSTRUCTURE_COND_059,SUBSTRUCTURE_COND_060
0,10015001019940,7,8,7
1,10015003000000,8,8,7
2,10015003100000,8,7,7
3,10015005000000,8,8,8
4,10015005100000,7,8,7


In [12]:
pa92.dtypes

STRUCTURE_NUMBER_008       object
DECK_COND_058              object
SUPERSTRUCTURE_COND_059    object
SUBSTRUCTURE_COND_060      object
dtype: object

In [13]:
# We also drop all the rows with 'N' values (which mean Not Applicable)
pa92 = pa92[pa92['DECK_COND_058']!='N']
pa92 = pa92[pa92['SUPERSTRUCTURE_COND_059']!='N']
pa92 = pa92[pa92['SUBSTRUCTURE_COND_060']!='N']

In [14]:
pa92.shape

(26510, 4)

In [15]:
# We need to change the object types to numeric
pa92['DECK_COND_058'] = pd.to_numeric(pa92['DECK_COND_058'])
pa92['SUPERSTRUCTURE_COND_059'] = pd.to_numeric(pa92['SUPERSTRUCTURE_COND_059'])
pa92['SUBSTRUCTURE_COND_060'] = pd.to_numeric(pa92['SUBSTRUCTURE_COND_060'])

In [16]:
pa92.dtypes

STRUCTURE_NUMBER_008        object
DECK_COND_058              float64
SUPERSTRUCTURE_COND_059    float64
SUBSTRUCTURE_COND_060      float64
dtype: object

#### Check missing values

In [17]:
# We check how many items are missing in the dataset
pa92.isnull().sum()

STRUCTURE_NUMBER_008          0
DECK_COND_058              6009
SUPERSTRUCTURE_COND_059    6016
SUBSTRUCTURE_COND_060      6011
dtype: int64

In [18]:
# We drop the rows with null values
pa92.dropna(inplace=True)

In [19]:
pa92.shape

(20491, 4)

#### Create a new variable "Total condition rating" that considers the three rating conditions

El dataset contiene los ratings de tres partes fundamentales de los puentes:
    - Deck (Item 58): describes the overall condition rating of the deck (deck slab, parapets and barriers, bearings, ...)
    
    - Superstructure (Item 59): describes the physical condition rating of all the structural members of the superstructure (girders/beams, cross-frames, stiffeners,...)
    
    - Substructure (Item 60): describes the physical condition of piers, abutments, piles, fenders, footings or other components of the bridge substructure.

La nueva variable Total Rating que engloba los 3 ratings que define el dataset se ha establecido tras evaluar las common practices en inspección y rehabilitación de puentes. 

Se han definido 3 casos distintos que podrían presentar los datos:
    - Si cualquiera de los ratings estuviese en "Poor condition", es decir con valores menores o iguales al 4, la rehabilitación debería de estar próxima, por lo que tomamos el Total Rating como el menor de los 3 ratings.
    
    - Si por el contrario todos los ratings están por encima de 8, es decir con "Very good condition" o "Excellent condition", definiremos el Total Rating como la media aritmética de los ratings.
    
    - Como tercera opción, si tenemos valores superiores al 5 pero sin estar todos en las mejores condiciones, definiremos el Total Rating como una media ponderada. Los coeficientes de ponderación que se han establecido son:
            - 0.5 para el menor de los ratings, para dar más peso al elemento más desfavorecido
            - 0.3 para el rating intermedio (prácticamente la misma ponderación que en la opción 2)
            - 0.2 para el mayor de los ratings

In [20]:
def TotalRating(row):
    dr = row['DECK_COND_058']
    supr = row['SUPERSTRUCTURE_COND_059']
    subr = row['SUBSTRUCTURE_COND_060']
    minval = min(dr, supr, subr)
    maxval = max(dr, supr, subr)
   
    if minval <= 4:
        rating = minval
        
    elif minval >= 8:
        rating = (dr+supr+subr)/3
        
    else:
        medval = dr+subr+supr-minval-maxval
        rating = 0.5*minval+0.2*maxval+0.3*medval

    return rating


pa92['TR92'] = pa92.apply(lambda row: TotalRating(row),axis=1)

In [21]:
pa92.sample(10)
# CHECKED: The equation has worked properly 

Unnamed: 0,STRUCTURE_NUMBER_008,DECK_COND_058,SUPERSTRUCTURE_COND_059,SUBSTRUCTURE_COND_060,TR92
12384,357205034501010,3.0,2.0,2.0,2.0
20117,583007014001710,6.0,5.0,7.0,5.7
20140,584005006000000,8.0,8.0,6.0,7.0
18621,532003014021980,3.0,3.0,5.0,3.0
5138,110160039000000,7.0,7.0,5.0,6.0
14563,407304191400120,6.0,5.0,6.0,5.5
7619,197213039200010,7.0,7.0,3.0,3.0
18888,540522004000000,7.0,6.0,6.0,6.2
8878,228019001019820,6.0,5.0,6.0,5.5
11794,330950005000000,5.0,5.0,5.0,5.0


In [None]:
# NO EJECUTAR - SOLO RESUMEN
# Recopilamos todos los pasos para crear la misma columna en todos los datasets desde 1993 a 2017

# Abrimos el archivo como un dataframe que contiene solo las columnas que nos interesan
pa92 = pd.read_csv(link_data[0], header=0, usecols = rating_cols)

# Quitamos los valores 'N' (Not applicable) de las columnas de rating
pa92 = pa92[pa92['DECK_COND_058']!='N']
pa92 = pa92[pa92['SUPERSTRUCTURE_COND_059']!='N']
pa92 = pa92[pa92['SUBSTRUCTURE_COND_060']!='N']

# Pasamos los valores de rating a números
pa92['DECK_COND_058'] = pd.to_numeric(pa92['DECK_COND_058'])
pa92['SUPERSTRUCTURE_COND_059'] = pd.to_numeric(pa92['SUPERSTRUCTURE_COND_059'])
pa92['SUBSTRUCTURE_COND_060'] = pd.to_numeric(pa92['SUBSTRUCTURE_COND_060'])

# Quitamos los nulls
pa92.dropna(inplace=True)

# Creamos la columna con el Total Rating para cada año
pa92['TR92'] = pa92.apply(lambda row: TotalRating(row),axis=1)


In [111]:
def TR_per_year(dfcol,link,i):
    
    dfyear = pd.read_csv(link[i], header=0, usecols = rating_cols)

    # Quitamos los valores 'N' (Not applicable) de las columnas de rating
    dfyear = dfyear[(dfyear['DECK_COND_058']!='N') & (dfyear['SUPERSTRUCTURE_COND_059']!='N') & (dfyear['SUBSTRUCTURE_COND_060']!='N') ]

    # Pasamos los valores de rating a números
    dfyear['DECK_COND_058'] = pd.to_numeric(dfyear['DECK_COND_058'])
    dfyear['SUPERSTRUCTURE_COND_059'] = pd.to_numeric(dfyear['SUPERSTRUCTURE_COND_059'])
    dfyear['SUBSTRUCTURE_COND_060'] = pd.to_numeric(dfyear['SUBSTRUCTURE_COND_060'])
    
    # Quitamos los nulls
    dfyear.dropna(inplace=True)

    # Creamos la columna con el Total Rating para cada año
    dfyear[dfcol] = dfyear.apply(lambda row: TotalRating(row),axis=1)

    return dfyear

In [116]:
# Creamos todos los dataframes
pa92 = TR_per_year('TR92',link_data,0)
pa93 = TR_per_year('TR93',link_data,1)
pa94 = TR_per_year('TR94',link_data,2)
pa95 = TR_per_year('TR95',link_data,3)
pa96 = TR_per_year('TR96',link_data,4)
pa97 = TR_per_year('TR97',link_data,5)
pa98 = TR_per_year('TR98',link_data,6)
pa99 = TR_per_year('TR99',link_data,7)
pa00 = TR_per_year('TR00',link_data,8)
pa01 = TR_per_year('TR01',link_data,9)
pa02 = TR_per_year('TR02',link_data,10)
pa03 = TR_per_year('TR03',link_data,11)
pa04 = TR_per_year('TR04',link_data,12)
pa05 = TR_per_year('TR05',link_data,13)
pa06 = TR_per_year('TR06',link_data,14)
pa07 = TR_per_year('TR07',link_data,15)
pa08 = TR_per_year('TR08',link_data,16)
pa09 = TR_per_year('TR09',link_data,17)
pa10 = TR_per_year('TR10',link_data,18)
pa11 = TR_per_year('TR11',link_data,19)
pa12 = TR_per_year('TR12',link_data,20)
pa13 = TR_per_year('TR13',link_data,21)
pa14 = TR_per_year('TR14',link_data,22)
pa15 = TR_per_year('TR15',link_data,23)
pa16 = TR_per_year('TR16',link_data,24)
pa17 = TR_per_year('TR17',link_data,25)

  if (yield from self.run_code(code, result)):
  if (yield from self.run_code(code, result)):
  if (yield from self.run_code(code, result)):


In [120]:
pa05.head(8)

Unnamed: 0,STRUCTURE_NUMBER_008,DECK_COND_058,SUPERSTRUCTURE_COND_059,SUBSTRUCTURE_COND_060,TR05
1,1PA0220,7.0,6.0,7.0,6.5
2,010015003000000,7.0,7.0,7.0,7.0
3,010015003100000,6.0,6.0,6.0,6.0
4,010015005000000,7.0,8.0,7.0,7.2
5,010015005100000,5.0,8.0,6.0,5.9
7,010015015000000,7.0,8.0,7.0,7.2
8,010015015100000,6.0,7.0,7.0,6.5
9,010015019000000,7.0,8.0,7.0,7.2


In [121]:
pa05.dtypes

STRUCTURE_NUMBER_008        object
DECK_COND_058              float64
SUPERSTRUCTURE_COND_059    float64
SUBSTRUCTURE_COND_060      float64
TR05                       float64
dtype: object

### OJO Mirar bien cada dataset por separado porque parece que no esta cogiendo bien el structure number