# Part 2. Dataframe preparation

Once we have downloaded all the datasets we will be using in our project, the first thing we should do is preparing the dataframe we will be working on.

The main goal of this notebook is to download the bridge datasets for the years previous to 2018, prepare the data using new features and merge this new input data into the 2018's dataset we have created in the previous dataset.

In [1]:
# Importing modules
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns',None)

In [3]:
rating_cols = ['STATE_CODE_001','STRUCTURE_NUMBER_008','DECK_COND_058','SUPERSTRUCTURE_COND_059','SUBSTRUCTURE_COND_060'] 

In [6]:
# Function for new variable: Total Rating (TR)

def TotalRating(row):
    dr = row['DECK_COND_058']
    supr = row['SUPERSTRUCTURE_COND_059']
    subr = row['SUBSTRUCTURE_COND_060']
    minval = min(dr, supr, subr)
    maxval = max(dr, supr, subr)
   
    if minval <= 4:
        rating = minval
        
    elif minval >= 8:
        rating = (dr+supr+subr)/3
        
    else:
        medval = dr+subr+supr-minval-maxval
        rating = 0.5*minval+0.2*maxval+0.3*medval

    return rating


In [6]:
filenames = []
for year in range(2008,2018):
    year = str(year)
    path = "./data/"+year+"hwybronlyonefile.zip"
    filenames.append(path)

In [7]:
filenames

['./data/2008hwybronlyonefile.zip',
 './data/2009hwybronlyonefile.zip',
 './data/2010hwybronlyonefile.zip',
 './data/2011hwybronlyonefile.zip',
 './data/2012hwybronlyonefile.zip',
 './data/2013hwybronlyonefile.zip',
 './data/2014hwybronlyonefile.zip',
 './data/2015hwybronlyonefile.zip',
 './data/2016hwybronlyonefile.zip',
 './data/2017hwybronlyonefile.zip']

In [7]:
def createTR (filename,colname):
        
    # Create dataframes
    df = pd.read_csv(filename, compression = 'zip',sep = ',',encoding='latin-1',usecols = rating_cols)
    
    # Remove 'N'
    df = df[(df['DECK_COND_058']!='N') & (df['SUPERSTRUCTURE_COND_059']!='N') & (df['SUBSTRUCTURE_COND_060']!='N') ]
    
    # To numeric
    df['DECK_COND_058'] = pd.to_numeric(df['DECK_COND_058'])
    df['SUPERSTRUCTURE_COND_059'] = pd.to_numeric(df['SUPERSTRUCTURE_COND_059'])
    df['SUBSTRUCTURE_COND_060'] = pd.to_numeric(df['SUBSTRUCTURE_COND_060'])
    
    # Remove nulls
    df.dropna(inplace=True)
    
    # Creamos la columna con el Total Rating para cada año
    df[colname] = df.apply(lambda row: TotalRating(row),axis=1)
    
    return df
    

In [19]:
df17 = createTR('./data/2017hwybronlyonefile.zip','TR17')

  if (yield from self.run_code(code, result)):


In [20]:
df17.to_csv (r'./df17.csv', index = None, header=True)

In [21]:
df16 = createTR('./data/2016hwybronlyonefile.zip','TR16')

In [22]:
df16.to_csv (r'./df16.csv', index = None, header=True)

In [23]:
df15 = createTR('./data/2015hwybronlyonefile.zip','TR15')
df15.to_csv (r'./df15.csv', index = None, header=True)

In [24]:
df14 = createTR('./data/2014hwybronlyonefile.zip','TR14')
df14.to_csv (r'./df14.csv', index = None, header=True)

In [25]:
df13 = createTR('./data/2013hwybronlyonefile.zip','TR13')
df13.to_csv (r'./df13.csv', index = None, header=True)

In [26]:
df12 = createTR('./data/2012hwybronlyonefile.zip','TR12')
df12.to_csv (r'./df12.csv', index = None, header=True)

In [27]:
df11 = createTR('./data/2011hwybronlyonefile.zip','TR11')
df11.to_csv (r'./df11.csv', index = None, header=True)

In [28]:
df10 = createTR('./data/2010hwybronlyonefile.zip','TR10')
df10.to_csv (r'./df10.csv', index = None, header=True)

In [59]:
df09 = createTR('./data/2009hwybronlyonefile.zip','TR09')
df09.to_csv (r'./df09.csv', index = None, header=True)

  if (yield from self.run_code(code, result)):


In [60]:
df08 = createTR('./data/2008hwybronlyonefile.zip','TR08')
df08.to_csv (r'./df08.csv', index = None, header=True)

# DESCARGAR LOS CSV A PARTIR DE AQUI

In [2]:
df17 = pd.read_csv('df17.csv')
df16 = pd.read_csv('df16.csv')
df15 = pd.read_csv('df15.csv')
df14 = pd.read_csv('df14.csv')
df13 = pd.read_csv('df13.csv')
df12 = pd.read_csv('df12.csv')
df11 = pd.read_csv('df11.csv')
df10 = pd.read_csv('df10.csv')
df09 = pd.read_csv('df09.csv')
df08 = pd.read_csv('df08.csv')

In [3]:
# Vamos a leer el dataset del 2018 para hacer el join
total18_fil = pd.read_csv('total18_fil.csv')

In [8]:
df18 = total18_fil
df18['TR18'] = df18.apply(lambda row: TotalRating(row),axis=1)

In [9]:
df18.shape

(166517, 26)

In [10]:
df_list = [df18,df17,df16,df15,df14,df13,df12,df11,df10,df09,df08] 

for df in df_list:
    df.drop(['DECK_COND_058','SUPERSTRUCTURE_COND_059','SUBSTRUCTURE_COND_060'], axis=1, inplace = True)

In [11]:
df_all = df18

for df in df_list[1:]:
    df_all = pd.merge(df_all, df, how='left', on=['STATE_CODE_001','STRUCTURE_NUMBER_008'])

In [12]:
df_all.head(10)

Unnamed: 0,STATE_CODE_001,STRUCTURE_NUMBER_008,TRAFFIC_LANES_ON_028A,MEDIAN_CODE_033,DEGREES_SKEW_034,STRUCTURE_KIND_043A,STRUCTURE_TYPE_043B,MAIN_UNIT_SPANS_045,MAX_SPAN_LEN_MT_048,STRUCTURE_LEN_MT_049,...,TR17,TR16,TR15,TR14,TR13,TR12,TR11,TR10,TR09,TR08
0,1,13771700120,2,0,0,steel,2,1,17.7,18.9,...,6.5,7.0,7.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
1,1,40,4,2,0,steel,2,1,8.2,8.2,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
2,1,50,2,0,20,steel,2,1,9.4,9.4,...,5.5,5.5,5.5,5.5,5.5,5.5,5.5,5.5,5.5,5.5
3,1,60,3,0,0,concrete,2,3,10.4,34.7,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
4,1,83,1,0,20,steel,2,2,3.7,7.3,...,6.2,6.2,6.2,6.2,6.2,4.0,4.0,4.0,4.0,4.0
5,1,85,1,0,0,steel,2,8,4.0,18.6,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
6,1,136,1,0,0,steel,2,5,6.0,30.3,...,4.0,4.0,5.2,5.2,5.2,5.2,5.2,6.0,6.0,6.0
7,1,138,1,0,0,steel,2,1,12.2,12.2,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
8,1,147,1,0,0,steel,2,1,6.1,7.0,...,5.2,5.2,5.2,5.2,6.0,6.0,6.0,6.0,6.0,6.0
9,1,163,1,0,0,steel,2,1,12.2,12.2,...,5.5,5.5,5.5,5.5,5.5,5.5,5.5,5.5,5.5,5.5


In [13]:
df_all.shape

(167824, 33)

In [14]:
# La razón por la que hemos ampliado el dataset a todos los puentes es porque en Pennsylvania, 
# donde inicialmente lo ibamos a hacer, cuando hacemos el join nos quedamos con 5 puentes con rating
# válido de 2010 hacia atrás.

df_all[(df_all['STATE_CODE_001'] == 42)].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3912 entries, 121633 to 125544
Data columns (total 33 columns):
STATE_CODE_001            3912 non-null int64
STRUCTURE_NUMBER_008      3912 non-null object
TRAFFIC_LANES_ON_028A     3912 non-null int64
MEDIAN_CODE_033           3912 non-null int64
DEGREES_SKEW_034          3912 non-null int64
STRUCTURE_KIND_043A       3912 non-null object
STRUCTURE_TYPE_043B       3912 non-null int64
MAIN_UNIT_SPANS_045       3912 non-null int64
MAX_SPAN_LEN_MT_048       3912 non-null float64
STRUCTURE_LEN_MT_049      3912 non-null float64
DECK_WIDTH_MT_052         3912 non-null float64
SURFACE_TYPE_108A         3912 non-null object
YEAR_BUILT_027            3912 non-null float64
AGE                       3912 non-null float64
DESIGN_LOAD_031           3912 non-null object
TRUCK_ADT                 3912 non-null float64
MAINTENANCE_021           3912 non-null float64
FUNCTIONAL_CLASS_026      3912 non-null object
WATERWAY_EVAL_071         3912 non-null

In [18]:
# Si consideramos todo el dataset vamos perdiendo datos pero seguimos con un número considerable de entradas

df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169366 entries, 0 to 169365
Data columns (total 33 columns):
STATE_CODE_001            169366 non-null int64
STRUCTURE_NUMBER_008      169366 non-null object
LAT_016                   169366 non-null float64
LONG_017                  169366 non-null float64
TRAFFIC_LANES_ON_028A     169366 non-null int64
MEDIAN_CODE_033           169366 non-null int64
DEGREES_SKEW_034          169366 non-null int64
STRUCTURE_KIND_043A       169366 non-null object
STRUCTURE_TYPE_043B       169366 non-null int64
MAIN_UNIT_SPANS_045       169366 non-null int64
MAX_SPAN_LEN_MT_048       169366 non-null float64
STRUCTURE_LEN_MT_049      169366 non-null float64
DECK_WIDTH_MT_052         169366 non-null float64
SURFACE_TYPE_108A         169366 non-null object
YEAR_BUILT_027            169366 non-null float64
AGE                       169366 non-null float64
DESIGN_LOAD_031           169366 non-null object
TRUCK_ADT                 169366 non-null float64
MAINT

In [15]:
tr_cols = list(df_all.columns)[-11:][::-1]
tr_cols

['TR08',
 'TR09',
 'TR10',
 'TR11',
 'TR12',
 'TR13',
 'TR14',
 'TR15',
 'TR16',
 'TR17',
 'TR18']

In [16]:
years_rated = len(df_list)
years_rated

11

In [17]:
# Drop all the rows where half of the ratings are missing

df_all.dropna(subset= tr_cols, thresh=(years_rated // 2 + 1), inplace=True)

In [18]:
df_all.shape

(147984, 33)

In [19]:
df_all['TR08'].isna().value_counts()

False    128966
True      19018
Name: TR08, dtype: int64

In [20]:
# For the missing values in the rating columns that are left, we will equal them to the rating of the previous year
# Therefore, we should have all the rating from the first year we are considering

df_all.dropna(subset = [tr_cols[1]], inplace= True)

In [21]:
df_all.shape

(129885, 33)

In [22]:
for n in range(len(tr_cols)-1):
    df_all[tr_cols[n+1]] = df_all.apply(lambda row: 
                                        row[tr_cols[n]] if np.isnan(row[tr_cols[n+1]]) else row[tr_cols[n+1]],
                                        axis=1)
    

In [23]:
df_all.sample(20)

Unnamed: 0,STATE_CODE_001,STRUCTURE_NUMBER_008,TRAFFIC_LANES_ON_028A,MEDIAN_CODE_033,DEGREES_SKEW_034,STRUCTURE_KIND_043A,STRUCTURE_TYPE_043B,MAIN_UNIT_SPANS_045,MAX_SPAN_LEN_MT_048,STRUCTURE_LEN_MT_049,...,TR17,TR16,TR15,TR14,TR13,TR12,TR11,TR10,TR09,TR08
117841,40,254250000000000,2,0,0,steel,2,1,7.3,12.4,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
80086,31,C004803705P,2,0,0,steel,2,5,38.1,115.8,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
77472,30,I00090426+05941,2,0,0,concrete,2,3,15.8,38.1,...,6.0,6.0,5.4,5.4,5.7,5.7,5.4,5.4,5.4,5.4
130361,47,33M36100001,2,0,0,concrete,2,1,11.4,11.9,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
160972,55,B11001800000000,2,0,0,concrete,2,4,20.1,62.9,...,5.5,5.5,5.5,6.0,6.0,6.0,6.0,6.0,6.0,6.2
97883,37,450211,2,0,15,steel,2,3,14.6,30.8,...,7.0,7.0,7.0,7.0,7.0,8.0,8.0,8.0,8.0,8.0
48023,21,048B00119N,2,0,50,steel,2,3,33.5,91.1,...,6.2,6.2,6.2,6.2,6.2,6.5,6.5,6.5,6.5,6.5
9529,6,25C0018,2,0,0,steel,2,1,8.5,9.4,...,6.7,6.7,5.4,5.4,5.7,5.7,5.7,5.7,5.7,5.7
77207,30,I00015362+01851,2,0,0,concrete,2,1,22.9,23.5,...,6.2,6.2,6.2,6.2,6.2,6.4,6.4,6.4,6.4,7.2
164183,55,B62098700000000,2,0,0,steel,2,1,7.3,7.5,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,5.7,5.7,5.7


In [24]:
# Function that creates the deterioration rate, which is the mean of the differences between ratings every year

def det_formula(row):
    
    m=0
    cont_rehab = 0

    for i in range(1,len(tr_cols)):
        dif = row[tr_cols[i-1]]-row[tr_cols[i]]
        
        if dif >= 0: 
            m += dif
            
        else:
            dif = 0
            cont_rehab+=1
            m += dif
            
    return m/(i-cont_rehab)

In [25]:
df_all['DETERIORATION_RATE'] = df_all.apply(lambda row: det_formula(row), axis=1)

In [26]:
# A deterioration rate of 0 seems unreasonable in a 10 years gap.
(df_all['DETERIORATION_RATE']==0).value_counts()

False    91648
True     38237
Name: DETERIORATION_RATE, dtype: int64

In [27]:
# There are 37620 bridges that seem to have no deterioration in 10 years.
# We will drop out those bridges, that might have had false input data from the inspection engineer
df_all = df_all.loc[df_all['DETERIORATION_RATE'] > 0]

In [28]:
# Final dataset
df_all.shape

(91648, 34)

In [29]:
# Save to csv

df_all.to_csv (r'./df_all.csv', index = None, header=True)