# Dataframe preparation

Once we have downloaded all the datasets we will use in this project, the first step is to prepare the DataFrame we will work on.

In [1]:
# Importing modules
import pandas as pd
import numpy as np

In [2]:
# Show every column when a DataFrame is displayed (no column limit).
pd.options.display.max_columns = None

In [3]:
# Columns read from each yearly file: the two join keys followed by the
# deck, superstructure and substructure condition ratings.
rating_cols = [
    'STATE_CODE_001',
    'STRUCTURE_NUMBER_008',
    'DECK_COND_058',
    'SUPERSTRUCTURE_COND_059',
    'SUBSTRUCTURE_COND_060',
]

# Load only those columns from the zipped 2017 file.
df17 = pd.read_csv(
    './data/2017hwybronlyonefile.zip',
    usecols=rating_cols,
    encoding='latin-1',
    sep=',',
    compression='zip',
)

df17.head()

In [6]:
# Function for new variable: Total Rating (TR)

def TotalRating(row):
    """Combine the three condition ratings of one row into a single score.

    Parameters
    ----------
    row : mapping-like
        Must provide 'DECK_COND_058', 'SUPERSTRUCTURE_COND_059' and
        'SUBSTRUCTURE_COND_060' through ``row[key]``.

    Returns
    -------
    number
        * the lowest of the three ratings when that lowest value is <= 4;
        * the plain mean of the three when the lowest value is >= 8;
        * otherwise ``0.5*lowest + 0.2*highest + 0.3*middle``.
    """
    deck = row['DECK_COND_058']
    superstructure = row['SUPERSTRUCTURE_COND_059']
    substructure = row['SUBSTRUCTURE_COND_060']

    lowest = min(deck, superstructure, substructure)
    highest = max(deck, superstructure, substructure)

    # The worst component dominates the score.
    if lowest <= 4:
        return lowest

    # All three components are high: use the plain average.
    if lowest >= 8:
        return (deck + superstructure + substructure) / 3

    # Intermediate case: the middle value is whatever remains after removing
    # the extremes from the sum; then apply the 0.5 / 0.2 / 0.3 weights.
    middle = deck + substructure + superstructure - lowest - highest
    return 0.5 * lowest + 0.2 * highest + 0.3 * middle


In [7]:
# Paths to the yearly zip files for 2011-2017 (the list is not read again in this cell).
filenames = []
for year in range(2011,2018):
    year = str(year)
    path = "./data/"+year+"hwybronlyonefile.zip"
    filenames.append(path)

# Keep only the bridges where none of the three condition ratings is 'N'.
# .copy() makes df17 an independent frame, so the column assignments below do
# not write into a slice of the previous frame (SettingWithCopyWarning).
df17 = df17[
    (df17['DECK_COND_058'] != 'N')
    & (df17['SUPERSTRUCTURE_COND_059'] != 'N')
    & (df17['SUBSTRUCTURE_COND_060'] != 'N')
].copy()

# Convert the rating values to numbers
df17['DECK_COND_058'] = pd.to_numeric(df17['DECK_COND_058'])
df17['SUPERSTRUCTURE_COND_059'] = pd.to_numeric(df17['SUPERSTRUCTURE_COND_059'])
df17['SUBSTRUCTURE_COND_060'] = pd.to_numeric(df17['SUBSTRUCTURE_COND_060'])

# Remove the nulls (in place on the private copy created above)
df17.dropna(inplace=True)

# Create the Total Rating column for this year
df17['TR17'] = df17.apply(TotalRating, axis=1)

df17.head(10)

# The raw ratings are no longer needed once TR17 exists.
df17 = df17.drop(['DECK_COND_058','SUPERSTRUCTURE_COND_059','SUBSTRUCTURE_COND_060'], axis=1)

df17.head()

# Read the 2018 dataset, which is the left side of the join
total18_fil = pd.read_csv('total18_fil.csv')

# df18 is another name for the same object as total18_fil (no copy is made),
# so TR18 is added to total18_fil as well.
df18 = total18_fil
df18['TR18'] = df18.apply(TotalRating, axis=1)

df18.shape

# Since we want to join all the datasets to this one on the structure number,
# first check whether that column contains duplicates
df18['STRUCTURE_NUMBER_008'].duplicated().value_counts()

# Rows whose structure number repeats an earlier row, counted per state code
duplicateRowsDF = df18[df18.duplicated(['STRUCTURE_NUMBER_008'])]
duplicateRowsDF['STATE_CODE_001'].value_counts()

df17.shape

# Left join: every 2018 row is kept; TR17 is attached where both keys match
df2 = pd.merge(df18, df17, how= 'left', on = ['STATE_CODE_001','STRUCTURE_NUMBER_008'])

df2.shape

In [57]:
# Collect the path of every yearly zip file from 2008 through 2017.
filenames = []
for year in map(str, range(2008, 2018)):
    path = f"./data/{year}hwybronlyonefile.zip"
    filenames.append(path)

In [58]:
# Display the list of yearly file paths.
filenames

['./data/2008hwybronlyonefile.zip',
 './data/2009hwybronlyonefile.zip',
 './data/2010hwybronlyonefile.zip',
 './data/2011hwybronlyonefile.zip',
 './data/2012hwybronlyonefile.zip',
 './data/2013hwybronlyonefile.zip',
 './data/2014hwybronlyonefile.zip',
 './data/2015hwybronlyonefile.zip',
 './data/2016hwybronlyonefile.zip',
 './data/2017hwybronlyonefile.zip']

In [18]:
def createTR(filename, colname):
    """Load one yearly file and add its Total Rating column.

    Parameters
    ----------
    filename : str
        Path to a zipped CSV file. It is read with latin-1 encoding and
        restricted to the columns listed in the module-level ``rating_cols``.
    colname : str
        Name of the column that receives the Total Rating (e.g. 'TR17').

    Returns
    -------
    pandas.DataFrame
        The rows in which none of the three condition ratings is 'N' and no
        loaded column is null, with the ratings converted to numeric and the
        ``colname`` column computed row by row with ``TotalRating``.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_numeric`` if a rating holds a non-numeric
        value other than 'N'.
    """
    cond_cols = ['DECK_COND_058', 'SUPERSTRUCTURE_COND_059', 'SUBSTRUCTURE_COND_060']

    # Create dataframe
    raw = pd.read_csv(filename, compression='zip', sep=',', encoding='latin-1', usecols=rating_cols)

    # Remove rows where any of the three ratings is 'N'.
    # .copy() gives an independent frame, so the assignments below do not
    # write into a slice of `raw` (avoids SettingWithCopyWarning).
    is_rated = (raw[cond_cols] != 'N').all(axis=1)
    df = raw[is_rated].copy()

    # To numeric
    for col in cond_cols:
        df[col] = pd.to_numeric(df[col])

    # Remove nulls (in place on the private copy)
    df.dropna(inplace=True)

    # Create the Total Rating column for this year
    df[colname] = df.apply(TotalRating, axis=1)

    return df
    

In [19]:
# Build the 2017 frame; its Total Rating is stored under 'TR17'.
df17 = createTR(
    filename='./data/2017hwybronlyonefile.zip',
    colname='TR17',
)

  if (yield from self.run_code(code, result)):


In [20]:
# Save the 2017 frame to CSV, keeping the header row.
df17.to_csv('./df17.csv', header=True, index=None)

In [21]:
# Build the 2016 frame; its Total Rating is stored under 'TR16'.
df16 = createTR(
    filename='./data/2016hwybronlyonefile.zip',
    colname='TR16',
)

In [22]:
# Save the 2016 frame to CSV, keeping the header row.
df16.to_csv('./df16.csv', header=True, index=None)

In [23]:
# Build the 2015 frame ('TR15') and save it to CSV, keeping the header row.
df15 = createTR(
    filename='./data/2015hwybronlyonefile.zip',
    colname='TR15',
)
df15.to_csv('./df15.csv', header=True, index=None)

In [24]:
# Build the 2014 frame ('TR14') and save it to CSV, keeping the header row.
df14 = createTR(
    filename='./data/2014hwybronlyonefile.zip',
    colname='TR14',
)
df14.to_csv('./df14.csv', header=True, index=None)

In [25]:
# Build the 2013 frame ('TR13') and save it to CSV, keeping the header row.
df13 = createTR(
    filename='./data/2013hwybronlyonefile.zip',
    colname='TR13',
)
df13.to_csv('./df13.csv', header=True, index=None)

In [26]:
# Build the 2012 frame ('TR12') and save it to CSV, keeping the header row.
df12 = createTR(
    filename='./data/2012hwybronlyonefile.zip',
    colname='TR12',
)
df12.to_csv('./df12.csv', header=True, index=None)

In [27]:
# Build the 2011 frame ('TR11') and save it to CSV, keeping the header row.
df11 = createTR(
    filename='./data/2011hwybronlyonefile.zip',
    colname='TR11',
)
df11.to_csv('./df11.csv', header=True, index=None)

In [28]:
# Build the 2010 frame ('TR10') and save it to CSV, keeping the header row.
df10 = createTR(
    filename='./data/2010hwybronlyonefile.zip',
    colname='TR10',
)
df10.to_csv('./df10.csv', header=True, index=None)

In [59]:
# Build the 2009 frame ('TR09') and save it to CSV, keeping the header row.
df09 = createTR(
    filename='./data/2009hwybronlyonefile.zip',
    colname='TR09',
)
df09.to_csv('./df09.csv', header=True, index=None)

  if (yield from self.run_code(code, result)):


In [60]:
# Build the 2008 frame ('TR08') and save it to CSV, keeping the header row.
df08 = createTR(
    filename='./data/2008hwybronlyonefile.zip',
    colname='TR08',
)
df08.to_csv('./df08.csv', header=True, index=None)

In [None]:
# Read the 2018 dataset, which will be the base of the join.
total18_fil = pd.read_csv("total18_fil.csv")

In [97]:
# df18 is a second name for the total18_fil object (no copy is made), so the
# new TR18 column is visible through both names.
df18 = total18_fil
df18['TR18'] = df18.apply(TotalRating, axis=1)

In [38]:
# Show the (rows, columns) shape of df18.
df18.shape

(168029, 26)

In [39]:
# Frames taking part in the join: the 2018 base first, then 2017 down to 2010.
df_list = [df18,df17,df16,df15,df14,df13,df12,df11,df10]

# Remove the raw condition ratings from every frame, leaving the TR columns.
# The drop is done in place so the change is visible through df18 ... df10.
# errors='ignore' makes this cell safe to re-run: without it a second
# execution raises KeyError because the columns are already gone.
for df in df_list:
    df.drop(
        ['DECK_COND_058','SUPERSTRUCTURE_COND_059','SUBSTRUCTURE_COND_060'],
        axis=1,
        inplace=True,
        errors='ignore',
    )

In [40]:
# Start from the 2018 frame and left-join each remaining frame of df_list onto
# it, matching on state code and structure number; all 2018 rows are kept.
df_all = df18

for df in df_list[1:]:
    df_all = pd.merge(
        left=df_all,
        right=df,
        on=['STATE_CODE_001','STRUCTURE_NUMBER_008'],
        how='left',
    )

In [41]:
# Preview the first 10 rows of the merged frame.
df_all.head(10)

Unnamed: 0,STATE_CODE_001,STRUCTURE_NUMBER_008,LAT_016,LONG_017,TRAFFIC_LANES_ON_028A,MEDIAN_CODE_033,DEGREES_SKEW_034,STRUCTURE_KIND_043A,STRUCTURE_TYPE_043B,MAIN_UNIT_SPANS_045,MAX_SPAN_LEN_MT_048,STRUCTURE_LEN_MT_049,DECK_WIDTH_MT_052,SURFACE_TYPE_108A,YEAR_BUILT_027,ADT_029,DESIGN_LOAD_031,PERCENT_ADT_TRUCK_109,MAINTENANCE_021,FUNCTIONAL_CLASS_026,WATERWAY_EVAL_071,YEAR_RECONSTRUCTED_106,TR18,TR17,TR16,TR15,TR14,TR13,TR12,TR11,TR10
0,1,13771700120,34465400.0,87390000.0,2,0,0,steel,2,1,17.7,18.9,9.8,bituminous,1958.0,3620.0,heavy,7.0,5.0,urban,none,0.0,6.5,6.5,7.0,7.0,8.0,8.0,8.0,8.0,8.0
1,1,40,33304800.0,86480500.0,4,2,0,steel,2,1,8.2,8.2,44.5,bituminous,1910.0,13000.0,other,1.0,4.0,urban,none,0.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
2,1,50,34575357.0,86371644.0,2,0,20,steel,2,1,9.4,9.4,6.7,concrete,1910.0,200.0,light,1.0,2.0,rural,high,1995.0,5.5,5.5,5.5,5.5,5.5,5.5,5.5,5.5,5.5
3,1,60,34101800.0,86504800.0,3,0,0,concrete,2,3,10.4,34.7,15.2,concrete,1994.0,760.0,heavy,1.0,4.0,urban,none,0.0,7.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
4,1,83,34531863.0,86243520.0,1,0,20,steel,2,2,3.7,7.3,4.0,concrete,1914.0,45.0,heavy,0.0,2.0,rural,high,0.0,6.2,6.2,6.2,6.2,6.2,6.2,4.0,4.0,4.0
5,1,85,32093843.0,86582940.0,1,0,0,steel,2,8,4.0,18.6,5.0,concrete,1915.0,25.0,light,0.0,2.0,rural,low,1996.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
6,1,136,32450279.0,85333220.0,1,0,0,steel,2,5,6.0,30.3,4.8,concrete,1920.0,25.0,other,0.0,2.0,rural,high,0.0,4.0,4.0,4.0,5.2,5.2,5.2,5.2,5.2,6.0
7,1,138,32502400.0,85461200.0,1,0,0,steel,2,1,12.2,12.2,5.2,concrete,1920.0,175.0,light,0.0,4.0,rural,high,0.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
8,1,147,33020433.0,85310223.0,1,0,0,steel,2,1,6.1,7.0,4.9,bituminous,1920.0,20.0,other,0.0,2.0,rural,high,0.0,5.2,5.2,5.2,5.2,5.2,6.0,6.0,6.0,6.0
9,1,163,32571214.0,85425920.0,1,0,0,steel,2,1,12.2,12.2,4.1,concrete,1920.0,50.0,other,0.0,2.0,rural,high,0.0,5.5,5.5,5.5,5.5,5.5,5.5,5.5,5.5,5.5


In [42]:
# Show the (rows, columns) shape of the merged frame.
df_all.shape

(168354, 31)

In [54]:
# The reason we widened the dataset to all bridges is that in Pennsylvania,
# where we initially planned to do the study, the join leaves us with only
# 5 bridges that have a valid rating from 2010 backwards.

# Summarise the rows whose state code is 42.
df_all.loc[df_all['STATE_CODE_001'] == 42].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3928 entries, 122363 to 126290
Data columns (total 31 columns):
STATE_CODE_001            3928 non-null int64
STRUCTURE_NUMBER_008      3928 non-null object
LAT_016                   3928 non-null float64
LONG_017                  3928 non-null float64
TRAFFIC_LANES_ON_028A     3928 non-null int64
MEDIAN_CODE_033           3928 non-null int64
DEGREES_SKEW_034          3928 non-null int64
STRUCTURE_KIND_043A       3928 non-null object
STRUCTURE_TYPE_043B       3928 non-null int64
MAIN_UNIT_SPANS_045       3928 non-null int64
MAX_SPAN_LEN_MT_048       3928 non-null float64
STRUCTURE_LEN_MT_049      3928 non-null float64
DECK_WIDTH_MT_052         3928 non-null float64
SURFACE_TYPE_108A         3928 non-null object
YEAR_BUILT_027            3928 non-null float64
ADT_029                   3928 non-null float64
DESIGN_LOAD_031           3928 non-null object
PERCENT_ADT_TRUCK_109     3928 non-null float64
MAINTENANCE_021           3928 non-nul

In [55]:
# Considering the whole dataset we keep losing data, but we still have a considerable number of entries

df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168354 entries, 0 to 168353
Data columns (total 31 columns):
STATE_CODE_001            168354 non-null int64
STRUCTURE_NUMBER_008      168354 non-null object
LAT_016                   168354 non-null float64
LONG_017                  168354 non-null float64
TRAFFIC_LANES_ON_028A     168354 non-null int64
MEDIAN_CODE_033           168354 non-null int64
DEGREES_SKEW_034          168354 non-null int64
STRUCTURE_KIND_043A       168354 non-null object
STRUCTURE_TYPE_043B       168354 non-null int64
MAIN_UNIT_SPANS_045       168354 non-null int64
MAX_SPAN_LEN_MT_048       168354 non-null float64
STRUCTURE_LEN_MT_049      168354 non-null float64
DECK_WIDTH_MT_052         168354 non-null float64
SURFACE_TYPE_108A         168354 non-null object
YEAR_BUILT_027            168354 non-null float64
ADT_029                   168354 non-null float64
DESIGN_LOAD_031           168354 non-null object
PERCENT_ADT_TRUCK_109     168354 non-null float64
MAINT

# CAMBIAR LOS NAN POR EL VALOR DEL AÑO ANTERIOR

In [None]:
# Before running this on the dataset, the NaN values must be replaced by the previous year's value

def det_formula(row, tr_cols=None):
    """Average rating drop per counted interval across consecutive TR columns.

    For every pair of adjacent columns the difference ``earlier - later`` (in
    list order) is computed. Non-negative differences are summed and counted.
    Negative differences are treated as rehabilitations and are left out of
    both the sum and the count. A NaN difference also fails the ``>= 0`` test
    and is therefore skipped in the same way.

    Parameters
    ----------
    row : mapping-like
        Provides the rating of each column through ``row[col]``.
    tr_cols : sequence of str, optional
        Column names in the order they should be compared. When omitted the
        module-level ``list_tr`` is used, as before.

    Returns
    -------
    float
        Sum of the non-negative differences divided by how many there were,
        or NaN when there is no such interval (fewer than two columns, or
        every interval was skipped). Previously these cases raised instead.
    """
    cols = list_tr if tr_cols is None else tr_cols

    total_drop = 0
    counted = 0

    for earlier, later in zip(cols, cols[1:]):
        dif = row[earlier] - row[later]

        # Negative (rehabilitation) and NaN differences are ignored.
        if dif >= 0:
            total_drop += dif
            counted += 1

    # No usable interval: the rate is undefined rather than a division by zero.
    if counted == 0:
        return float('nan')

    return total_drop / counted
    

# Add the DT column by applying det_formula to every row.
# NOTE(review): at this point `df` is whatever the last `for df in df_list...`
# loop left bound, and `list_tr` is not defined in the visible notebook —
# presumably this is meant to run on df_all once list_tr exists; confirm.
df['DT'] = df.apply(lambda row: det_formula(row), axis=1)
