# Dataframe preparation

Once we have downloaded all the datasets we will be using in our project, the first thing we should do is preparing the dataframe we will be working on.

In [2]:
# Importing modules
import pandas as pd
import numpy as np

In [3]:
pd.set_option('display.max_columns',None)

In [44]:
rating_cols = ['STATE_CODE_001','STRUCTURE_NUMBER_008','DECK_COND_058','SUPERSTRUCTURE_COND_059','SUBSTRUCTURE_COND_060'] 

In [45]:
df17 = pd.read_csv('./data/2017hwybronlyonefile.zip',compression = 'zip',sep = ',',encoding='latin-1',usecols = rating_cols)

  interactivity=interactivity, compiler=compiler, result=result)


In [46]:
df17.head()

Unnamed: 0,STATE_CODE_001,STRUCTURE_NUMBER_008,DECK_COND_058,SUPERSTRUCTURE_COND_059,SUBSTRUCTURE_COND_060
0,1,00000000000S702,8,8,7
1,1,00000000000S703,8,8,7
2,1,0000000000M0022,5,5,6
3,1,000000883039900,7,7,7
4,1,000001014002450,6,6,7


In [7]:
# Function for new variable: Total Rating (TR)

def TotalRating(row):
    dr = row['DECK_COND_058']
    supr = row['SUPERSTRUCTURE_COND_059']
    subr = row['SUBSTRUCTURE_COND_060']
    minval = min(dr, supr, subr)
    maxval = max(dr, supr, subr)
   
    if minval <= 4:
        rating = minval
        
    elif minval >= 8:
        rating = (dr+supr+subr)/3
        
    else:
        medval = dr+subr+supr-minval-maxval
        rating = 0.5*minval+0.2*maxval+0.3*medval

    return rating


In [76]:
filenames = []
for year in range(2011,2018):
    year = str(year)
    path = "./data/"+year+"hwybronlyonefile.zip"
    filenames.append(path)

In [47]:
df17 = df17[(df17['DECK_COND_058']!='N') & (df17['SUPERSTRUCTURE_COND_059']!='N') & (df17['SUBSTRUCTURE_COND_060']!='N') ]

In [48]:
# Pasamos los valores de rating a números
df17['DECK_COND_058'] = pd.to_numeric(df17['DECK_COND_058'])
df17['SUPERSTRUCTURE_COND_059'] = pd.to_numeric(df17['SUPERSTRUCTURE_COND_059'])
df17['SUBSTRUCTURE_COND_060'] = pd.to_numeric(df17['SUBSTRUCTURE_COND_060'])

In [49]:
# Quitamos los nulls
df17.dropna(inplace=True)

In [50]:
# Creamos la columna con el Total Rating para cada año
df17['TR17'] = df17.apply(lambda row: TotalRating(row),axis=1)

In [51]:
df17.head(10)

Unnamed: 0,STATE_CODE_001,STRUCTURE_NUMBER_008,DECK_COND_058,SUPERSTRUCTURE_COND_059,SUBSTRUCTURE_COND_060,TR17
0,1,00000000000S702,8,8,7,7.5
1,1,00000000000S703,8,8,7,7.5
2,1,0000000000M0022,5,5,6,5.2
3,1,000000883039900,7,7,7,7.0
4,1,000001014002450,6,6,7,6.2
5,1,000001331700710,5,5,5,5.0
6,1,000002,7,4,5,4.0
7,1,000004,4,4,4,4.0
8,1,000004504800350,7,7,7,7.0
9,1,000005,5,4,4,4.0


In [52]:
df17.drop(['DECK_COND_058','SUPERSTRUCTURE_COND_059','SUBSTRUCTURE_COND_060'], axis=1, inplace = True)

In [53]:
df17.head()

Unnamed: 0,STATE_CODE_001,STRUCTURE_NUMBER_008,TR17
0,1,00000000000S702,7.5
1,1,00000000000S703,7.5
2,1,0000000000M0022,5.2
3,1,000000883039900,7.0
4,1,000001014002450,6.2


In [33]:
# Vamos a leer el dataset del 2018 para hacer el join
total18_fil = pd.read_csv('total18_fil.csv')

In [35]:
df18 = total18_fil
df18['TR18'] = df18.apply(lambda row: TotalRating(row),axis=1)

In [36]:
df18.shape

(175597, 26)

In [41]:
# Since we are looking for joining all the datasets with this one by using the Structure number, we should first check that there are no duplicates on it
df18['STRUCTURE_NUMBER_008'].duplicated().value_counts()

False    168029
True       7568
Name: STRUCTURE_NUMBER_008, dtype: int64

In [42]:
duplicateRowsDF = df18[df18.duplicated(['STRUCTURE_NUMBER_008'])]
 
print("Duplicate Rows based on a single column are:", duplicateRowsDF, sep='\n')


Duplicate Rows based on a single column are:
        STATE_CODE_001 STRUCTURE_NUMBER_008     LAT_016     LONG_017  \
4887                 4                 7596  33341889.0  112055871.0   
4888                 4                 7599  33392236.0  111583868.0   
4890                 4                 7630  32080210.0  110593544.0   
4911                 4                 7960  33260652.0  112000059.0   
4916                 4                 7976  33260645.0  112000845.0   
4932                 4                 8273  32161600.0  110545144.0   
4933                 4                 8299  31592323.0  110335599.0   
4934                 4                 8318  32010558.0  111042867.0   
4937                 4                 8460  32205462.0  111031169.0   
4938                 4                 8482  31258000.0  110577000.0   
4957                 4                 8643  31225000.0  110506000.0   
4978                 4                 8842  32422050.0  114413590.0   
4992               

In [37]:
df17.shape

(469890, 2)

In [56]:
df2 = pd.merge(df18, df17, how= 'left', on = ['STATE_CODE_001','STRUCTURE_NUMBER_008'])

In [57]:
df2.shape

(175599, 27)

In [80]:
filenames = []
for year in range(2013,2018):
    year = str(year)
    path = "./data/"+year+"hwybronlyonefile.zip"
    filenames.append(path)

In [83]:
filenames

['./data/2013hwybronlyonefile.zip',
 './data/2014hwybronlyonefile.zip',
 './data/2015hwybronlyonefile.zip',
 './data/2016hwybronlyonefile.zip',
 './data/2017hwybronlyonefile.zip']

In [85]:
def createTR (filename,colname):
        
    # Create dataframes
    df = pd.read_csv(filename, compression = 'zip',sep = ',',encoding='latin-1',usecols = rating_cols)
    
    # Remove 'N'
    df = df[(df['DECK_COND_058']!='N') & (df['SUPERSTRUCTURE_COND_059']!='N') & (df['SUBSTRUCTURE_COND_060']!='N') ]
    
    # To numeric
    df['DECK_COND_058'] = pd.to_numeric(df['DECK_COND_058'])
    df['SUPERSTRUCTURE_COND_059'] = pd.to_numeric(df['SUPERSTRUCTURE_COND_059'])
    df['SUBSTRUCTURE_COND_060'] = pd.to_numeric(df['SUBSTRUCTURE_COND_060'])
    
    # Remove nulls
    df.dropna(inplace=True)
    
    # Creamos la columna con el Total Rating para cada año
    df[colname] = df.apply(lambda row: TotalRating(row),axis=1)
    
    return df
    

In [88]:
df17 = createTR('./data/2017hwybronlyonefile.zip','TR17')

  if (yield from self.run_code(code, result)):


In [89]:
df16 = createTR('./data/2016hwybronlyonefile.zip','TR16')