<a href="https://colab.research.google.com/github/susanemcg/data_wrangling_exercises/blob/master/chapter_4_examples/jupyter_notebooks/fixed_width_parsing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# An example of reading data from a fixed-width file with Python.
# The source file for this example comes from NOAA and can be accessed here:
# https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt
# The metadata for the file can be found here:
# https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt

In [None]:
# import the `csv` library, to create our output file
import csv

In [None]:
# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB
# # Import PyDrive and associated libraries.
# # This only needs to be done once per notebook.
# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2
# from pydrive.auth import GoogleAuth
# from pydrive.drive import GoogleDrive
# from google.colab import auth
# from oauth2client.client import GoogleCredentials

# # Authenticate and create the PyDrive client.
# # This only needs to be done once per notebook.
# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# drive = GoogleDrive(gauth)

In [None]:
# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB
# # Link to data file stored in Drive: https://drive.google.com/file/d/1nhYjc6ZadE9bPk-6pzv99gIzSudBmHGm/view?usp=sharing
# file_id = '1nhYjc6ZadE9bPk-6pzv99gIzSudBmHGm' # notice where this string comes from in link above

# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file
# print(imported_file['title'])  # it should print the title of desired file
# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive

In [None]:
filename = "ghcnd-stations"

# reading from a basic text file doesn't require any special libraries,
# so we'll just open the file in read mode ("r") as usual. using a `with`
# block means the source file is closed automatically as soon as we've
# finished reading it, instead of staying open for the life of the kernel
with open(filename+".txt", "r") as source_file:
    # the built-in "readlines()" method does just what you'd think:
    # it reads in a text file and converts it to a list of lines
    stations_list = source_file.readlines()

# as usual, we'll create an output file to write to. the `csv` library's
# documentation says files handed to a writer should be opened with
# `newline=""`, so that the writer alone controls the row endings
# (without it, every row is followed by a blank line on Windows).
# NOTE: this file is deliberately left open here, because the cells below
# keep writing to it; it is closed once all the rows have been written
output_file = open(filename+".csv", "w", newline="")

# and we'll use the `csv` library to create a "writer" that gives us handy
# "recipe" functions for creating our new file in csv format
output_writer = csv.writer(output_file)

In [None]:
# the column names for our output file, listed in the same order that the
# values for each row will be written
headers = [
    "ID",
    "LATITUDE",
    "LONGITUDE",
    "ELEVATION",
    "STATE",
    "NAME",
    "GSN_FLAG",
    "HCNCRN_FLAG",
    "WMO_ID",
]

# the header row goes into the output file first, ahead of any data rows
output_writer.writerow(headers)

In [None]:
# each "column" of data lives at a fixed range of characters within a line.
# the positions in the comments below count from 1 and include the last
# character, so e.g. positions 13-20 become the Python slice [12:20]
column_slices = [
    ("ID", slice(0, 11)),            # positions 1-11
    ("LATITUDE", slice(12, 20)),     # positions 13-20
    ("LONGITUDE", slice(21, 30)),    # positions 22-30
    ("ELEVATION", slice(31, 37)),    # positions 32-37
    ("STATE", slice(38, 40)),        # positions 39-40
    ("NAME", slice(41, 71)),         # positions 42-71
    ("GSN_FLAG", slice(72, 75)),     # positions 73-75
    ("HCNCRN_FLAG", slice(76, 79)),  # positions 77-79
    ("WMO_ID", slice(80, 85)),       # positions 81-85
]

# loop through each line of our file (multiple "sheets" are not possible)
for line in stations_list:
    # cut the set of characters for every column out of the line, in order,
    # to build up the list of values that makes up this row
    new_row = [line[char_range] for column_name, char_range in column_slices]

    # then use the `writerow` function to write new_row to our output file
    output_writer.writerow(new_row)

In [None]:
# officially close the `.csv` file we just wrote all that data to;
# do this only after every row has been written, since the writer
# can no longer write to the file once it is closed
output_file.close()

In [None]:
# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB
# from google.colab import files

# files.download(filename+".csv")