# Part 2 - lists, for loops, dictionaries, file handling

## Lists and loops

In [None]:
# a list is an ordered sequence of comma separated values and is one of the
# most versatile data types in python
# list objects are written between square brackets []
cities = [
    "houston",
    "portland",
    "memphis",
    "chicago",
    "denver",
    "boston",
]
print(cities)
print(type(cities))


In [None]:
# items in a list are accessed by position - same index and slice rules
# as strings (counting starts at 0, the end of a slice is not included)
second_city = cities[1]
second_and_third = cities[1:3]
print(second_city)
print(second_and_third)


In [None]:
# membership operators in / not in test whether an item is in a list
# and evaluate to a boolean
has_phoenix = "phoenix" in cities
print(has_phoenix)
print(not has_phoenix)


In [None]:
# a for loop visits each item of a list in turn
# many datatypes can be looped over: lists, strings, dictionaries
for city_name in cities:
    print(city_name)
    

In [None]:
## Ex 1
# report which of the cities also appear in a list of midwest cities
cities = ["houston", "portland", "memphis", "chicago", "denver", "boston"]
midwest_cities = ["chicago", "omaha", "cleveland", "milwaukee"]

# loop over cities, skipping any that are not in the midwest list
for city in cities:
    if city not in midwest_cities:
        continue
    print("{} is in the midwest".format(city))
        

In [None]:
# a single list can hold items of different datatypes
dna_string = "CTGTCGT"
clubs = ["liverpool", "timbers"]

all_types_of_things = ["giraffe", 6, dna_string, clubs]

# report the data type of each element
for thing in all_types_of_things:
    thing_type = type(thing)
    print("{} is type: {}".format(thing, thing_type))
    

In [None]:
# creating new lists - splitting strings
# split() with no argument splits on whitespace
text_string = "lists are really versatile"

# split string into a list of words and keep it in a variable
words = text_string.split()

print(text_string)
print(words)


In [None]:
# creating new lists - splitting strings on a delimiter
# data files are often tab delimited or comma separated
data_string = "brendan,jeffrey,colorado state,fort collins"

# extract the individual fields by splitting on the comma
values = data_string.split(",")

print(data_string)
print(values)


In [None]:
# using index, assign items in a list to named variables
# (several assignments can share one line)
first_name, last_name, uni, uni_city = values[0], values[1], values[2], values[3]

# # alternative, one assignment per line
# first_name = values[0]
# last_name = values[1]

# # IF each line of a data file always has the same number of fields the list
# # can be unpacked directly.  what if large number of fields but only need 3?
# first_name, last_name, uni, uni_city = data_string.split(",")

print("{} went to {} in {}".format(first_name, uni, uni_city))


In [None]:
## Ex 2
# creating new lists - adding items to an empty list
# for each number in a range of numbers from 1 to 10, append the square of that number to new list
# generate numbers using range()

# for i in range(1,11):
#     print(i)

# print(range(1,11))
# print([*range(1,11)])

# create empty list to hold result of each number squared
numbers_squared = []

# what methods are available to a list? (in a notebook, type
# `numbers_squared.` and press <Tab>, or list them explicitly)
# print(dir(numbers_squared))

# loop through a range of numbers, add the square of each number to the numbers_squared list
# range(1, 11) stops before 11, so it yields 1..10
for i in range(1, 11):
    print(i, i**2)
    numbers_squared.append(i**2)

print(numbers_squared)

## BONUS - List comprehensions!
# builds the same list as the loop above in a single expression
numbers_squared_lc = [i**2 for i in range(1, 11)]

print(numbers_squared_lc)

## Dictionaries

In [None]:
# a python dictionary is a collection of key:value pairs
# (since Python 3.7 a dictionary remembers the order its items were inserted in)
# unlike lists, whose elements are single values, each dictionary item pairs a key with a value
# dictionaries are used to retrieve a value when its key is known
# dictionary objects are denoted by curly braces {}

city_states_dict = {
    "houston": "texas",
    "memphis": "tennessee",
    "denver": "colorado",
    "portland": "maine",
}

print(city_states_dict)


In [None]:
# what state is memphis in? a dictionary is 'indexed' by key, not by position
memphis_state = city_states_dict["memphis"]
print(memphis_state)


In [None]:
# loop through a list of cities and print the state of each one
cities = ["houston", "portland"]
for city in cities:
    # look the state up by key directly inside the format call
    print("{} is in {}".format(city, city_states_dict[city]))


In [None]:
# dictionaries have many methods available - dir() lists them
print(dir(city_states_dict))

# print the dict's keys - looping over a dictionary yields its keys,
# exactly like looping over city_states_dict.keys()
for key in city_states_dict:
    print(key)


In [None]:
# what if key is not in dict? can check if dictionary contains key - membership operators in / not in
# `in` applied to a dictionary tests its keys and evaluates to True or False;
# the expression is left bare so the notebook displays its value
"new york" in city_states_dict


In [None]:
# use values() method if checking value membership
# (plain `in` on the dictionary would only look at the keys);
# the expression is left bare so the notebook displays its value
"colorado" in city_states_dict.values()


In [None]:
## Ex 3
# loop through cities and print the state when the city is in the dictionary
# (`in` on a dictionary looks at its keys by default)

cities = ["houston", "portland", "boston"]

for city in cities:
    # unknown city: say so and move on to the next one
    if city not in city_states_dict:
        print("dont know where that is")
        continue
    print("{} is in {}".format(city, city_states_dict[city]))


# Reading in from files and writing out to files

In [None]:
# using open() to open files
in_file = open("../data/mtb_fishers_results.txt")

# try/finally makes sure the file handle is closed even if reading fails
try:
    # can read file line by line using readlines() method
    # (returns a list with one string per line)
    contents = in_file.readlines()
finally:
    # need to close the file handle after using
    in_file.close()

# what does 'contents' look like
print(type(contents))
print(contents)


In [None]:
# # There has to be a better way! 
# with open("../data/mtb_fishers_results.txt") as f:
    
#     # loop through lines
#     for line in f:
#         print(line)
        

In [None]:
# open file and collect the item in the first column of every data row
# into a list of locus_tags
in_file = "../data/mtb_fishers_results.txt"
locus_tags = []
with open(in_file) as f:
    # skip header row; use `header = next(f)` to keep it in a variable instead
    next(f)

    # for each remaining line: remove the trailing newline, split on the
    # tab delimiter and keep only the first field
    locus_tags.extend(row.rstrip().split("\t")[0] for row in f)

# print(locus_tags)

In [None]:
# write the contents of locus_tags to a new file - difference between print() and write()
out_file = "../results/locus_tags.txt"
with open(out_file, 'w') as f:

    # can write a header if you want
    f.write("locus_tag\n")

    # loop through locus_tag list object
    for locus_tag in locus_tags:

        # print() adds a newline after each item for you ...
        print("{}".format(locus_tag))

        # ... write() does not: without the explicit "\n" every locus tag
        # would end up on one single line of the output file
        f.write("{}\n".format(locus_tag))

# Part 2 Final Exercise - Read mtb_gene_models.txt file
###  - Make 3 dictionaries: start/end coord for locus_tag, strand for locus_tag, product for locus_tag
###  - What is the start coordinate of Rv0012?  What is the product of this gene?

In [None]:
# initialize empty dicts, each keyed by locus tag
strand_dict = {}
product_dict = {}
coord_dict = {}

# read gene models file
in_file = "../data/mtb_gene_models.txt"
with open(in_file) as f:
    # skip header
    next(f)

    # loop through lines in file
    for line in f:
        # strip only the line ending: a bare rstrip() also removes trailing
        # tabs, which drops empty trailing columns and breaks the unpacking below
        line = line.rstrip("\r\n")
        #print(line)

        # separate columns into list
        values = line.split("\t")
        # trim stray trailing whitespace from the last column only
        values[-1] = values[-1].rstrip()
        print(values)

        # assign values of interest to variables
        #locus_tag, start = values[0], values[1]
        # careful of length of lists: raises ValueError unless the row has exactly 6 columns
        locus_tag, start, end, strand, gene, product = values

        # populate dicts, using locus tag as key
        strand_dict[locus_tag] = strand
        product_dict[locus_tag] = product

        # the value of a dict can be a list (or another dict...)
        # coordinates are stored as the strings read from the file
        coord_dict[locus_tag] = [start, end]
        

In [None]:
# what are start coord and product for a specific gene?
gene_oi = "Rv0012"

product = product_dict[gene_oi]
print("{} encodes for product: {}".format(gene_oi, product))

# print start coord
# coord_dict values are [start, end] lists, so the start coord is item 0
start_coord = coord_dict[gene_oi][0]

print("the start coord for {}: {}".format(gene_oi, start_coord))
