In [1]:
import os
import random
import pickle
import itertools
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
# Load the training data (tab-separated file).
# Later cells read the EssaySet, Score1, Score2 and EssayText columns from `df`.
train_path = "train.tsv"
df = pd.read_csv(train_path, sep="\t")

In [2]:
def save_obj(obj:object,name:str):
    """
    Serialises an object to disk with pickle.

    Args:
        obj: any picklable object
        name: target path without extension; '.pickle' is appended

    Returns:
        None
    """
    # an existing file with the same name is overwritten ('wb' mode)
    with open(name + '.pickle', mode='wb') as out_file:
        pickle.Pickler(out_file, protocol=pickle.HIGHEST_PROTOCOL).dump(obj)

def load_obj(name:str)->object:
    """
    Restores an object previously written as a pickle file.

    Args:
        name: source path without extension; '.pickle' is appended

    Returns:
        the unpickled object
    """
    # CAUTION: unpickling can execute arbitrary code; only load files you created yourself
    with open(name + '.pickle', mode='rb') as in_file:
        restored = pickle.Unpickler(in_file).load()
    return restored


In [4]:
def combine_lists(list1:list[str], list2:list[str])->list[str]:
    """
    Builds every pairing of a part-a answer with a part-b answer.

    Each element of list1 is paired with each element of list2 and rendered
    as "a) <list1 item> b) <list2 item>", so the "a)" label always belongs
    to the first argument and the "b)" label to the second.

    Args:
        list1: list of strings (answers for part a)
        list2: list of strings (answers for part b)

    Returns:
        list of len(list1) * len(list2) strings, ordered by list1 item
        first and list2 item second
    """
    return [
        "a) " + str(part_a) + " b) " + str(part_b)
        for part_a, part_b in itertools.product(list1, list2)
    ]

# essay set 1

In [5]:
essaySet_1_model_answers = ['You need to know how much vinegar was used in each container.',
 'You need to know what type of vinegar was used in each container.',
 ' You need to know what materials to test.',
 'You need to know what size/surface area of materials should be used.',
 'You need to know how long each sample was rinsed in distilled water.',
 'You need to know what drying method to use.',
 'You need to know what size/type of container to use.',] 

In [6]:
# Candidate answers for essay set 1: rows where both scores agree and equal 3
essaySet_1_answers = df.query("EssaySet == 1 and Score1 == Score2 and Score1 == 3")["EssayText"]
# Randomly draw 15 (= 22 - 7) of them; 7 is the number of model answers listed in the previous cell.
# No random seed is set in the cells shown, so the draw differs on every run.
essaySet_1_answers = essaySet_1_answers.sample(22-7)
essaySet_1_answers = essaySet_1_answers.values.tolist()
# preview the first sampled answer
essaySet_1_answers[0]

'Some additional information I would need in order to replicate this experiment would be the amount of vinegar poured into each seperate containers. If I did not have the data table, I would also need to be told in the procedure what the four different samples are. It would also be helpful to know what type of containers were used. In order to replicate on experiement, the procedure must be precise and not have any information out, or else the data may come out completely different and not be comperable to the first set of data.'

In [7]:
# essaySet_1_answers = list(itertools.chain.from_iterable(essaySet_1_answers))
# Append the sampled student answers to the model answers.
# NOTE: `+=` extends the list in place, so re-running this cell appends the samples again.
essaySet_1_model_answers += essaySet_1_answers
# save to essaySet_1_model_answers.pickle
save_obj(essaySet_1_model_answers, "essaySet_1_model_answers")

In [8]:
len(essaySet_1_model_answers), len(essaySet_1_answers)

(22, 15)

# essay set 2

In [9]:
essaySet_2_model_answers_part_a = ["Plastic sample B has more stretchability than the other polymer plastics.",
    "Plastic sample A has the least amount of stretchability compared to the other polymer plastics.",
    "Not all polymer plastics have the same stretchability.",
    "Different polymer plastics have different stretchability (and are therefore suited for different applications).",
    "A reasonable conclusion cannot be drawn due to procedural errors.",]
essaySet_2_model_answers_part_b = ["Provide the before and after measurements for length (Did the samples all start out the same size?).",
    "Make sure the samples are all of the same thickness. Variations in thickness could have caused variations in stretchability.",
    "Perform additional trials. Some of the samples have similar stretchability (A and C, B and D). Two trials may not be enough to conclusively state that one is more stretchable than the other.",
    "Indicate how many weights were added to the clamps (Was it the same number for each sample?).",]
essaySet_2_model_answers = combine_lists(essaySet_2_model_answers_part_a, essaySet_2_model_answers_part_b)
len(essaySet_2_model_answers)

20

In [10]:
# Randomly sample 2 answers from essay set 2 (rows where both scores agree and equal 3)
essaySet_2_answers = df.query("EssaySet == 2 and Score1 == Score2 and Score1 == 3")["EssayText"]
essaySet_2_answers = essaySet_2_answers.sample(2).values.tolist()
# essaySet_2_answers = list(itertools.chain.from_iterable(essaySet_2_answers))
# NOTE: `+=` extends the list in place, so re-running this cell appends two more samples.
essaySet_2_model_answers += essaySet_2_answers
print(len(essaySet_2_model_answers))
# save to essaySet_2_model_answers.pickle
save_obj(essaySet_2_model_answers, "essaySet_2_model_answers")

22


# essay set 3 is a comprehension paragraph

In [11]:
def sample_wrt_length(df:pd.Series, nsamples:int, nshort:int, nlong:int, nrand:int)->list[str]:
    """
    Samples (nrand + nshort + nlong) essays from df based on the length of the essay.

    nsamples essays are first drawn at random. From that draw the nlong
    longest and the nshort shortest essays are kept, plus nrand essays drawn
    at random from the middle of the length ranking.

    Args:
        df: pandas Series of essay texts, as passed by the callers in this
            notebook (the parameter keeps its original name)
        nsamples: number of essays to pre-sample from df
        nshort: number of short essays to sample
        nlong: number of long essays to sample
        nrand: number of random essays to sample

    Returns:
        list of strings: the random picks first, then the nlong longest
        essays (longest first), then the nshort shortest essays

    Raises:
        ValueError: propagated from pandas when nsamples exceeds len(df), or
            when nrand exceeds the number of pre-sampled essays
    """
    # Extra essays skipped next to each extreme so the random picks come from
    # the middle of the ranking (values kept from the original implementation).
    top_margin, bottom_margin = 6, 7

    # rank the pre-sampled essays, longest first
    by_length = sorted(df.sample(nsamples).values.tolist(), key=len, reverse=True)

    longest = by_length[:nlong]
    # guard against nshort == 0: by_length[-0:] would return the whole list
    shortest = by_length[-nshort:] if nshort > 0 else []

    random_samples = []
    if nrand > 0:
        middle = by_length[nlong + top_margin:-(nshort + bottom_margin)]
        try:
            random_samples = pd.Series(middle, dtype=object).sample(nrand).tolist()
        except ValueError:
            # The middle band holds fewer than nrand essays: best-effort
            # fallback to the whole pre-sample (may repeat an extreme essay).
            random_samples = pd.Series(by_length, dtype=object).sample(nrand).tolist()

    return random_samples + longest + shortest

In [12]:
# picked at random
essaySet_3_answers = df.query("EssaySet == 3 and Score1 == Score2 and Score1==2")["EssayText"]
len(essaySet_3_answers), essaySet_3_answers.values[:2]

(291,
 array(["China's panda is similar to Australia's Koala because in the article it states that the panda only eats bamboo and the koala eats eucalyptus leaves constantly. they both only eat one thing and are different from pythons because they eat multiple things (not plants) and are more vicious.",
        'Pandas and koalas are similar because they both almost exclusively eat just one food, pandas eat bamboo, and koalas eat eucalyptus leaves. Both animals are different from the python because pythons can adapt to different food sources while pandas and koalas can not.'],
       dtype=object))

In [13]:
# Randomly pre-sample 100 essays from essay set 3, then build 22 answers from them:
# nshort=8 and nlong=5 length-extreme essays plus 22-5-8 = 9 random essays
# (see sample_wrt_length for how the three groups are selected).

essaySet_3_model_answers = sample_wrt_length(essaySet_3_answers, 100, 8, 5, 22-5-8)
save_obj(essaySet_3_model_answers, "essaySet_3_model_answers")

In [14]:
len(essaySet_3_model_answers), essaySet_3_model_answers[:2]

(22,
 ['Pandas in china are similar to koala bears in Australia because they are both specialists. They both have only eats one type of food. Koalas and pandas are both different from pythons have many different places they live in the world. Python are generalists.',
  'Pandas in China are similar to koalas in Australia because they  both eat leaves and plants. They are not carnivors like pythons. According to the article it states " A specialist is China\'s panda which eats almost nothing but bamboo, or Australia\'s koala bear which eats eucalyptus leaves most exclusively. The quote already states pandas and koalas do not like meat.'])

# essay sets 3 and 4 are comprehension paragraphs

In [15]:
essaySet_4_answers = df.query("EssaySet == 4 and Score1 == Score2 and Score1 == 2")["EssayText"]
len(essaySet_4_answers), essaySet_4_answers.values[:2]

(86,
 array(['The word "invasive" means intruding. Invasive species, like the python in the everglades are beginning to threat native species. They are new animals that threat biodiversity. These animals can cause animals to become extinct and are generalist so they can adapt to different environments and affect other people and organism\'s in and existence.',
        'The word, "invasive" plays a very important role in the article. Biologists, such as Skip Snow claim that these reptilian species of pythons and lizards are "invasive". This conclusion was reached by their adaptability to multiple environments and disrupting the ecosystems. However, some such as RobRay MacInnes, argue otherwise, "Invasive is the rod of the controversy over these reptiles.'],
       dtype=object))

In [16]:
# Randomly pre-sample 60 essays from essay set 4 (only 86 rows match the query above),
# then build 22 answers from them: nshort=8 and nlong=5 length-extreme essays
# plus 22-5-8 = 9 random essays (see sample_wrt_length).
essaySet_4_model_answers = sample_wrt_length(essaySet_4_answers, 60, 8, 5, 22-5-8)

save_obj(essaySet_4_model_answers, "essaySet_4_model_answers")

In [17]:
essaySet_4_model_answers[0:2]

['The word invasive means that a species was brought into an area, not by means of being a native species. The article deals with contrasts between snakes being invasive to areas such as the Everglades or if they are introduced. Invasive is a negative connotation because invasive species are usually threats to biodiversity. The article deals whether or not this is true of this case.',
 'The word "invasive" helps to create a debate in the article. Invasive species are animals that are introduced into an envoriment and thrive in it possible affecting other animals. The scientist feels the term "invasive species is unfair", referring to pythons. Biologists, however feel invasive species are major threats to biodiversity. The word "invasive" helps to provide debate on this article.']

# essay set 5 

A response needs to mention at least four of these elements to get full marks.


In [18]:
# def permute_answers(answers:list[str], low:int, high:int, n_samples:int)->list[str]:
#     def sample()->list[str]:
#         n = random.choice(list(range(low,high)))
#         return random.sample(answers, n)

#     permuted_answers = [sample() for _ in range(n_samples)]
#     # join each answer with a space
#     return [" ".join(x) for x in permuted_answers]
def sample_answers_with_permutation(answers:list[str], low:int, high:int, n_samples:int, bias:int = 2)->list[str]:
    """
    Builds synthetic answers by joining randomly chosen model statements.

    For every synthetic answer a statement count is drawn from
    range(low, high) (high is excluded), then that many distinct statements
    are picked from answers in random order and joined with a single space.

    Args:
        answers: list of model statements to pick from
        low: smallest number of statements in one synthetic answer
        high: exclusive upper bound on the number of statements
        n_samples: number of synthetic answers to build
        bias: extra weight added to the smallest count (low)

    Returns:
        list of n_samples strings
    """
    counts = list(range(low, high))

    # Weights are the counts in reverse order, so smaller counts are more
    # likely; `bias` is added on top of the weight of the smallest count.
    padded_bias = [bias] + [0] * (len(counts) - 1)
    raw_weights = np.array(counts[::-1]) + np.array(padded_bias)
    probabilities = raw_weights / raw_weights.sum()

    combined = []
    for _ in range(n_samples):
        drawn = np.random.choice(counts, 1, p=probabilities, replace=False)
        how_many = drawn.tolist()[0]
        # random.sample picks without replacement and returns a random order
        combined.append(" ".join(random.sample(answers, how_many)))
    return combined

In [19]:
discrete_answers = ["mRNA exits nucleus via nuclear pore.",
    "mRNA travels through the cytoplasm to the ribosome or enters the rough endoplasmic reticulum.",
    "mRNA bases are read in triplets called codons (by rRNA).",
    "tRNA carrying the complementary (U=A, C+G) anticodon recognizes the complementary codon of the mRNA.",
    "The corresponding amino acids on the other end of the tRNA are bonded to adjacent tRNA's amino acids.",
    "A new corresponding amino acid is added to the tRNA.",
    "Amino acids are linked together to make a protein beginning with a START codon in the P site (initiation).",
    "Amino acids continue to be linked until a STOP codon is read on the mRNA in the A site (elongation and termination).",]

In [20]:
# Build 6 synthetic answers, each joining 4 to 6 distinct model statements in random order
# (the count is drawn from range(4, len(discrete_answers)-1), i.e. 4, 5 or 6 for the 8 statements above)
discrete_answers_perm = sample_answers_with_permutation(discrete_answers, 4, len(discrete_answers)-1, 6)
len(discrete_answers_perm)

6

In [21]:
discrete_answers_perm[5]

'Amino acids are linked together to make a protein beginning with a START codon in the P site (initiation). mRNA travels through the cytoplasm to the ribosome or enters the rough endoplasmic reticulum. A new corresponding amino acid is added to the tRNA. Amino acids continue to be linked until a STOP codon is read on the mRNA in the A site (elongation and termination). tRNA carrying the complementary (U=A, C+G) anticodon recognizes the complementary codon of the mRNA.'

In [22]:
essaySet_5_answers = df.query("EssaySet == 5 and Score1 == Score2 and Score1 == 3")["EssayText"]
len(essaySet_5_answers), essaySet_5_answers.values[:2]

(32,
 array(['Four major steps involved in protein synthesis are as follows, mRNA enters a ribosome. Then, tRNA delivers amino acids to the ribosome. The mRNA codon matches with the tRNA anti-codon. Also, the amino acids are joined by a condensation reaction. This process is repeated until a stop code is reached.',
        "When the mRNA leaves the nucleus, it fits through the holes by virtue of being single-stranded. The mRNA floats around until it encounters a ribosome. The ribosome latches on to the mRNA with the larger portion nesting with and reading three nitrogenous bases, called a codon, at a time. The ribosome says in semi-stasis until a tRNA with the proper corresponding nitrogenous bases bumps into it. The tRNA is held by the ribosome until the second codon has been read, and a second tRNA is bonded to by the Ribosome. A peptide (dehydration synthesis) bond forms between the two amino acids, and the initial tRNA is 'cut loose' The ribosome then moves on down the mRNA creatin

In [23]:
# Shuffle the essay set 5 candidates (sample(32) draws all 32 rows matched by the query above)
essaySet_5_answers_sample = essaySet_5_answers.sample(32)
# Sort longest-first, then keep 7 random essays from the middle of the ranking,
# the 3 longest and the 6 shortest essays (16 essays in total).

essaySet_5_answers_sample_sorted = sorted(essaySet_5_answers_sample.values.tolist(), key=len, reverse=True)
# NOTE: DataFrame.values.tolist() yields one-element rows, so essay_rand is a list of [text] lists
essay_rand = pd.DataFrame(essaySet_5_answers_sample_sorted[5:-7]).sample(7).values.tolist()

essaySet_5_model_answers = essay_rand+ essaySet_5_answers_sample_sorted[:3] +\
                            essaySet_5_answers_sample_sorted[-6:]
# add the synthetic answers in discrete_answers_perm (6 were built above) to reach 22
essaySet_5_model_answers = essaySet_5_model_answers + discrete_answers_perm
print(len(essaySet_5_model_answers))
save_obj(essaySet_5_model_answers, "essaySet_5_model_answers")

22


# essay set 6

A response needs to mention at least three of these elements to get full marks.


In [24]:
discrete_answers = ["Selective permeability is used by the cell membrane to allow certain substances to move across.",
    "Passive transport occurs when substances move from an area of higher concentration to an area of lower concentration.",
    "Osmosis is the diffusion of water across the cell membrane.",
    "Facilitated diffusion occurs when the membrane controls the pathway for a particle to enter or leave a cell.",
    "Active transport occurs when a cell uses energy to move a substance across the cell membrane, and/or a substance moves from an area of low to high concentration, or against the concentration gradient.",
    "Pumps are used to move charged particles like sodium and potassium ions through membranes using energy and carrier proteins.",
    "Membrane-assisted transport occurs when the membrane of the vesicle fuses with the cell membrane forcing large molecules out of the cell as in exocytosis.",
    "Membrane-assisted transport occurs when molecules are engulfed by the cell membrane as in endocytosis.",
    "Membrane-assisted transport occurs when vesicles are formed around large molecules as in phagocytosis.",
    "Membrane-assisted transport occurs when vesicles are formed around liquid droplets as in pinocytosis.",
    "Protein channels or channel proteins allow for the movement of specific molecules or substances into or out of the cell.",]

In [25]:
# Build 12 synthetic answers, each joining 3 to 8 distinct model statements in random order
# (the count is drawn from range(3, len(discrete_answers)-2) for the 11 statements above)
discrete_answers_perm = sample_answers_with_permutation(discrete_answers, 3, len(discrete_answers)-2, 12)
len(discrete_answers_perm)

12

In [26]:
discrete_answers_perm[5]

'Passive transport occurs when substances move from an area of higher concentration to an area of lower concentration. Membrane-assisted transport occurs when molecules are engulfed by the cell membrane as in endocytosis. Membrane-assisted transport occurs when the membrane of the vesicle fuses with the cell membrane forcing large molecules out of the cell as in exocytosis.'

In [27]:
essaySet_6_answers = df.query("EssaySet == 6 and Score1 == Score2 and Score1 == 3")["EssayText"]
len(essaySet_6_answers), essaySet_6_answers.values[:2]

(44,
 array(["1.     -Osmosis is how water gets diffused thru the membrane. 2.     -Active Transport is where an enzyme opens the cell membrane for an object to come in, and extra energy is needed to assist the object into the cell.3.     -Passive Transport is where an enzyme opens the cell, but the object doesn't need the extra energy to come in.",
        'One of the ways cells can move substances across the cell membrane is by diffusion. Diffusion moves substances from an area of high concentration, to an area of low concentration. Another way cells move substances across the cell membrane is by osmosis. Osmosis moves water down the cell membrane. The last way cells move substances across the cell membrane is by the sodium-potassium pump. The sodium-potassium pump takes sodium ions out of the membrane and puts potassium ions into the membrane.'],
       dtype=object))

In [28]:
# Randomly sample 30 of the 44 essay set 6 candidates matched by the query above
essaySet_6_answers_sample = essaySet_6_answers.sample(30)
# Sort longest-first, then keep 4 random essays from the middle of the ranking,
# the 2 longest and the 4 shortest essays (10 essays in total).

essaySet_6_answers_sample_sorted = sorted(essaySet_6_answers_sample.values.tolist(), key=len, reverse=True)
# NOTE: DataFrame.values.tolist() yields one-element rows, so essay_rand is a list of [text] lists
essay_rand = pd.DataFrame(essaySet_6_answers_sample_sorted[5:-5]).sample(4).values.tolist()
essaySet_6_model_answers = essay_rand+ essaySet_6_answers_sample_sorted[:2] +\
                            essaySet_6_answers_sample_sorted[-4:]
# add the synthetic answers in discrete_answers_perm (12 were built above) to reach 22
essaySet_6_model_answers = essaySet_6_model_answers + discrete_answers_perm
print(len(essaySet_6_model_answers))
save_obj(essaySet_6_model_answers, "essaySet_6_model_answers")

22


# essay sets 7, 8 and 9 are comprehension paragraphs

In [29]:
for essay in ["7","8","9"]:
    # candidate answers: rows of this essay set where both scores agree and equal 2
    answers = df.query(f"EssaySet == {essay} and Score1 == Score2 and Score1 == 2")["EssayText"]
    print("essay set",essay,len(answers), answers.values[:1],sep=" ** ")
    # Randomly pre-sample 100 essays from the essay set, then keep
    # nshort=6 and nlong=6 length-extreme essays plus 10 random essays
    # (22 in total; see sample_wrt_length).

    model_answers = sample_wrt_length(answers, 100, 6, 6, 10)
    save_obj(model_answers, f"essaySet_{essay}_model_answers")

essay set ** 7 ** 400 ** ["Rose is thoughtful and caring.  She has plenty of topics or activities in her life that she could be complaining about; however she puts others before herself.       For example in the story Aunt Kolab asks Rose if the work she does to help the family 'weighs her down'.  Rose did not want to tell her the truth because she did not want to hurt her aunt.  Therefore, one can see Rose cares about others emotions and can be characterized as thoughtful."]
essay set ** 8 ** 687 ** ["Paul finds out that Mr. Leonard was a track star but he could not read. 'No school wanted a runner who couldn't read.' Paul listened to Mr. Leonard about his past and realized that is was similiar to his present. Paul decided that because Mr. Leonard had helped him with track that he needed to help Mr. Leonard out with his reading. 'C'mon, Mr. Leonard, it's time to start your training.'"]
essay set ** 9 ** 517 ** ['The author organizes the article by catching your attention with shocking

# essay set 10

In [30]:
temp = ["Black :: The doghouse will be warmer. The black lid made the jar warmest.",
    "Dark gray :: The inside will be a little warmer,but not too hot. The dark gray lid increased 6º C more than the white.",
    "Light gray	::	The inside will stay cooler, but not too cool.	The light gray lid was 8º C cooler than the black.",
    "White :: The inside will be cooler. The white lid only went up to 42º C.",]

In [31]:
# choose random sample
essaySet_10_answers = df.query("EssaySet == 10 and Score1 == Score2 and Score1==2")["EssayText"]
len(essaySet_10_answers), essaySet_10_answers.values[:2]

(516,
 array(['black :: black might affect the dog house because then the doghouse can get to hot for the dog. iknow this because in the resul ts black got the hottest.',
        'white :: the color white will reflect the suns rays of f of it because in the lab the color white had the lowest tempeture, and black absorbed the heat so it was hotter and if they had the doghouse pai nted black the dog would get to hot and die. so white'],
       dtype=object))

In [33]:
# Split the essay set 10 candidates by the colour they start with.
# str.startswith is case-sensitive, so only lowercase prefixes are matched here.

# find all the answers that start with "black"
rand_black = df.query("EssaySet == 10 and Score1 == Score2 and Score1==2 and EssayText.str.startswith('black')")["EssayText"]
# find all the answers that start with "light gray"
rand_light_gray = df.query("EssaySet == 10 and Score1 == Score2 and Score1==2 and EssayText.str.startswith('light gray')")["EssayText"]

# find all the answers that start with "dark gray"
rand_dark_gray = df.query("EssaySet == 10 and Score1 == Score2 and Score1==2 and EssayText.str.startswith('dark gray')")["EssayText"]

# find all the answers that start with "white"
rand_white = df.query("EssaySet == 10 and Score1 == Score2 and Score1==2 and EssayText.str.startswith('white')")["EssayText"]

In [34]:
# sample 4 light gray, 4 dark gray, 5 white and 5 black answers (18 in total)
light_gray = rand_light_gray.sample(4).values.tolist()
dark_gray = rand_dark_gray.sample(4).values.tolist()
white = rand_white.sample(5).values.tolist()
black = rand_black.sample(5).values.tolist()

essy_10_rand = light_gray + dark_gray + white + black

# add the 4 reference answers in `temp` to reach 22
esseySet_10_model_answers = essy_10_rand + temp
len(esseySet_10_model_answers)

22

In [35]:
save_obj(esseySet_10_model_answers, "essaySet_10_model_answers")

In [3]:
# load model answers
# Reload every model-answer list from its '.pickle' file written by save_obj.
# CAUTION: unpickling can execute arbitrary code; only load files you created yourself.
essaySet_1_model_answers = load_obj("essaySet_1_model_answers")
essaySet_2_model_answers = load_obj("essaySet_2_model_answers")
essaySet_3_model_answers = load_obj("essaySet_3_model_answers")
essaySet_4_model_answers = load_obj("essaySet_4_model_answers")
essaySet_5_model_answers = load_obj("essaySet_5_model_answers")
essaySet_6_model_answers = load_obj("essaySet_6_model_answers")
essaySet_7_model_answers = load_obj("essaySet_7_model_answers")
essaySet_8_model_answers = load_obj("essaySet_8_model_answers")
essaySet_9_model_answers = load_obj("essaySet_9_model_answers")
essaySet_10_model_answers = load_obj("essaySet_10_model_answers")

In [4]:
# all len of model answers
len(essaySet_1_model_answers), len(essaySet_2_model_answers), len(essaySet_3_model_answers), len(essaySet_4_model_answers), len(essaySet_5_model_answers), len(essaySet_6_model_answers), len(essaySet_7_model_answers), len(essaySet_8_model_answers), len(essaySet_9_model_answers), len(essaySet_10_model_answers)

(22, 22, 22, 22, 22, 22, 22, 22, 22, 22)

In [5]:
# check for answers that aren't in the expected shape (nested lists instead of plain strings)
# flatten the list of lists
def flatten(ls_ls):
    """
    Flattens one level of nesting.

    Items that are lists contribute all of their elements, in order; any
    other item is kept as is. An empty nested list contributes nothing.

    Args:
        ls_ls: list whose items are either plain values (e.g. strings) or
            lists of values

    Returns:
        a new flat list; the input is not modified
    """
    out = []
    for item in ls_ls:
        if isinstance(item, list):
            # keep every element of the nested list, not just the first
            out.extend(item)
        else:
            out.append(item)
    return out

In [14]:
ess_dict = {
    "essaySet_1_model_answers": essaySet_1_model_answers,
    "essaySet_2_model_answers": essaySet_2_model_answers,
    "essaySet_3_model_answers": essaySet_3_model_answers,
    "essaySet_4_model_answers": essaySet_4_model_answers,
    "essaySet_5_model_answers": essaySet_5_model_answers,
    "essaySet_6_model_answers": essaySet_6_model_answers,
    "essaySet_7_model_answers": essaySet_7_model_answers,
    "essaySet_8_model_answers": essaySet_8_model_answers,
    "essaySet_9_model_answers": essaySet_9_model_answers,
    "essaySet_10_model_answers": essaySet_10_model_answers,
}

In [15]:
# Replace every answer list with its flattened version.
# Assigning to existing keys while iterating is safe: the dict's size does not change.
for i,v in ess_dict.items():
    ess_dict[i] = flatten(v)

In [16]:
for i,v in ess_dict.items():
    print(len(v))

22
22
22
22
22
22
22
22
22
22


In [19]:
# Write each list in ess_dict back to disk; the dict key doubles as the
# file name, so the earlier '.pickle' files are overwritten.
for i,v in ess_dict.items():
    save_obj(v, i)