# Environment setup

Let's make sure our environment is set up correctly and save a requirements.txt file to prove it.

In [1]:
# Expected output: a Python 3.8.x version string. Anything else means the
# environment is not configured as required -- stop and fix it first.

from platform import python_version

running_version = python_version()
print(running_version)

3.8.5


In [2]:
import os

# Show which conda environment the kernel is running in. The variable is
# looked up with .get() so that a kernel started without CONDA_DEFAULT_ENV
# reports that clearly instead of stopping with a bare KeyError.
conda_env = os.environ.get('CONDA_DEFAULT_ENV')
if conda_env is None:
    print('CONDA_DEFAULT_ENV is not set (no active conda environment detected).')
else:
    print(conda_env)

base


In [3]:
# These should be the only libraries you need in your environment.
# pandas, numpy, pyarrow

# % pip install numpy pandas pyarrow

import importlib
import importlib.util
import subprocess
import sys

# Report whether each required package can be found, and try to install
# any that are missing.
package_names = ['numpy', 'pandas', 'pyarrow']
for pkg in package_names:
    spec = importlib.util.find_spec(pkg)
    if spec is None:
        print('{} is not installed.'.format(pkg))
        print('Installing {}'.format(pkg))
        # Run pip through the kernel's own interpreter so the package is
        # installed into this environment, and pass the package name held
        # in `pkg` (not the literal text "pkg").
        result = subprocess.run(
            [sys.executable, '-m', 'pip', 'install', pkg],
            capture_output=True,
            text=True,
        )
        # Echo pip's output so it is visible in the notebook cell.
        print(result.stdout)
        if result.returncode == 0:
            # Let the import system notice the newly installed package.
            importlib.invalidate_caches()
        else:
            print(result.stderr)
            print('Failed to install {}.'.format(pkg))
    else:
        print('{} is installed.'.format(pkg))

numpy is installed.
pandas is installed.
pyarrow is installed.


In [4]:
# This will be a silent operation. You will see no output.
# Instead, the output is saved into a file called 'requirements.txt'
# in your working directory, which you should submit with your
# assignment to show that everything is OK.

import subprocess
import sys

# Run pip through the kernel's own interpreter so the frozen list describes
# the environment this notebook is actually running in, rather than
# whichever `pip` executable happens to be first on PATH.
with open('requirements.txt', 'w') as requirements_file:
    subprocess.run(
        [sys.executable, '-m', 'pip', 'freeze'],
        stdout=requirements_file,
        check=True,  # fail loudly instead of leaving an empty file behind
    )

### Read a feather file and return a dataframe. This is already done for you. You just have to call it from main to convert a feather file into a dataframe.

In [None]:
import pyarrow.feather as feather
import pandas as pd


In [None]:
def arrow_to_df(input_file_name):
    """Read a feather file and return its contents.

    Args:
        input_file_name: path of the feather file to read.

    Returns:
        Whatever feather.read_feather produces for that file (a dataframe).
    """
    return feather.read_feather(input_file_name)

### Write a feather file using a dataframe.

In [None]:
def df_to_arrow(output_file_name, df):
    """Write a dataframe to a feather file using zstd compression.

    Args:
        output_file_name: path of the feather file to write.
        df: dataframe to store.

    Returns:
        None.
    """
    feather.write_feather(
        df,
        output_file_name,
        compression='zstd',
    )

In [None]:
def write_ids_to_file(df, out_file):
    """Write the values of the dataframe's 'Id' column to a text file.

    The ids are written in dataframe order, one per line, with no newline
    after the last id. The file is opened in 'w' mode, so existing content
    is replaced.

    Args:
        df: dataframe that has an 'Id' column.
        out_file: path of the text file to write.

    Returns:
        None.
    """
    with open(out_file, 'w') as id_file:
        id_lines = [str(post_id) for post_id in df['Id'].to_list()]
        id_file.write('\n'.join(id_lines))
        # Debug aid: uncomment the line below to echo the ids written.
        #print ('{}:{}'.format(out_file, ' '.join(id_lines)))

### Split the Posts dataframe into questions and answers based on the value of the PostTypeId, where a question = 1 and an answer = 2. You should return 2 new dataframes, one containing only questions and the other containing only answers.

In [None]:
def split_df(df):
    """Split a posts dataframe into answers and questions by 'PostTypeId'.

    Rows with PostTypeId == 2 are answers and rows with PostTypeId == 1 are
    questions. Both subsets are also saved with df_to_arrow, to
    'Answers.feather' and 'Questions.feather'.

    Args:
        df: dataframe that has a 'PostTypeId' column.

    Returns:
        A tuple (answers, questions) -- note that answers come first.
    """
    post_type = df['PostTypeId']
    answers = df[post_type == 2]
    questions = df[post_type == 1]

    # Save both subsets for later use.
    df_to_arrow('Answers.feather', answers)
    df_to_arrow('Questions.feather', questions)

    return answers, questions

### Main Loop:
* First read the Posts.feather file into a dataframe using the provided function.
* Now get the split_df function working to generate the two new dataframes containing questions and answers.
* Now write out the two new dataframes -- Answers.feather and Questions.feather for later.
* Finally, you need to call write_ids_to_file() for answers and questions. The output file names should be "Answer-Ids.txt" and "Question-Ids.txt". This will be our sanity check that you got the split function correct. Make sure you use the output file names exactly as shown (case sensitive).

In [None]:
def main():
    """Load Posts.feather, split it, and write the id lists for each part.

    Reads 'Posts.feather', passes it to split_df, then writes the answer
    ids to 'Answer-Ids.txt' and the question ids to 'Question-Ids.txt'.

    Returns:
        None.
    """
    # Load every post from the feather file.
    all_posts = arrow_to_df('Posts.feather')

    # split_df hands back the answers first, then the questions.
    answers, questions = split_df(all_posts)

    # Write one id file per subset; answers are written before questions.
    id_outputs = (
        (answers, 'Answer-Ids.txt'),
        (questions, 'Question-Ids.txt'),
    )
    for frame, ids_path in id_outputs:
        write_ids_to_file(frame, ids_path)

In [None]:
# Entry point, following the "main" pattern shown in Lectorial 2:
# call main() only when this code is executed as the top-level program,
# then report that it finished.
if __name__ == '__main__':
    main()
    print('[INFO] Script completed with no errors.')