## Preprocessing

### Import

In [None]:
# Import third-party libraries
import pandas as pd

# Import custom libraries
import util
from util import UtilityFunctions as uf
from analysis import Analysis as an

# Reload the custom library so that edits to util.py are picked up without
# restarting the kernel. Re-run this cell whenever util.py changes.
from importlib import reload
reload(util)

# reload() re-executes the module but does not rebind names that were imported
# from it earlier. Refresh the alias, otherwise `uf` would still point at the
# pre-reload class and the patches below would install stale functions.
uf = util.UtilityFunctions

# Monkey patch the utility functions onto pandas.DataFrame so they can be
# called as DataFrame methods (the DataFrame is passed as the first argument).
pd.DataFrame.filter_features = uf.filter_features
pd.DataFrame.filter_numerical_values = uf.filter_numerical_values
pd.DataFrame.filter_negative_values = uf.filter_negative_values
pd.DataFrame.filter_columns_with_less_unique_values_than_threshold = uf.filter_columns_with_less_unique_values_than_threshold
pd.DataFrame.drop_columns = uf.drop_columns

### Select country codes to process

In [None]:
# Countries to process, as (country code, country name) pairs.
# The code is compared against the `cntry` column of the raw data and the
# name is used in the output file name.
# NOTE(review): the codes look like ISO 3166-1 numeric codes - confirm.
countries = [
    (208, 'Denmark'),
    (578, 'Norway'),
    (276, 'Germany'),
    (840, 'USA'),
]

### Select raw data files

In [None]:
# Collect the raw CSV files to process. No folder argument is passed, so the
# helper's default location is used (presumably the raw-data folder - confirm
# in util.py).
raw_files =  uf.get_csv_files_from_folder()
# Print the result so the selection can be checked before processing.
print(raw_files)

### Define features to remove based on handbook

In [None]:
# Columns that are removed from every per-country dataset before saving.
# The selection is based on the survey handbook; order is kept as authored.
columns_to_drop = [
    'uniqid',
    'ivlength',
    'ivstart',
    'ivstend',
    'ivdate',
    'reg_iso',
    'size_5c',
    'gwght',
    'respint',
    'X002_02A',
    'doi_gesis',
    'doi_wvsa',
    'cntry_AN',
    'lnge_iso',
    'version',
    'reg_nuts1',
    'reg_nuts2',
    'X002_02B',
    'V002A',
    'V002A_01',
    'V001A',
    'V001A_01',
    'X003R',
    'X003R2',
    'mode',
    'intrvwr_id',
]

### Process and save files

In [None]:
# Split every raw file into one reduced dataset per selected country.
# For each country the matching rows are selected on `cntry` and passed
# through the utility filters (non-numerical values, negative values, and
# columns with fewer than 2 unique values, i.e. constant columns). Then the
# handbook columns are dropped and the result is saved under the name
# "<year>_<country name>_<version>".
for csv_file in raw_files:
    dataframe = pd.read_csv(csv_file)

    # A file without data rows has no year/version to read. Skip it instead
    # of failing with an IndexError and aborting the remaining files.
    if dataframe.empty:
        print(f"Skipping {csv_file}: no rows")
        continue

    # Year and version are read from the first row
    # (assumes they are constant within a file - TODO confirm).
    year = str(dataframe['year'].iloc[0])
    # The version is the text between the first pair of parentheses in
    # `versn_s`. Read the cell value itself rather than parsing the printed
    # representation of a Series, which also contains index, name and dtype.
    version = str(dataframe['versn_s'].iloc[0]).split('(')[1].split(')')[0]
    print(f"{year} {version}")

    for country_code, name in countries:
        country_dataframe = dataframe[dataframe['cntry'] == country_code]
        country_dataframe = country_dataframe.filter_numerical_values()
        country_dataframe = country_dataframe.filter_negative_values()
        country_dataframe = country_dataframe.filter_columns_with_less_unique_values_than_threshold(2)
        country_dataframe = uf.drop_columns(country_dataframe, columns_to_drop)
        file_name = f"{year}_{name}_{version}"
        uf.save_dataframe(country_dataframe, file_name)

### Filter out features

In [None]:
# Remove or only select features that are relevant for the analysis
