In [None]:
# Import 
import pandas as pd
import re

# Import custom libraries
import util
from util import UtilityFunctions as uf

# Reload the custom library so that edits to util.py are picked up without
# restarting the kernel. Needs to be executed every time new functions are
# added to util.py.
from importlib import reload
reload(util)

# reload() re-executes util.py, which creates a *new* UtilityFunctions class
# object inside the module. The alias bound earlier by
# `from util import UtilityFunctions as uf` still refers to the old class, so
# it has to be rebound explicitly; otherwise newly added functions would not be
# reachable through `uf` or through the patches below.
uf = util.UtilityFunctions

# Monkey patch the methods from the utility class onto the pandas DataFrame so
# they can be called as dataframe methods (e.g. df.filter_numerical_values()).
pd.DataFrame.filter_features = uf.filter_features
pd.DataFrame.filter_numerical_values = uf.filter_numerical_values
pd.DataFrame.filter_negative_values = uf.filter_negative_values
pd.DataFrame.filter_columns_with_less_unique_values_than_threshold = uf.filter_columns_with_less_unique_values_than_threshold
pd.DataFrame.drop_columns = uf.drop_columns

In [None]:
# Countries to extract from every wave: (numeric country code, name) pairs.
# The code is matched against the country column of each raw file and the
# name is used when building the output file name.
countries = [
    (208, 'Denmark'),
    (578, 'Norway'),
    (276, 'Germany'),
    (840, 'USA'),
]



In [None]:
# Collect the raw CSV file paths through the utility class and put them in
# ascending order before showing them.
raw_files = list(uf.get_csv_files_from_folder())
raw_files.sort()
print(raw_files)

['./../data/raw/WV3_Data_csv_v20180912.csv', './../data/raw/WV4_Data_csv_v20201117.csv', './../data/raw/WV5_Data_csv_v20180912.csv', './../data/raw/WV6_Data_csv_v20201117.csv', './../data/raw/WV7_Cross-National_Wave_7_csv_v6_0.csv']


In [None]:
# Columns removed from the cleaned data, listed per wave.
# Each name appears exactly once: the previous version of the wave 7 list
# repeated 13 of its entries (e.g. 'ivlength', 'mode', 'cntry_AN'), which made
# it hard to tell what is actually being dropped.
columns_to_drop_wvs7 = [
    'uniqid', 'ivlength', 'ivstart', 'ivstend', 'ivdate', 'reg_iso',
    'size_5c', 'gwght', 'respint', 'X002_02A', 'doi_gesis', 'doi_wvsa',
    'cntry_AN', 'lnge_iso', 'version', 'reg_nuts1', 'reg_nuts2',
    'X002_02B', 'V002A', 'V002A_01', 'V001A', 'V001A_01', 'X003R',
    'X003R2', 'mode', 'intrvwr_id', 'cntrycow', 'year', 'fw_start', 'fw_end',
    'mm_mixed_mode_EVS5', 'mm_mode_fu_EVS5', 'mm_matrix_group_EVS5', 'mm_fw_start_fu_EVS5',
    'mm_fw_end_fu_EVS5', 'mm_year_fu_EVS5',
    'mm_v277_fu_EVS5', 'mm_v278a_fu_r_EVS5', 'mm_v279a_fu_r_EVS5', 'lnge_num',
    'pwght', 'wght_eq1000',
]

columns_to_drop_wvs6 = []

columns_to_drop_wvs5 = []

columns_to_drop_wvs4 = []

# Combined drop list for all waves. dict.fromkeys keeps the first occurrence of
# every name in order, so a column listed for several waves is not repeated.
columns_to_drop = list(dict.fromkeys(
    columns_to_drop_wvs7 + columns_to_drop_wvs6 + columns_to_drop_wvs5 + columns_to_drop_wvs4
))

In [None]:

# Candidate column names that may hold the country code; the list is handed to
# uf.find_country_feature_name when each raw file is processed. Some entries
# deliberately include literal double quotes.
country_features = [
    'C_COW_ALPHA',
    'COW',
    'cntry',
    '"cntry"',
    'V2',
    'v2',
    '"C_COW_NUM"',
    'C_COW_NUM',
]

# Presumably the column names that hold the survey year in the different
# waves -- TODO confirm; not referenced by the cells visible here.
year_variables = [
    'year',
    'V237',
    'V262',
    'V238',
    'V246',
    'V260',
]


In [None]:


# List of possible delimiters; the order doubles as the tie-break preference
delimiters = [';', ',', '\t']


def detect_delimiter(file_path, delimiters):
    """Guess the field delimiter of a text file from its first line.

    The candidate that occurs most often in the first line is chosen, so a
    stray ';' inside a quoted field of a comma-separated header no longer
    wins just because ';' is checked first. When several candidates occur
    equally often, the one listed first in ``delimiters`` is returned.

    Parameters
    ----------
    file_path : str
        Path of the file to inspect.
    delimiters : list of str
        Candidate delimiters, in order of preference.

    Returns
    -------
    str
        The detected delimiter, or ',' when none of the candidates appears in
        the first line (including when the file is empty).

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # The candidates are plain ASCII, so undecodable bytes elsewhere in the
    # line are irrelevant for detection; replace them instead of failing.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        sample = f.readline()

    # Default to ',' if no delimiter is detected
    best_delimiter, best_count = ',', 0
    for delimiter in delimiters:
        occurrences = sample.count(delimiter)
        # Strict comparison keeps the earliest candidate on ties.
        if occurrences > best_count:
            best_delimiter, best_count = delimiter, occurrences
    return best_delimiter

# Split every raw wave file into one cleaned dataframe per country of interest
# and save each of them. Problems with a single file or country are reported
# and skipped so that the remaining files are still processed.
# Assumes raw_files is a list of file paths.
for csv_file in raw_files:  # Loop over each raw file
    print(f"[INFO] Processing {csv_file}")

    # Extract the version (e.g. 4 from WV4) from the file name first: this is a
    # cheap check, so a file that would be skipped anyway is never loaded.
    match = re.search(r'WV(\d+)', csv_file)
    if match is None:
        # Handle cases where no version is found in the file name
        print(f"[WARNING] Skipping file {csv_file}, no version found in the file name.")
        continue  # Skip to the next file
    version = match.group(1)

    try:
        # Delimiter detection opens the file as well, so it belongs inside the
        # same guard as the actual read: an unreadable file is skipped instead
        # of aborting the whole loop.
        delimiter = detect_delimiter(csv_file, delimiters)
        # Read the data from the file using the detected delimiter
        dataframe = uf.read_data(csv_file, delimiters=[delimiter])
    except Exception as e:
        print(f"[ERROR] Failed to read file {csv_file}: {e}")
        continue

    # Find the column name that corresponds to the country code
    country_code_feature_name = uf.find_country_feature_name(
        dataframe, countries=countries, country_features=country_features
    )

    # If no valid country code feature is found, skip to the next file
    if country_code_feature_name is None:
        print(f"[WARNING] No country_code_feature_name found! {csv_file}")
        continue

    # Process the dataframe for each country
    for country_code, name in countries:
        # Keep only the rows whose country code matches. The explicit copy
        # gives the cleaning helpers an independent frame, so pandas does not
        # warn about writing to a slice should any of them assign in place.
        country_rows = dataframe[country_code_feature_name] == country_code
        country_dataframe = dataframe[country_rows].copy()

        # Skip if no data for the given country
        if country_dataframe.empty:
            print(f"[WARNING] country_dataframe is empty! {csv_file}")
            continue

        # Apply cleaning steps (the filter_* methods are the ones monkey
        # patched onto pd.DataFrame from the utility class)
        try:
            country_dataframe = country_dataframe.filter_numerical_values()  # Remove non-numerical values
            country_dataframe = country_dataframe.filter_negative_values()  # Remove negative values
            country_dataframe = country_dataframe.filter_columns_with_less_unique_values_than_threshold(2)  # Drop columns with too few unique values
            country_dataframe = uf.drop_columns(country_dataframe, columns_to_drop)  # Drop specified columns
        except Exception as e:
            print(f"[ERROR] Cleaning failed for {csv_file}: {e}")
            continue

        # Save the cleaned dataframe to the processed folder
        file_name = f"WV{version}_{name}"  # Construct the file name
        try:
            uf.save_dataframe(country_dataframe, file_name)
            print(f"** Saved {file_name} **")
        except Exception as e:
            print(f"[ERROR] Failed to save file {file_name}: {e}")


[INFO] Processing ./../data/raw/WV3_Data_csv_v20180912.csv
  Found country code '840' in feature 'COW' for 'USA'
[INFO] Saved WV3_USA
[INFO] Processing ./../data/raw/WV4_Data_csv_v20201117.csv
  Found country code '840' in feature 'COW' for 'USA'
[INFO] Saved WV4_USA
[INFO] Processing ./../data/raw/WV5_Data_csv_v20180912.csv
  Found country code '578' in feature 'V2' for 'Norway'
[INFO] Saved WV5_Norway
[INFO] Saved WV5_Germany
[INFO] Saved WV5_USA
[INFO] Processing ./../data/raw/WV6_Data_csv_v20201117.csv
  Found country code '840' in feature 'COW' for 'USA'
[INFO] Saved WV6_USA
[INFO] Processing ./../data/raw/WV7_Cross-National_Wave_7_csv_v6_0.csv
  Found country code '840' in feature 'C_COW_NUM' for 'USA'
[INFO] Saved WV7_USA
