In [None]:
import pandas as pd
import missingno as msno
import scipy.stats as ss
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from miceforest import ImputationKernel


# Loading of data

In [None]:
# Row-batch size. It is not passed to read_csv in this cell, so the file is
# still read in a single pass.
chunksize = 100000

# Read the tab-separated file and keep only the rows whose 'food_groups_en'
# value is present.
df = (
    pd.read_csv('food.csv', sep='\t')
    .dropna(subset=['food_groups_en'])
)

# Dropping weakly correlated columns

In [None]:
# Association between every categorical (object-dtype) column and the target
# 'food_groups_en', computed with cramers_v.
# NOTE(review): cramers_v is not defined in this cell; it must already exist in
# the kernel when this cell runs.
columns = list(df.select_dtypes(include=['object']).columns)

# Explicit float dtype: an empty Series without dtype is object-typed on recent
# pandas versions (and emits a warning on older ones).
corr = pd.Series(index=columns, dtype=float)

for j in columns:
    try:
        corr.loc[j] = cramers_v(df[j], df['food_groups_en'])
    except NameError:
        # A missing helper must not be reported as "correlation = 0" for every
        # column; let it surface instead of silently dropping everything.
        raise
    except Exception:
        # Best effort: a column whose association cannot be computed is treated
        # as uncorrelated.
        print('Error, we put correlation = 0 for the column  :  ' + str(j))
        corr.loc[j] = 0

# Columns at or below this absolute association are removed; strictly above are
# kept. The two sets are disjoint (a value of exactly 0.2 counts as removed).
corr_threshold = 0.2
corr_categorical = corr.sort_values(ascending=False)
corr_strength = corr_categorical.abs()
removed_columns = list(corr_strength[corr_strength <= corr_threshold].index)
categorical_columns = list(corr_strength[corr_strength > corr_threshold].index)

# We drop columns that contain information about the category of the product to
# prevent data leakage: these columns are not supposed to be available at
# prediction time.
columns_to_delete = ['food_groups_tags','pnns_groups_1','pnns_groups_2','categories_en','categories_tags', 'main_category_en', 'main_category', 'categories']

# Kept columns = sufficiently correlated columns minus the leakage columns, so
# this list only names columns that survive the drop below.
columns_to_keep = [i for i in categorical_columns if i not in columns_to_delete]

removed_columns.extend([i for i in columns_to_delete if i not in removed_columns])
filtered_drop_data = df.drop(columns=removed_columns)


# Dropping columns with too many missing values

In [None]:
# Proportion of missing values in each column (0.0 = complete, 1.0 = empty).
missing_data = filtered_drop_data.isnull().mean()

# Maximum tolerated proportion of missing values per column (70%).
threshold = 0.7

# Columns whose missing proportion does not exceed the threshold.
filtered_columns = missing_data[missing_data <= threshold].index

# Keep those columns, then remove additional columns that are not needed
# (date/time fields, judging by their names). Each label is listed once.
filtered_data_mv = filtered_drop_data[filtered_columns].drop(
    columns=["last_modified_datetime", "created_datetime", "created_t", "last_modified_t", "last_image_t"]
)

# Keep only rows where all of these columns are present. Previously the second
# dropna was applied to filtered_data_mv again, which discarded the first
# filter; requiring all five columns at once applies both.
filtered_data_mvnum = filtered_data_mv.dropna(
    subset=['sodium_100g', "nutrient_levels_tags", "ingredients_text", 'proteins_100g', "nutriscore_grade"]
)

In [None]:
# Drop COLUMNS (not rows) that are still too sparse after the row filtering:
# any column with more than 50% missing values is removed.
treshold_2 = 0.5

# Missing proportion per column, computed once for the whole frame.
na_share = filtered_data_mvnum.isna().mean()
columns_to_drop = list(na_share.index[na_share > treshold_2])

# Remove the selected columns from the DataFrame.
filtered_data_mvnum = filtered_data_mvnum.drop(columns=columns_to_drop)

# IMPUTATION

In [None]:
# Mean of each numerical column. numeric_only=True is required because the
# frame still contains text columns, for which a mean cannot be computed.
mean_value = filtered_data_mvnum.mean(numeric_only=True)

# Global mean imputation: numerical columns with missing values replaced by the
# column mean.
mean_imputation = filtered_data_mvnum.select_dtypes(include=['number']).fillna(mean_value)

# Group-wise imputation, one group per 'food_groups' value.
# For each numerical column inside a group:
#   - missing proportion >  threshold -> fill with 0
#   - missing proportion <= threshold -> fill with the group mean
# `threshold` is the value defined in the missing-values cell (0.7 there).
# Non-numerical columns are left untouched.
grouped = filtered_data_mvnum.groupby('food_groups')

# Imputed groups, concatenated once at the end.
filled_groups = []

for name, group in grouped:
    numeric_columns = [
        column for column in group.columns
        if pd.api.types.is_numeric_dtype(group[column])
    ]
    numeric_part = group[numeric_columns]

    # Start from the group mean of every numerical column, then switch to 0 for
    # the columns that are too sparse within this group.
    fill_values = numeric_part.mean()
    fill_values[numeric_part.isnull().mean() > threshold] = 0

    # fillna with a per-column Series returns a new frame, so the groupby slice
    # is never modified in place.
    filled_groups.append(group.fillna(fill_values))

# Concatenate the imputed groups into a single DataFrame with a fresh index.
imputed_data = pd.concat(filled_groups).reset_index(drop=True)
