# Missing Values

In [59]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [60]:
def read_in_csv(file):
  """Load a CSV source into a pandas DataFrame.

  Args:
    file: Whatever ``pd.read_csv`` accepts as its first argument (for
      example a path string or an open file-like object).

  Returns:
    The DataFrame produced by ``pd.read_csv`` with its default options.
  """
  frame = pd.read_csv(file)
  return frame

## Drop the unnamed column

In [61]:
def drop_columns(data):
  """Remove the 'Unnamed: 0' column from ``data`` in place.

  The frame is modified directly and nothing is returned. pandas raises
  ``KeyError`` when the column is not present.

  Args:
    data: DataFrame containing an 'Unnamed: 0' column (presumably an index
      column written out by an earlier CSV export; confirm with the data).
  """
  data.drop(columns=['Unnamed: 0'], inplace=True)

## Check which columns have empty values and fill them with the column mean

In [62]:
def handle_missing_values(data):
  """Fill missing values in numeric columns with that column's mean, in place.

  Only numeric columns are imputed, because a mean is undefined for
  non-numeric data. Missing entries in other columns (for example a string
  label column) are left untouched rather than raising.

  Args:
    data: DataFrame to impute. It is modified directly; nothing is returned.
  """
  numeric_columns = data.select_dtypes(include='number').columns

  for column in numeric_columns:
    if not data[column].isnull().any():
      continue
    # Assign the filled column back instead of calling
    # ``data[column].fillna(..., inplace=True)``: the chained in-place form
    # operates on a temporary Series and is not guaranteed to update ``data``
    # (it is a no-op under pandas Copy-on-Write).
    data[column] = data[column].fillna(data[column].mean())

## Handle outliers

In [63]:
def handle_outlying_values(data, lower_quantile=0.05, upper_quantile=0.95):
  """Clip every column except 'Type' to a quantile range, in place.

  Values below the lower quantile are raised to it and values above the
  upper quantile are lowered to it; values inside the range are unchanged.

  Args:
    data: DataFrame with a 'Type' column; all other columns are clipped.
      It is modified directly and nothing is returned.
    lower_quantile: Quantile (0 to 1) used as the lower clipping bound.
    upper_quantile: Quantile (0 to 1) used as the upper clipping bound.

  Raises:
    ValueError: If ``lower_quantile`` is greater than ``upper_quantile``.
    KeyError: If ``data`` has no 'Type' column.
  """
  if lower_quantile > upper_quantile:
    raise ValueError('lower_quantile must not exceed upper_quantile')

  # Work from the column index so the frame itself is not copied just to
  # find out which columns to clip.
  feature_columns = data.columns.drop('Type')

  for column in feature_columns:
    lower_bound = data[column].quantile(lower_quantile)
    upper_bound = data[column].quantile(upper_quantile)
    data[column] = data[column].clip(lower=lower_bound, upper=upper_bound)

In [64]:
def visualise_outlying_values(data):
    """Show one box plot per column of ``data`` on a grid four plots wide.

    The figure is displayed with ``plt.show()``; nothing is returned. When
    ``data`` has no columns the function returns without drawing anything.

    Args:
        data: DataFrame whose columns are each passed to ``sns.boxplot``.
    """
    num_features = len(data.columns)
    if num_features == 0:
        # plt.subplots rejects a grid with zero rows, so there is nothing
        # to draw for an empty frame.
        return

    cols = 4
    rows = (num_features + cols - 1) // cols  # ceiling division

    # squeeze=False always returns a 2-D array of axes, so flattening is
    # uniform regardless of the grid shape.
    _fig, axes = plt.subplots(
        rows, cols, figsize=(15, 4 * rows), squeeze=False
    )
    axes = axes.flatten()

    for ax, feature in zip(axes, data.columns):
        sns.boxplot(ax=ax, x=data[feature])
        ax.set_title(f'Box Plot of {feature}')
        ax.set_xlabel(feature)

    # Hide grid cells left over when the column count is not a multiple of
    # ``cols``; otherwise they are rendered as empty axes.
    for ax in axes[num_features:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## Normalise data

In [65]:
def normalise_data(data):
  """Standardise every column except 'Type' to zero mean and unit std, in place.

  Each feature column has its mean subtracted and is then divided by its
  standard deviation (pandas' default ``std``). The 'Type' column is left
  as it is. The frame is modified directly and nothing is returned.

  Args:
    data: DataFrame with a 'Type' column plus the columns to standardise.
  """
  features = data.drop(columns=['Type'])
  centred = features.sub(features.mean())
  scaled = centred.div(features.std())
  data[features.columns] = scaled

## Encode types

In [66]:
def encode_categorical_data(data):
    """Replace the labels in the 'Type' column with integer codes, in place.

    The codes are REGULAR -> 0, MAMRA -> 1, SANORA -> 2. Any value that is
    not one of these labels becomes NaN, which is how ``Series.map`` treats
    keys missing from a dict. Nothing is returned.

    Args:
        data: DataFrame with a 'Type' column holding the label strings.
    """
    labels_in_code_order = ('REGULAR', 'MAMRA', 'SANORA')
    code_for_label = {
        label: code for code, label in enumerate(labels_in_code_order)
    }
    encoded = data['Type'].map(code_for_label)
    data['Type'] = encoded

# Perform data preprocessing

In [67]:
def perform_preprocessing():
  """Load the almond dataset and run every preprocessing step on it.

  Reads '../Data/Almond.csv' and then applies, in this order: dropping the
  unnamed column, missing-value handling, outlier handling, normalisation
  and encoding of the 'Type' column. Each step modifies the frame in place.

  Returns:
    The preprocessed DataFrame.
  """
  data = read_in_csv('../Data/Almond.csv')

  # Order matters: each step works on the output of the one before it.
  pipeline = (
      drop_columns,
      handle_missing_values,
      handle_outlying_values,
      normalise_data,
      encode_categorical_data,
  )
  for step in pipeline:
    step(data)

  return data