In [None]:
# Import PyTorch and related packages
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.optim import NAdam, AdamW, Adam, SGD, RMSprop
from torch.utils.data import DataLoader, TensorDataset, Dataset

# Import packages for data manipulation and data splitting
import re
from fuzzywuzzy import process, fuzz

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.feature_selection import mutual_info_classif


# Downloading datasets, and loading
from sklearn.datasets import fetch_openml

In [None]:
# Show floats with exactly one digit after the decimal point when pandas prints them
pd.options.display.float_format = '{:.1f}'.format

# Fetch version 1 of the 'SpeedDating' dataset from OpenML
dataset = fetch_openml(name='SpeedDating', version=1)

# Build one DataFrame: np.c_ places the target next to the feature columns,
# and the extra column is labelled 'target'.
# NOTE(review): np.c_ produces a single array, so per-column dtypes are
# presumably not preserved here — confirm against the float re-parsing done later.
df = pd.DataFrame(data=np.c_[dataset['data'], dataset['target']],
                  columns=dataset['feature_names'] + ['target'])

# Display the first few rows
print(df.head())

In [None]:
def print_unique_for_column(df, column):
    """Print the distinct values of one column and how many there are.

    Args:
        df: DataFrame to inspect.
        column: Label of the column to summarise.

    Returns:
        None. The summary is written to stdout.

    The count is taken with ``nunique(dropna=False)``, so a missing value
    counts as one distinct value.
    """
    unique_values = df[column].unique()
    # Named `unique_count` (not `sum`) so the builtin sum() is not shadowed.
    unique_count = df[column].nunique(dropna=False)
    print(f"Unique values in column '{column}' with sum of {unique_count} (including NaN): {unique_values}")

def print_unique(df=None):
    """Print the distinct values of every column of a DataFrame.

    Args:
        df: DataFrame to inspect. When omitted, the notebook-level variable
            named ``df`` is looked up at call time.

    Raises:
        NameError: if ``df`` is omitted and no global ``df`` exists.

    The default used to be ``df=df``, which is evaluated once when the
    function is defined: the definition failed if ``df`` did not exist yet and
    kept pointing at the old object if ``df`` was later rebound.
    """
    if df is None:
        # Late lookup of the notebook's current frame
        df = globals().get("df")
        if df is None:
            raise NameError("print_unique() needs a DataFrame: none was passed and no global 'df' is defined")

    # Loop through each column and print unique values
    for column in df.columns:
        print_unique_for_column(df, column)

# Inspect the distinct values of every column of the loaded frame
print_unique(df)

## Data preprocessing

In [None]:
# The 'field' column has a lot of problems: list its distinct values to see them.
# "Sum" below is the number of distinct non-NaN values, not an arithmetic sum.
print(f"Sum: {df['field'].nunique(dropna=True)}")
# NaN is dropped before sorting; the values are then printed one per line
sorted_unique_values = sorted(df['field'].dropna().unique())
for value in sorted_unique_values:
    print(value)

In [None]:
# Regular expression to match ;, :, -, /, and everything within []
pattern = r'[;:\-\/]|\[.*?\]'

# Replace matched patterns with a single space (not an empty string) so the
# words on either side of a separator do not get fused together
df['field'] = df['field'].str.replace(pattern, ' ', regex=True)

# Remove phd (any casing) and collapse runs of spaces into one
df['field'] = df['field'].str.replace('phd', '', flags=re.IGNORECASE, regex=True).str.replace(' +', ' ', regex=True)

# Replace shortened "engg" / "engg." with engineering.
# The dot is escaped and optional: an unescaped '.' matches ANY character, so
# the old pattern 'engg.' swallowed whatever followed "engg" (e.g. a space)
# and did not match a bare "engg" at the end of a value.
df['field'] = df['field'].str.replace(r'\bengg\b\.?', 'engineering', flags=re.IGNORECASE, regex=True)

# Change everything to lowercase and remove leading/trailing white space
df['field'] = df['field'].str.lower().str.strip()
df['race'] = df['race'].str.lower().str.strip()
df['race_o'] = df['race_o'].str.lower().str.strip()

In [None]:
# Define a function to replace close matches
def combine_similar(df, column, correct_value, threshold=90):
    """Collapse near-duplicate spellings in ``df[column]`` onto ``correct_value``.

    Every distinct non-missing value of the column is scored against
    ``correct_value`` with ``fuzz.token_sort_ratio``; values scoring at least
    ``threshold`` are overwritten with ``correct_value``.

    Args:
        df: DataFrame to modify. It is changed in place.
        column: Label of the column to clean.
        correct_value: Canonical value that close matches are replaced with.
        threshold: Minimum score (inclusive) for a value to count as a match.

    Returns:
        None.
    """
    # Missing values are excluded so the matcher only ever sees real labels
    # and NaN cells can never be rewritten.
    candidates = df[column].dropna().unique()
    if len(candidates) == 0:
        return

    # Find matches above the threshold; each result is a (value, score) pair
    scored = process.extract(correct_value, candidates, limit=None, scorer=fuzz.token_sort_ratio)
    close_matches = [value for value, score in scored if score >= threshold]

    # Replace close matches with the correct value using one vectorised mask
    # instead of a Python-level list scan per row
    df.loc[df[column].isin(close_matches), column] = correct_value

# Canonical 'field' spellings, merged one after another. The order is kept
# exactly as written because each pass rewrites the column the next pass reads.
# NOTE(review): 'mathematic' may be a typo for 'mathematics' — confirm before changing.
for canonical_field in (
    'finance',
    'nutrition',
    'speech language pathology',
    'international affairs',
    'finance economics',
    'mathematic',
):
    combine_similar(df, 'field', canonical_field)

In [None]:
# Dropped from 259 to 203, not including NaN.
# "Sum" below is the number of distinct non-NaN values, not an arithmetic sum.
print(f"Sum: {df['field'].nunique(dropna=True)}")
# NaN is dropped before sorting; the values are then printed one per line
sorted_unique_values = sorted(df['field'].dropna().unique())
for value in sorted_unique_values:
    print(value)

In [None]:
def try_convert_float(value):
    """Return ``float(value)`` when that works, otherwise ``value`` unchanged.

    Args:
        value: Any object, typically a single cell of a DataFrame.

    Returns:
        A float if ``float(value)`` succeeds; the original object otherwise.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # ValueError: text that is not a number (e.g. '[0-1]').
        # TypeError: objects float() cannot take at all (None, lists, ...);
        # previously these escaped and aborted the whole conversion.
        return value

# Re-parse every cell of every column: values float() accepts become floats,
# everything else is kept as it was. Each column of df is overwritten in place.
for column in df.columns:
  df[column] = df[column].apply(try_convert_float)

# Show the distinct values again, now after the conversion
print_unique(df)

## Filling the NaN values

In [None]:
# Percentage of missing entries in each column (null count / row count * 100)
missing_values = df.isnull().sum().div(len(df)).mul(100)

# One "column: xx.xx%" line per column
for name, value in missing_values.items():
    print(f"{name}: {value:.2f}%")

In [None]:
# Columns left out of the feature matrix. 'target' is the label column appended
# when df was built.
# NOTE(review): 'has_null' and 'wave' are presumably bookkeeping columns rather
# than predictors — confirm.
columns_to_drop = ['has_null', 'wave', 'target']
# df itself is not modified; df_features is a new frame without those columns
df_features = df.drop(columns=columns_to_drop)

In [None]:
# Select the columns with data type 'object' or 'string'.
# They are taken from df_features (not df): the encoding step indexes
# df_encoded, which is built from df_features, so a text column that was
# dropped from the features must not appear here or that lookup would fail.
string_columns = df_features.select_dtypes(include=['object', 'string'])

# Now, print these columns
print_unique(string_columns)

In [None]:
# Text columns without an inherent order are one-hot encoded; every other text
# column is treated as ordinal (binned ranges) and mapped to a rank.
nominal_columns = ['gender', 'race', 'race_o', 'field']
ordinal_columns = string_columns.drop(columns=nominal_columns)

# Encode with one hot encoder
df_encoded = pd.get_dummies(df_features, columns=nominal_columns, drop_first=False)

# Encode the ordinal columns with an explicit label -> rank lookup

# Define the order of categories. One shared lookup serves every binned
# column; each row below is one binning scheme, lowest bin first.
# ('[0-1]' belongs to the first two schemes; it is listed once because a
# repeated dict key silently overwrites the earlier entry.)
order = {'[0-1]': 0.0, '[2-3]': 1.0, '[4-6]': 2.0, '[7-37]': 3.0,
         '[2-5]': 1.0, '[6-10]': 2.0,
         '[0-2]': 0.0, '[3-5]': 1.0, '[5-18]': 2.0,
         '[0-3]': 0.0, '[4-9]': 1.0, '[10-20]': 2.0,
         '[0-4]': 0.0, '[5-6]': 1.0, '[7-10]': 2.0,
         '[0-5]': 0.0, '[6-8]': 1.0, '[9-10]': 2.0,
         '[0-15]': 0.0, '[16-20]': 1.0, '[21-100]': 2.0,
         '[-1-0]': 0.0, '[0-0.33]': 1.0, '[0.33-1]': 2.0}

# Encode the data
for column in ordinal_columns.columns:
    # .map() turns any label missing from `order` into NaN without complaint;
    # report such labels so they are not mistaken for genuinely missing data.
    unmapped = set(df_encoded[column].dropna().unique()).difference(order)
    if unmapped:
        print(f"Warning: column '{column}' has values with no rank in `order` "
              f"(they become NaN): {sorted(map(str, unmapped))}")
    df_encoded[column] = df_encoded[column].map(order)

In [None]:
# Sanity check: list any columns of df_encoded that are still 'object'/'string'
# typed. Nothing is printed when no such column remains.
print_unique(df_encoded.select_dtypes(include=['object', 'string']))

In [None]:
# Scale every encoded column with StandardScaler and wrap the resulting array
# straight back into a DataFrame carrying the same column labels.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_encoded), columns=df_encoded.columns)

In [None]:
# Initialize the KNNImputer (each missing entry is estimated from 5 neighbours)
imputer = KNNImputer(n_neighbors=5)

# Fit the imputer on the scaled data and fill in the missing values
imputed_data = imputer.fit_transform(df_scaled)

# Wrap the imputed array in a DataFrame with the encoded column labels.
# NOTE(review): this assumes the imputer returns one column per input column;
# a column that is entirely NaN would presumably break that — confirm.
clean_data = pd.DataFrame(imputed_data, columns=df_encoded.columns)

In [None]:
# Features: the scaled and imputed frame. Labels: the original target column.
X = clean_data
y = df['target']

# Mutual information between each feature and the target. The estimator is
# randomised, so the seed is fixed: without it the scores, and therefore the
# set of features kept by the later threshold, can change from run to run.
mic = mutual_info_classif(X, y, random_state=42)

# Label the scores with their feature names, highest score first
mic_series = pd.Series(mic, index=X.columns)
mic_series = mic_series.sort_values(ascending=False)

In [None]:
import matplotlib.pyplot as plt

# Keep only the features whose mutual information score is above 0.01
mic_series = mic_series[mic_series > 0.01]

# Bar chart of the remaining scores on an explicitly created axes
fig, ax = plt.subplots(figsize=(15, 4))
mic_series.plot.bar(ax=ax)
ax.set_ylabel('Mutual Information Score')
ax.set_xlabel('Features')
ax.set_title('Mutual Information Scores')


In [None]:
# Keep only the features that passed the mutual-information filter, in the
# order of mic_series (highest score first)
clean_data = clean_data[mic_series.index.tolist()]
# NOTE(review): this holds the target column, not a date — `target_date` looks
# like a typo for `target_data`; name left unchanged in case later cells use it.
target_date = df['target']