# Exploratory Data Analysis (EDA)

In [None]:

import sys
import os

# Put the directory one level above the notebook's working directory on the
# import path (a later cell imports from the `src` package).
current_working_directory = os.getcwd()
parent_directory = os.path.join(current_working_directory, os.pardir)
sys.path.append(parent_directory)

# Last expression of the cell: echoes the working directory as cell output.
os.getcwd()

## Load data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the dataset
from src.data.data_loader import LocalDataLoader, MockDataLoader
# Load the raw data as two objects, x and y (used further down as the inputs
# and labels of the fitted models), and join them column-wise so the whole
# training set can be inspected as one frame.
x, y = LocalDataLoader().load_raw_data()
training_data = pd.concat([x, y], axis=1)

# Display basic information. DataFrame.info() writes its summary to stdout
# itself and returns None, so it is called directly; wrapping it in print()
# would append a stray "None" line to the output.
training_data.info()
training_data.head()

## NaN analysis
Which fields contain NaN values? How large a share of each field do they make up? How should they be handled?

In [None]:
# Count the missing values in each column.
nans = training_data.isnull().sum().to_frame(name="amount_nans")
# The column is labelled "%", so scale the ratio (amount / total rows) to a
# real percentage in the range 0-100 instead of leaving it as a 0-1 fraction.
nans["%"] = nans["amount_nans"] / training_data.shape[0] * 100
# Keep only the columns that actually contain missing values.
nans = nans[nans["amount_nans"] > 0]
print(nans)

### Strategies to deal with NaN values

## Duplicates Analysis
Are there any duplicate rows in the dataset and, if so, how many?

In [None]:
# Flag every row that exactly repeats an earlier row, then count the flags.
duplicate_mask = training_data.duplicated()
duplicate_count = duplicate_mask.sum()
print(f"Number of duplicate rows: {duplicate_count}")


## Outlier detection

# Dimensionality reduction

## Curse of dimensionality
Numerous machine learning challenges encompass training instances that are characterized by thousands, if not millions, of features. The sheer volume of features not only considerably slows down the training process but also complicates the task of identifying effective solutions; this problem is known as the **curse of dimensionality**.

Dimensionality reduction is a technique to reduce the number of features in the dataset. 
Dimensionality reduction, while beneficial in speeding up training processes, inherently involves a compromise similar to image compression (e.g., converting an image to JPEG format), where some level of information loss occurs. This trade-off can potentially affect the performance of your model, making it slightly less accurate. Additionally, implementing dimensionality reduction adds complexity to your data processing pipelines.

It's advisable to first attempt training your model with the complete dataset. Dimensionality reduction should be considered a secondary option if the training process proves to be excessively time-consuming. In certain scenarios, reducing the dataset's dimensions may actually improve model performance by eliminating noise and irrelevant details.

## Projection methods

### Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA

# Target share of the total variance to preserve, expressed as a 0-1 fraction.
percentage_variance = 0.95

# Fit a PCA that keeps every component so the full variance spectrum is known.
pca = PCA()
pca.fit(x, y)

# cumsum[i] is the share of variance explained by the first i + 1 components.
cumsum = np.cumsum(pca.explained_variance_ratio_)
# argmax gives the 0-based index of the first component count that reaches the
# target; + 1 turns that index into a number of dimensions.
dimensions = np.argmax(cumsum >= percentage_variance) + 1
# Format the fraction as a percentage (0.95 -> "95.0%"); printing the raw value
# followed by "%" would read as "0.95%".
print(f"Number of dimensions to keep {percentage_variance:.1%} of the variance: {dimensions}")

Note: *If the dataset is too large to fit into memory, use the IncrementalPCA algorithm instead. Sklearn has a built-in class for this purpose.*

In [None]:
total_amount_of_dimensions = x.shape[1]

# cumsum[i] is the variance explained by the first i + 1 components, so the
# curve is drawn against 1..n. Plotting cumsum alone would use 0..n-1 on the
# x-axis and shift the curve one step left of the `dimensions` marker, which
# is a 1-based component count.
component_counts = np.arange(1, len(cumsum) + 1)
plt.plot(component_counts, cumsum, linewidth=3)
plt.axis([0, total_amount_of_dimensions, 0, 1])

# Dotted guide lines and a dot marking where the variance target is reached.
plt.plot([dimensions, dimensions], [0, percentage_variance], "k:")
plt.plot([0, dimensions], [percentage_variance, percentage_variance], "k:")
plt.plot(dimensions, percentage_variance, "ko")

plt.title("Explained variance as a function of the number of dimensions")
plt.xlabel("Dimensions")
plt.ylabel("Explained Variance")
plt.grid(True)
plt.show()

## Manifold Learning
Manifold learning is an approach to non-linear dimensionality reduction. Algorithms for this task are based on the idea that the dimensionality of many data sets is only artificially high.

In [None]:
def plot_reduced_data(X_reduced: np.ndarray, title: str, y: pd.DataFrame=y):
    """Plot a low-dimensional embedding, coloured by ``y`` where possible.

    The shape of the embedding is printed first. One column is drawn as blue
    dots along a horizontal line, two columns as a 2-D scatter plot, and three
    or more columns as a 3-D scatter plot of the first three columns.

    Args:
        X_reduced: Array-like of shape (n_samples, n_components). It is
            converted with ``np.asarray`` so that positional column slicing
            works for both NumPy arrays and DataFrames.
        title: Title shown above the plot.
        y: Values passed to ``scatter`` as the colour argument ``c``. The
            default is the notebook-level ``y`` as it existed when this
            function was defined.
    """
    points = np.asarray(X_reduced)
    print(points.shape)
    n_components = points.shape[1]

    fig = plt.figure(figsize=(11, 4))
    if n_components > 2:
        # A third coordinate needs real 3-D axes: the third positional argument
        # of the 2-D scatter is the marker size, not a z value.
        ax = fig.add_subplot(projection="3d")
        ax.scatter(points[:, 0], points[:, 1], points[:, 2], c=y)
        ax.set_zlabel("$z_3$", fontsize=18)
    else:
        ax = fig.add_subplot()
        if n_components == 1:
            # Single component: place every point on the line y = 0.
            ax.plot(points, np.zeros(points.shape), "b.")
        elif n_components == 2:
            ax.scatter(points[:, 0], points[:, 1], c=y)

    ax.set_title(title, fontsize=15)
    ax.set_xlabel("$z_1$", fontsize=18)
    ax.set_ylabel("$z_2$", fontsize=18, rotation=0)
    ax.grid(True)
    plt.show()


### Locally Linear Embedding (LLE)

In [None]:
from sklearn.manifold import LocallyLinearEmbedding

# Embed the data in 2 dimensions with LLE, using 10 neighbours per point and a
# fixed random seed, then plot the result.
lle = LocallyLinearEmbedding(
    n_components=2,
    n_neighbors=10,
    random_state=42,
)
X_reduced = lle.fit_transform(x)
plot_reduced_data(X_reduced, title="Locally Linear Embedding (LLE)")

### Multidimensional Scaling (MDS)

NB: This might be slow to run

In [None]:
from sklearn.manifold import MDS

# Embed the data in 2 dimensions with MDS (fixed random seed) and plot it.
mds = MDS(
    n_components=2,
    random_state=42,
)
X_reduced_mds = mds.fit_transform(x)
plot_reduced_data(X_reduced_mds, title="Multi-Dimensional Scaling (MDS)")

### IsoMap

In [None]:
from sklearn.manifold import Isomap

# Embed the data in 2 dimensions with Isomap and plot it.
isomap = Isomap(
    n_components=2,
)
X_reduced_isomap = isomap.fit_transform(x)
plot_reduced_data(X_reduced_isomap, title="Isomap")

### T-distributed Stochastic Neighbor Embedding (t-SNE)

In [None]:
from sklearn.manifold import TSNE

# Embed the data in 2 dimensions with t-SNE (fixed random seed) and plot it.
tsne = TSNE(
    n_components=2,
    random_state=42,
)
X_reduced_tsne = tsne.fit_transform(x)
plot_reduced_data(X_reduced_tsne, title="t-SNE")

### Linear Discriminant Analysis (LDA)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Fit LDA on the data together with y, project the same data onto 2
# components, and plot the projection.
lda = LinearDiscriminantAnalysis(
    n_components=2,
)
X_reduced_lda = lda.fit(x, y).transform(x)
plot_reduced_data(X_reduced_lda, title="LDA")