# Developer Handbook - PCA_analysis.py
- This module contains a function to compute and plot a PCA (principal component analysis) of a data set selected by the user/GUI
- list of functions:
    - perform_pca()
## Packages 

In [None]:
from tkinter import messagebox
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

## Function - perform_pca()
- parameter: data (contains the data to compute the PCA on), including an "Accession" column
1. check for data, otherwise errormessage via messagebox
- function actions embedded in `try:`/`except:` - block
    - the `except` branch reports the error message both in the console and via a messagebox
2. columns of numeric datatype ("number") are selected from the dataframe, rows containing NaN are dropped, and the resulting dataframe is saved
    - if new dataframe is empty an error message is shown via messagebox
3. creation of standardized data and saving into new variable: `StandardScaler()` creates a new object in which `fit_transform()` is used to learn and to transform the new data set
4. PCA-object with 2 components is initialised and used in the `fit_transform()` function to learn (calculate the two components) and transform (project data onto 2 components to reduce them) the data and create a pca_result
5. a dataframe of the pca_result data is created with columns "Accession", "Principal Component 1" and "Principal Component 2"
6. plotting the pca result based on the pca-dataframe with sns.scatterplot from `seaborn`

In [None]:
def perform_pca(data):
    """Run a two-component PCA on the numeric columns of ``data`` and plot it.

    The numeric columns are selected, rows containing NaN in any of them are
    discarded, the remaining values are standardized with ``StandardScaler``
    and projected onto two principal components. The result is shown as a
    seaborn scatter plot via ``plt.show()``.

    Args:
        data: pandas DataFrame holding the measurements plus an
            ``'Accession'`` column, or ``None`` if nothing has been loaded.

    Returns:
        None. Problems are reported to the user through a messagebox (and
        printed to the console) instead of being raised.
    """
    if data is None:
        messagebox.showerror("Error", "Please select a file and load data first.")
        # Stop here: without data the code below would only fail with an
        # AttributeError and pop up a second, misleading error dialog.
        return None

    try:
        # Keep numeric columns only and find the rows without any NaN.
        # The mask is positional so it can be reused for 'Accession' below.
        numeric_columns = data.select_dtypes(include=['number'])
        complete_rows = numeric_columns.notna().all(axis=1).to_numpy()
        numeric_data = numeric_columns.loc[complete_rows]

        # Check that data is left after cleaning
        if numeric_data.empty:
            print("Bereinigter Datensatz ist leer, keine Daten für PCA verfügbar.")
            messagebox.showerror("Error", "No numeric data available for PCA after cleaning.")
            return None

        # Standardize the data
        standardized_data = StandardScaler().fit_transform(numeric_data)

        # Perform PCA
        pca = PCA(n_components=2)
        pca_result = pca.fit_transform(standardized_data)

        # Create a DataFrame for PCA results
        pca_df = pd.DataFrame(
            data=pca_result,
            columns=['Principal Component 1', 'Principal Component 2'],
        )
        # Add Accession back for labeling. Take exactly the rows that went
        # into the PCA, by position: a plain column assignment would align on
        # index labels and mislabel points once rows have been dropped.
        pca_df['Accession'] = data['Accession'].to_numpy()[complete_rows]

        # Plot PCA results
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x='Principal Component 1', y='Principal Component 2', data=pca_df)
        plt.title('PCA of Data')
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')

        plt.tight_layout()
        plt.show()

    except Exception as e:
        # GUI boundary: surface any failure to the user instead of crashing.
        print(f"Exception during PCA: {e}")  # debug output
        messagebox.showerror("Error", f"An error occurred during PCA: {str(e)}")
        