In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
plt.style.use('fivethirtyeight')

import warnings
warnings.filterwarnings("ignore")



import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)



In [None]:
# Read the semen-analysis CSV into a DataFrame; low_memory=False makes pandas
# parse the file in one pass instead of in chunks.
df = pd.read_csv(
    "/kaggle/input/visemtracking/semen_analysis_data_Train.csv",
    sep=",",
    encoding='utf8',
    low_memory=False,
)

# Preview the first rows.
df.head()

In [None]:
# Show the raw column names (they are renamed to short identifiers further down).
df.columns.to_list()

In [None]:

# Matplotlib 3.6 renamed its bundled seaborn style sheets with a "seaborn-v0_8-"
# prefix and newer releases no longer accept the old names, so use whichever
# spelling this installation provides (falling back to the original name).
plt.style.use(
    next(
        (
            style_name
            for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
            if style_name in plt.style.available
        ),
        "seaborn-whitegrid",
    )
)
# Figure-wide defaults: automatic tight layout plus bold, slightly larger
# axis labels and titles.
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

In [None]:

def apply_pca(X, standardize=True):
    """Fit a PCA on ``X`` and return the model, the component scores and the loadings.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns to decompose; ``X.columns`` labels the loadings rows.
    standardize : bool, default True
        When true, every column is centered on its mean and divided by its
        standard deviation before the PCA is fitted.

    Returns
    -------
    pca
        The fitted ``PCA`` object.
    X_pca : pandas.DataFrame
        Component scores with columns ``PC1`` .. ``PCn``, carrying the same
        index as ``X`` so they can be joined back onto the source frame.
    loadings : pandas.DataFrame
        ``pca.components_`` transposed: one row per original feature, one
        column per principal component.
    """
    # Standardize
    if standardize:
        # NOTE: a constant column has zero standard deviation and turns into
        # NaN here.
        X = (X - X.mean(axis=0)) / X.std(axis=0)
    # Create principal components
    pca = PCA()
    X_pca = pca.fit_transform(X)
    # Convert to dataframe, keeping the row labels of the input so that
    # index-based joins/lookups stay aligned with the original rows
    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    X_pca = pd.DataFrame(X_pca, columns=component_names, index=X.index)
    # Create loadings
    loadings = pd.DataFrame(
        pca.components_.T,  # transpose the matrix of loadings
        columns=component_names,  # so the columns are the principal components
        index=X.columns,  # and the rows are the original features
    )
    return pca, X_pca, loadings

In [None]:

def plot_variance(pca, width=8, dpi=100):
    """Plot per-component and cumulative explained variance side by side.

    Parameters
    ----------
    pca
        A fitted PCA object exposing ``n_components_`` and
        ``explained_variance_ratio_``.
    width : float, default 8
        Figure width passed to ``fig.set(figwidth=...)``.
    dpi : int, default 100
        Figure resolution passed to ``fig.set(dpi=...)``.

    Returns
    -------
    The pair of axes: bar chart of explained variance ratio (left) and line
    plot of its cumulative sum (right).
    """
    # Create figure
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)
    # Explained variance
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(
        xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0)
    )
    # Cumulative Variance (prepend the origin so the curve starts at 0)
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(
        xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
    )
    # Set up figure using the caller-supplied size and resolution
    fig.set(figwidth=width, dpi=dpi)
    return axs

In [None]:


def make_mi_scores(X, y):
    """Return the mutual-information score of every column of ``X`` against ``y``.

    Object and categorical columns are label-encoded on a copy of ``X`` (the
    caller's frame is left untouched). Every integer-typed column is then
    flagged as discrete for ``mutual_info_regression``. The result is a
    Series named "MI Scores", indexed by column name and sorted from the
    highest score to the lowest.
    """
    encoded = X.copy()
    categorical_cols = encoded.select_dtypes(["object", "category"]).columns
    for col in categorical_cols:
        codes, _ = encoded[col].factorize()
        encoded[col] = codes
    # After encoding, integer dtype is the marker for a discrete feature.
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in encoded.dtypes]
    raw_scores = mutual_info_regression(
        encoded, y, discrete_features=is_discrete, random_state=0
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=encoded.columns)
    return ranked.sort_values(ascending=False)


def score_dataset(X, y, model=XGBRegressor()):
    """Return the 5-fold cross-validated RMSLE of ``model`` on ``(X, y)``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Object/categorical columns are label-encoded on an
        internal copy, so the caller's frame is not modified.
    y : array-like
        Regression target.
    model : estimator, default ``XGBRegressor()``
        Estimator handed to ``cross_val_score``. The default instance is
        created once, when this function is defined.

    Returns
    -------
    float
        Square root of the mean (sign-flipped) "neg_mean_squared_log_error"
        over the five folds, i.e. the root mean squared log error.
    """
    # Work on a copy so the label encoding below does not leak into the
    # caller's DataFrame.
    X = X.copy()
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # Evaluation metric is RMSLE (Root Mean Squared Log Error)
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    # The scorer returns negated errors; flip the sign before taking the root.
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score

In [None]:
# Swap the long, unit-laden column headers for short names that can be used
# with attribute access (e.g. ``df.immotile``) in the cells below.
df = df.rename(
    columns={
        'Sperm concentration (x10⁶/mL)': 'concentration',
        'Sperm vitality (%)': 'vitality',
        'Normal spermatozoa (%)': 'normal',
        'Tail defects (%)': 'tail',
        'Immotile sperm (%)': 'immotile',
        'Progressive motility (%)': 'progressiv',
        'High DNA stainability, HDS (%)': 'hds',
        'DNA fragmentation index, DFI (%)': 'dfi',
        'Non progressive sperm motility (%)': 'non_progressive',
    }
)

In [None]:

# Columns fed into the PCA below.
features = ["concentration", "vitality", "progressiv", "dfi"]

# Pairwise correlation of each selected feature with the target column.
print("Correlation with Immotility:\n")
print(df[features].corrwith(df["immotile"]))



In [None]:

# Split off the target, keep only the selected feature columns, and decompose
# them into principal components.
X = df.copy()
y = X.pop("immotile")
X = X[features]

pca, X_pca, loadings = apply_pca(X)
# Rows are the original features, columns are the components.
print(loadings)

In [None]:

# Solution 1: add two hand-crafted feature combinations to the full frame and
# cross-validate the model on the result.
X = df.copy()
y = X.pop("immotile")

X = X.assign(
    Feature1=X["concentration"] + X["vitality"],
    Feature2=X["progressiv"] * X["dfi"],
)

score = score_dataset(X, y)
print(f"Your score: {score:.5f} RMSLE")

# Solution 2: use the principal components themselves as extra features (next cell).

In [None]:

# Append the principal-component scores (matched on the row index) to the
# full frame and cross-validate the model on the result.
X = df.copy()
y = X.pop("immotile")
X = X.join(X_pca)

score = score_dataset(X, y)
print(f"Your score: {score:.5f} RMSLE")

In [None]:

# One boxen plot per principal component (two panels per row, independent
# y-axes); melt() reshapes the scores to long form with "variable"/"value" columns.
sns.catplot(
    data=X_pca.melt(),
    col="variable",
    y="value",
    kind="boxen",
    col_wrap=2,
    sharey=False,
);

In [None]:

# Order the rows by their score on one principal component (highest first) and
# show the matching original measurements. Replace "PC1" with "PC2", "PC3" or
# "PC4" to inspect another component.
component = "PC1"

idx = X_pca.loc[:, component].sort_values(ascending=False).index
df.loc[idx, ["immotile", "non_progressive", "hds", *features]]

In [None]:


# Same inspection as for PC1, this time with rows ordered by their score on
# the fourth component (highest first).
component = "PC4"

idx = X_pca.loc[:, component].sort_values(ascending=False).index
df.loc[idx, ["immotile", "non_progressive", "hds", *features]]