In [1]:
#author: Amelie Bauerdick
#Wabnitz Lab

# Import Packages

In [2]:
from pathlib import Path

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score,mean_squared_error
import sklearn.cluster as cluster
from sklearn.cluster import KMeans 
import umap
import hdbscan

# Load CSV

In [None]:
# Read the input table from disk and show it as the cell output.
csv_path = "path/data.csv"
data = pd.read_csv(csv_path)
data

In [None]:
# Column names as loaded, before any renaming.
data.columns.tolist()

# Rename Features

In [None]:
# Clean up the column names: trim surrounding whitespace once, then spell out
# the characters ' ', '&', '+' and '-' in that order.
# regex=False is passed explicitly because pandas versions before 2.0 treat the
# pattern as a regular expression by default, where a bare '+' is an invalid
# pattern and raises "nothing to repeat".
column_replacements = {' ': '_', '&': 'and', '+': 'plus', '-': 'minus'}
cleaned_columns = data.columns.str.strip()
for old_text, new_text in column_replacements.items():
    cleaned_columns = cleaned_columns.str.replace(old_text, new_text, regex=False)
data.columns = cleaned_columns
list(data.columns)

# Temporarily Store Selected Features

In [None]:
# Keep an independent copy of the selected column so it can be concatenated
# back onto the table at the end of the notebook.
feature = data.loc[:, ['feature']].copy()

# Drop Features

In [7]:
# Columns removed from the table before normalisation and embedding.
columns_to_drop = ['feature1', 'feature2']
data = data.drop(columns=columns_to_drop)

In [None]:
# Remaining column names after the drop.
data.columns.tolist()

# Normalization

In [9]:
def minmax_norm(df):
    """Rescale the values of ``df`` linearly to the range [0, 1].

    Every column is shifted by its minimum and divided by its range
    (max - min). A constant column has a range of zero; dividing by it would
    fill the whole column with NaN (0 / 0). Such a range is therefore treated
    as 1, so a constant column maps to 0.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Numeric data to rescale. A DataFrame is rescaled column by column.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        Rescaled data with the same shape and labels as ``df``.
    """
    lowest = df.min()
    span = df.max() - lowest
    if isinstance(span, pd.Series):
        # DataFrame input: one range per column.
        span = span.replace(0, 1)
    elif span == 0:
        # Series input: a single scalar range.
        span = 1
    return (df - lowest) / span

In [10]:
# Rescale every remaining column with the min-max helper.
data = data.pipe(minmax_norm)

# DensMAP

In [11]:
# DensMAP-enabled UMAP reducer.
# UMAP is stochastic: without a fixed seed every run produces a different
# embedding, and with it different regression targets and feature importances
# further down. The seed matches the one used for the train/test split and the
# random forests in this notebook.
reducer = umap.UMAP(densmap=True,
                    dens_lambda=1,
                    n_neighbors=20,
                    min_dist=0.5,
                    metric='euclidean',
                    random_state=42)

In [None]:
# Fit the reducer on the normalised table and time the call with the IPython
# %time magic; the shape of the resulting embedding is shown as the cell output.
%time embedding = reducer.fit_transform(data)
embedding.shape

In [13]:
# First and second coordinate column of the embedding.
x, y = embedding[:, 0], embedding[:, 1]

In [14]:
# Store both embedding coordinates as columns of the table.
data['x'], data['y'] = x, y

# Random Forest Regression for DensMAP-1

In [15]:
# Working copy for the DensMAP-1 regression: everything except the 'y' coordinate.
data1 = data.copy().drop(columns='y')

# Split Data

In [None]:
# Hold out 20 % of the rows for testing; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(data1, test_size=0.2, random_state=42)

# The target is the 'x' coordinate; all other columns are predictors.
X_train, y_train = train_df.drop(columns='x'), train_df['x']
X_test, y_test = test_df.drop(columns='x'), test_df['x']

print("length of data for training:", X_train.shape[0])
print("length of data for testing:", X_test.shape[0])

# RandomForestRegressor

In [None]:
# Fit a seeded 100-tree random forest on the training split.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows and report the coefficient of determination.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("r² Score:", r2)

# Feature Importance

In [None]:
# Importances reported by the fitted forest, one value per predictor column.
importance = model.feature_importances_

# Indices that order the predictors from least to most important.
s_id = importance.argsort()
pos = np.arange(len(s_id))

# Rescale the importances to the range [0, 1].
scaler = MinMaxScaler()
importance_scaled = scaler.fit_transform(importance.reshape(-1, 1)).ravel()

# Share of each predictor in the sum of the rescaled importances, in percent.
total_importance = importance_scaled.sum()
percentage_importance = importance_scaled / total_importance * 100

# Keep the ten most important predictors (still in ascending order).
top_n = 10
s_id = s_id[-top_n:]
features = pd.DataFrame({
    'index1': np.array(X_train.columns)[s_id],
    'importance_normalized': importance_scaled[s_id],
    'percentage_importance': percentage_importance[s_id],
})

features

# Plot Feature Importance

In [None]:
# Display names for the x-axis tick labels.
row_labels = {
    'feature1': 'feature 1',
    'feature2': 'feature 2',
}

# Series.map with a dict turns every name that has no entry into NaN, which
# would blank out its tick label. Falling back to the original name keeps all
# bars labelled, also when this cell is run a second time.
features['index1'] = features['index1'].map(lambda name: row_labels.get(name, name))

ax = features.plot.bar(x='index1', y='importance_normalized', color='darkgray',
                       legend=False, figsize=(10, 5), width=0.8, fontsize=20)
plt.xlabel('')
plt.ylabel('Importance', fontsize=20)

# Print each percentage just above its bar. The bar heights are min-max scaled,
# so height + 0.01 stays below the y-limit of 1.1 set further down and needs
# no clamping.
bar_values = zip(features['importance_normalized'], features['percentage_importance'])
for i, (height, pct) in enumerate(bar_values):
    ax.text(i, height + 0.01, f'{pct:.1f}%', ha='center', va='bottom', fontsize=16, color='black')


plt.title('Top 10 Features DensMAP-1', fontsize=30, loc='left')
# add r2 score
#plt.text(0.98, 1.03, f'r² score: {r2:.2f}', horizontalalignment='right', verticalalignment='center', transform=ax.transAxes, fontsize=12, fontweight='normal')
plt.ylim(0, 1.1)
plt.xticks(rotation=45, ha='right')

# savefig does not create missing directories, so make sure the target exists.
Path('png').mkdir(parents=True, exist_ok=True)
plt.savefig('png/DensMAP1.png', dpi=300, bbox_inches='tight')

plt.show()

# Random Forest Regression for DensMAP-2

In [20]:
# Working copy for the DensMAP-2 regression: everything except the 'x' coordinate.
data2 = data.copy().drop(columns='x')

# Split Data

In [None]:
# Hold out 20 % of the rows for testing; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(data2, test_size=0.2, random_state=42)

# The target is the 'y' coordinate; all other columns are predictors.
X_train, y_train = train_df.drop(columns='y'), train_df['y']
X_test, y_test = test_df.drop(columns='y'), test_df['y']

print("length of data for training:", X_train.shape[0])
print("length of data for testing:", X_test.shape[0])

# RandomForestRegressor

In [None]:
# Fit a seeded 100-tree random forest on the training split.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows and report the coefficient of determination.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("r² score:", r2)

# Feature Importance

In [None]:
# Importances reported by the fitted forest, one value per predictor column.
importance = model.feature_importances_

# Indices that order the predictors from least to most important.
s_id = importance.argsort()
pos = np.arange(len(s_id))

# Rescale the importances to the range [0, 1].
scaler = MinMaxScaler()
importance_scaled = scaler.fit_transform(importance.reshape(-1, 1)).ravel()

# Share of each predictor in the sum of the rescaled importances, in percent.
total_importance = importance_scaled.sum()
percentage_importance = importance_scaled / total_importance * 100

# Keep the ten most important predictors (still in ascending order).
top_n = 10
s_id = s_id[-top_n:]
features = pd.DataFrame({
    'index1': np.array(X_train.columns)[s_id],
    'importance_normalized': importance_scaled[s_id],
    'percentage_importance': percentage_importance[s_id],
})

features

# Plot Feature Importance

In [None]:
# Display names for the x-axis tick labels.
row_labels = {
    'feature1': 'feature 1',
    'feature2': 'feature 2'
}

# Series.map with a dict turns every name that has no entry into NaN, which
# would blank out its tick label. Falling back to the original name keeps all
# bars labelled, also when this cell is run a second time.
features['index1'] = features['index1'].map(lambda name: row_labels.get(name, name))

ax = features.plot.bar(x='index1', y='importance_normalized', color='darkgray',
                       legend=False, figsize=(10, 5), width=0.8, fontsize=20)
plt.xlabel('')
plt.ylabel('Importance', fontsize=20)

# Print each percentage just above its bar. The bar heights are min-max scaled,
# so height + 0.01 stays below the y-limit of 1.1 set further down and needs
# no clamping.
bar_values = zip(features['importance_normalized'], features['percentage_importance'])
for i, (height, pct) in enumerate(bar_values):
    ax.text(i, height + 0.01, f'{pct:.1f}%', ha='center', va='bottom', fontsize=16, color='black')

plt.title('Top 10 Features DensMAP-2', fontsize=30, loc='left')
# add r2 score
#plt.text(0.98, 1.03, f'r² score: {r2:.2f}', horizontalalignment='right', verticalalignment='center', transform=ax.transAxes, fontsize=12, fontweight='normal')
plt.ylim(0, 1.1)
plt.xticks(rotation=45, ha='right')

# savefig does not create missing directories, so make sure the target exists.
Path('png').mkdir(parents=True, exist_ok=True)
plt.savefig('png/DensMAP2.png', dpi=300, bbox_inches='tight')

plt.show()

# Concatenate Features

In [None]:
# Put the column stored earlier back next to the normalised features and the
# embedding coordinates, then show the combined table.
frames = [data, feature]
data = pd.concat(frames, axis="columns")
data

# Save DensMAP Results

In [26]:
# Write the combined table to disk without the row index.
output_path = "path/DMAP.csv"
data.to_csv(output_path, index=False)