In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
import xgboost as xgb
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import time

In [None]:
# Read the workbook; the first spreadsheet column is used as the row index.
df = pd.read_excel(
    'GTEx_pancreas_liver_images_liverfat_pancreasfat.xlsx',
    index_col=0,
)

# Covariate, pathology and fat-percentage column names for both organs.
# The list is only defined here; nothing in this cell consumes it yet.
cols = [
    'Sex', 'Age.Bracket', 'Hardy.Scale',
    'Pathology.Categories_pancreas', 'Pathology.Categories_liver',
    'Fat.Percentage_liver', 'Fat.Percentage_pancreas',
]
# df1 = df[df.columns[~df.columns.isin(cols)]]

In [None]:
# Liver-only view of the table: three covariates, the liver pathology
# categories and the liver fat percentage.
# ('Pathology.Notes_liver' is deliberately left out for now.)
cols_liver = [
    'Sex', 'Age.Bracket', 'Hardy.Scale',
    'Pathology.Categories_liver',
    'Fat.Percentage_liver',
]
df_liver = df.loc[:, cols_liver]
df_liver.head()

In [None]:
# Target: liver fat percentage discretised into four quantile bins, encoded
# as integer bin indices (labels=False).
# Alternatives tried: the raw percentage, or pd.cut(..., bins=4, labels=False).
df0 = pd.qcut(df_liver['Fat.Percentage_liver'], q=4, labels=False)

# One-hot encode the two plain categorical covariates.
df1 = pd.get_dummies(df_liver[['Sex', 'Age.Bracket']])

# Delimiter-separated fields become one indicator column per token.
df2 = (
    df_liver['Hardy.Scale']
    .str.get_dummies(sep='-')
    .add_prefix('Hardy.Scale_')
)
df3 = (
    df_liver['Pathology.Categories_liver']
    .str.get_dummies(sep=',')
    .add_prefix('Pathology.Categories_liver_')
)
# df4 = df_liver['Pathology.Notes_liver'].str.get_dummies(sep=',').add_prefix('Pathology.Notes_liver_')

# Features first, binned target ('Fat.Percentage_liver') as the last column.
result = pd.concat([df1, df2, df3, df0], axis=1, sort=False)
result.head()

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix is every column of `result` except the binned target.
X = result.drop(columns='Fat.Percentage_liver').values
y = result['Fat.Percentage_liver'].values

# Hold out 33% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# df_liver['Fat.Percentage_liver'].sort_values().plot()
# df_liver['Fat.Percentage_liver'].hist(bins=100)

In [None]:
# Fit a 2-D t-SNE embedding of the encoded feature columns and report the
# wall-clock time taken.
time_start = time.time()

# Use only real input features: drop the target, and also any embedding
# columns that the plotting cells add to `result` (present when this cell is
# re-run after them), so the embedding is never fitted on its own output.
tsne_features = (
    result
    .drop(columns='Fat.Percentage_liver')
    .drop(columns=['tsne-2d-one', 'tsne-2d-two', 'pca-one', 'pca-two'], errors='ignore')
)

# Newer scikit-learn releases name the iteration limit `max_iter`; older ones
# only accept `n_iter`. Pick whichever keyword the installed TSNE exposes.
iter_param = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'

# random_state pins the stochastic initialisation/optimisation so the
# embedding (and the plot built from it) is reproducible across runs.
tsne = TSNE(
    n_components=2,
    verbose=1,
    perplexity=40,
    random_state=42,
    **{iter_param: 300},
)
tsne_results = tsne.fit_transform(tsne_features.values)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [None]:
# Attach the two t-SNE coordinates to `result` so seaborn can address them
# by column name.
result['tsne-2d-one'], result['tsne-2d-two'] = tsne_results[:, 0], tsne_results[:, 1]

# Scatter of the embedding, coloured by the binned liver fat percentage
# (four-colour "hls" palette).
plt.figure(figsize=(16, 10))
sns.scatterplot(
    data=result,
    x="tsne-2d-one",
    y="tsne-2d-two",
    hue="Fat.Percentage_liver",
    palette=sns.color_palette("hls", 4),
    legend="full",
    alpha=0.8,
)

In [None]:
# Project the encoded feature columns onto their first two principal
# components and print the variance ratio each one explains.
#
# By the time this cell runs, `result` already carries the t-SNE coordinate
# columns added by the plotting cell above (and the PCA columns on a re-run).
# Those are derived outputs, not input features, so they are excluded here
# together with the target; otherwise they would be fed into the PCA.
pca_features = (
    result
    .drop(columns='Fat.Percentage_liver')
    .drop(columns=['tsne-2d-one', 'tsne-2d-two', 'pca-one', 'pca-two'], errors='ignore')
)

pca = PCA(n_components=2)
pca_result = pca.fit_transform(pca_features.values)
print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

In [None]:
# Attach the two principal-component scores to `result` so seaborn can
# address them by column name.
result['pca-one'], result['pca-two'] = pca_result[:, 0], pca_result[:, 1]
# result['pca-three'] = pca_result[:,2]

# Scatter of the PCA projection, coloured by the binned liver fat percentage
# (four-colour "hls" palette).
plt.figure(figsize=(16, 10))
sns.scatterplot(
    data=result,
    x="pca-one",
    y="pca-two",
    hue="Fat.Percentage_liver",
    palette=sns.color_palette("hls", 4),
    legend="full",
    alpha=0.3,
)

In [None]:
# Drop one column from every pair of columns in `result` whose absolute
# pairwise correlation exceeds 0.95.
# NOTE(review): `result` is used as-is, so the binned target and any
# embedding columns added by earlier cells take part in this filter too.

# Absolute pairwise correlation matrix.
corr_matrix = result.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# pair is examined once; everything else becomes NaN.
# The built-in `bool` is used because the `np.bool` alias was removed from
# NumPy and raises AttributeError on current releases.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Columns that correlate above 0.95 with some column earlier in the frame.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]
result1 = result.drop(columns=to_drop)
result1

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of
# `result`, rounded to two decimals.
f, ax = plt.subplots(figsize=(16, 9))
corr = result.corr()
hm = sns.heatmap(
    corr.round(2),
    annot=True,
    fmt='.2f',
    cmap="coolwarm",
    linewidths=.05,
    ax=ax,
)
# # f.subplots_adjust(top=0.93)
# t= f.suptitle('Wine Attributes Correlation Heatmap', fontsize=14)

In [1]:
# Load the R serialized object 'countMatrixLiver.rds' through rpy2.
# The recorded output of this cell is a ModuleNotFoundError for 'rpy2', so the
# package was not installed in the environment where it was last run.
#
# Earlier attempt with pyreadr, kept for reference:
# import pyreadr
# result = pyreadr.read_r('countMatrixLiver.rds')
# df = result[None]

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
# presumably switches on automatic R <-> pandas conversion — confirm against
# the installed rpy2 version
pandas2ri.activate()

# Look up R's readRDS function and call it on the file.
readRDS = robjects.r['readRDS']
# NOTE: this rebinds `df`, replacing the DataFrame loaded from the Excel
# workbook in the earlier cell.
df = readRDS('countMatrixLiver.rds')
# NOTE(review): `ri2py` looks like the rpy2 2.x name for this conversion;
# rpy2 3.x appears to expose it as `rpy2py` — verify before relying on it.
df = pandas2ri.ri2py(df)

ModuleNotFoundError: No module named 'rpy2'

In [None]:
# df_test = pd.read_excel('GTEx_pancreas_liver_images_liverfat_pancreasfat_seq.xlsx', index_col=0)
# test_cols = [
#     'Sex',
#     'Age.Bracket',
#     'Hardy.Scale',
#     'Pathology.Categories_pancreas',
#     'Pathology.Categories_liver',
#     'Fat,Percentage_liver',
#     'Fat,Percentage_pancreas'
#        ]
# df_test = df_test[test_cols]

In [None]:
# categorical_features = [
#     'Sex',
#     'Age.Bracket',
#     'Hardy.Scale',
# ]

# fig, ax = plt.subplots(1, len(categorical_features), figsize=(16,9))
# for i, categorical_feature in enumerate(df[categorical_features]):
#     df[categorical_feature].value_counts().plot(kind = "bar", ax=ax[i]).set_title(categorical_feature)
#     plt.tight_layout()
# fig.show()

In [None]:
# categorical_features = [
#     'Pathology.Categories_pancreas',
#     'Pathology.Categories_liver'
# ]

# fig, ax = plt.subplots(1, len(categorical_features), figsize=(16,9))
# for i, categorical_feature in enumerate(df[categorical_features]):
#     df[categorical_feature].value_counts().plot(kind = "bar", ax=ax[i]).set_title(categorical_feature)
#     plt.tight_layout()
# fig.show()

In [None]:
# df[['Fat.Percentage_liver']].sort_values('Fat.Percentage_liver').plot(kind='bar', figsize=(16,9))

In [None]:
# df[['Fat.Percentage_pancreas']].sort_values('Fat.Percentage_pancreas').plot(kind='bar', figsize=(16,9))

In [None]:
# cols1 = [
#     'Sex',
#     'Age.Bracket',
#     'Hardy.Scale',
#     'Pathology.Categories_pancreas',
#     'Pathology.Categories_liver',
#        ]
# X_train = df1[cols1]
# y_train = df1['Fat.Percentage_liver']
# X_test = df_test[cols1]
# y_test = df_test['Fat,Percentage_liver']

In [None]:
# xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
#                 max_depth = 5, alpha = 10, n_estimators = 10)
# xg_reg.fit(X_train,y_train)

In [None]:
# clf = tree.DecisionTreeClassifier()
# clf = clf.fit(X_train, y_train)
# clf.predict(X_test)