# Comparing human lung data to mouse tissues
Variations tested:
* Based on protein abundance
* Normalizing all data together
* iBAQ abundance values

Variations to test:
* Based on peptide abundance
* Normalizing mouse and human data separately
* LFQ abundance values

In [50]:
import os
import sys
# Add the parent directory to the import search path, once.
# (Presumably where the project-local helper modules live - confirm.)
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [51]:
import Classification_Utils as cu
import MaxQuant_Postprocessing_Functions as mq
import pandas as pd
from sklearn.decomposition import PCA, NMF
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

## Load mouse data

In [52]:
# Load the mouse proteinGroups table, clean it with the mq helpers, and build
# iBAQ and LFQ tables indexed by 'Majority protein IDs'.
#
# Raw string: the path contains a backslash, and '\p' in an ordinary literal is
# an invalid escape sequence (deprecated; a SyntaxWarning on newer Pythons).
# The resulting string value is unchanged.
mouse_protein_file = r"D:\proteinGroups.txt"

mouse_protein_df = mq.load_df(mouse_protein_file)
mouse_protein_df = mq.clean_weakly_identified(mouse_protein_df)
mouse_protein_df = mq.remove_dup_proteinIDs(mouse_protein_df)

# Note the trailing space in 'iBAQ ' (presumably so only per-sample
# 'iBAQ <sample>' columns are selected - confirm in mq.slice_by_column).
mouse_iBAQ_df = mq.slice_by_column(mouse_protein_df, 'protein', 'iBAQ ')
mouse_LFQ_df = mq.slice_by_column(mouse_protein_df, 'protein', 'LFQ')

mouse_iBAQ_df.columns = cu.rename_columns(mouse_iBAQ_df, 'Adult', 'Mouse')
mouse_LFQ_df.columns = cu.rename_columns(mouse_LFQ_df, 'Adult', 'Mouse')

mouse_groups = ['Brain', 'Heart', 'Kidney', 'Liver', 'Lung']
mouse_organ_to_columns = {}
mouse_organ_counts = {}

# Drop the last six characters of every protein ID (presumably a species suffix
# such as '_MOUSE' - the remapping cell re-appends '_MOUSE' for lookups), then
# use the shortened ID as the row index.
mouse_iBAQ_df['Majority protein IDs'] = mouse_iBAQ_df['Majority protein IDs'].str[:-6]
mouse_LFQ_df['Majority protein IDs'] = mouse_LFQ_df['Majority protein IDs'].str[:-6]
mouse_iBAQ_df.set_index('Majority protein IDs', inplace=True)
mouse_LFQ_df.set_index('Majority protein IDs', inplace=True)

## Load human data

* Human dataset info:
    * Instrument: QExactHF03
    * Separation Type: LC-Waters-Formic_3hr
    * Tool: MSGFPlus_MzMl
    * Jobs: 1498824-1498852
    * Param file: MSGFDB_PartTryp_MetOx_StatCysAlk_10ppmParTol.txt
    * Unlabelled samples

In [53]:
# Load the human lung proteinGroups table, run it through the mq clean-up
# helpers, and build iBAQ / LFQ tables indexed by 'Majority protein IDs'.
human_lung_protein_file = r'F:\Human_Lung_Raw_Files\LungMAP\combined\txt\human_lung_proteinGroups.txt'
human_groups = ['Human_Lung']

human_lung_df = mq.remove_dup_proteinIDs(
    mq.clean_weakly_identified(mq.load_df(human_lung_protein_file))
)

human_lung_organ_columns = {}
human_lung_organ_counts = {}

human_lung_iBAQ_df = mq.slice_by_column(human_lung_df, 'protein', 'iBAQ ')
human_lung_LFQ_df = mq.slice_by_column(human_lung_df, 'protein', 'LFQ')

# Drop the last six characters of each ID (presumably a species suffix such as
# '_HUMAN' - confirm against the input file), then index each table by it.
for quant_df in (human_lung_iBAQ_df, human_lung_LFQ_df):
    quant_df['Majority protein IDs'] = quant_df['Majority protein IDs'].str[:-6]
    quant_df.set_index('Majority protein IDs', inplace=True)

## Load human-mouse correspondence data

In [54]:
# Load the human/mouse mapping table: 'Matched Term' -> common 'Symbol'.
mapping_file = r'D:\Human_Mouse_Mapping.txt'
# Rows are split on carriage returns; any newline characters left inside the
# parsed cells are removed right after loading.
mapping_df = pd.read_csv(mapping_file, usecols=['Matched Term', 'Symbol', 'Species'], sep='\t', lineterminator='\r', encoding='latin1')
mapping_df = mapping_df.replace(r'\n', '', regex=True)

# Keep rows whose "Species" is missing or mentions Human.
# na=False makes the str.contains mask strictly boolean for missing species;
# .copy() detaches the filtered frame so the column assignment below is safe.
mapping_df = mapping_df[mapping_df['Species'].isnull() | mapping_df['Species'].str.contains('Human', na=False)].copy()
mapping_df.set_index('Matched Term', inplace=True)
mapping_df.drop(['Species'], axis=1, inplace=True)

# Remove the ' (includes others)' comment from symbols.
# Series.replace without regex=True only matches cells that are *entirely*
# equal to the pattern, so a substring (regex) replace is required here; the
# result is assigned back rather than relying on in-place mutation of a
# column selection.
mapping_df['Symbol'] = mapping_df['Symbol'].replace(r' \(includes others\)', '', regex=True)

In [55]:
#########################
#
# Change mouse proteinIDs to common symbol
#
#########################

mouse_proteins = mouse_iBAQ_df.index.values.tolist()
human_proteins = human_lung_iBAQ_df.index.values.tolist()
raw_mappings = mapping_df.to_dict('index') # {mouse protein: {'Symbol': common protein}}
mappings = {}

# Break up rows with multiple mouse proteins: a 'Matched Term' may hold several
# whitespace-separated IDs, each of which gets the same mapping entry.
for old_key, val in raw_mappings.items():
    for new_key in old_key.split():
        mappings[new_key] = val

# Build the complete old-ID -> symbol map first and apply it in one pass.
# Replacing values in the whole DataFrame once per protein is quadratic, and
# can chain: a symbol written by one iteration could be rewritten by a later one.
human_protein_set = set(human_proteins)  # O(1) membership tests
protein_renames = {}
for protein in mouse_proteins:
    if protein in human_protein_set:
        continue  # ID already matches a human ID; leave it untouched
    mapping = mappings.get(protein + '_MOUSE')
    # Skip missing symbols so a row never ends up with a NaN index label.
    if mapping is not None and pd.notna(mapping['Symbol']):
        protein_renames[protein] = mapping['Symbol']

# Each index label is looked up once; the index keeps its name.
mouse_iBAQ_df.rename(index=protein_renames, inplace=True)

In [56]:
# Preview the first rows of the mouse iBAQ table after the ID remapping.
print(mouse_iBAQ_df.head())

                      iBAQ Mouse_04_Liver  iBAQ Mouse_05_Liver  \
Majority protein IDs                                             
1433B                          80377000.0          106810000.0   
1433E                         251680000.0          225180000.0   
1433F                          32883000.0           46963000.0   
1433G                         175610000.0          166310000.0   
1433S                          53834000.0           62327000.0   

                      iBAQ Mouse_06_Liver  iBAQ Mouse_07_Brain  \
Majority protein IDs                                             
1433B                         129430000.0         6.599400e+08   
1433E                         266450000.0         1.231800e+09   
1433F                          44594000.0         7.019100e+08   
1433G                         193140000.0         1.754000e+09   
1433S                          93074000.0         5.072200e+08   

                      iBAQ Mouse_07_Heart  iBAQ Mouse_07_Kidney  \
Majorit

## Combine data 

### Normalize Separately 

### Normalize Together 

In [57]:
#########################
#
# Join mouse data to human data
#
#########################

# DataFrame.join defaults to a left join on the index: every mouse row is kept
# and human columns are filled only where the index labels match.
combined_df = mouse_iBAQ_df.join(human_lung_iBAQ_df)

# One pattern per organ group (presumably matched as regexes against column
# names inside the mq helpers - confirm).
all_organs = ['Mouse.*Brain', 'Mouse.*Heart', 'Mouse.*Kidney', 'Mouse.*Liver', 'Mouse.*Lung', 'Human_Lung']
# Passed in empty and read by later cells, so presumably filled by
# mq.filter_low_observed - confirm.
organs_to_columns = {}
organs_to_observed_counts = {}

combined_df = mq.filter_low_observed(combined_df, all_organs, organs_to_columns, organs_to_observed_counts)
# Return values are discarded, so these two helpers presumably modify
# combined_df in place - confirm in mq.
mq.log2_normalize(combined_df)
mq.median_normalize(combined_df)
combined_df = mq.reorder_columns(combined_df, all_organs, organs_to_columns)

  df.iloc[:,:] = np.log2(df.iloc[:,:])


## Plots: Normalized Boxplot, PCA, Pearson matrix

In [58]:
# Output prefix handed to the plotting helpers.
# NOTE(review): this raw literal ends in two backslash characters, so
# combined_dir contains a doubled separator before 'Human_Lung_Mouse_Tissues_'.
base_dir = r'D:\Images\Classifier\\'
combined_dir = base_dir + 'Human_Lung_Mouse_Tissues_'
combined_color_mapping = mq.map_colors(all_organs, organs_to_columns)

mq.make_seaborn_boxplot(combined_df, combined_dir, 'Median Normalized Boxplot', combined_color_mapping)

# Imputation runs after the boxplot call, so the boxplot receives the
# un-imputed table; everything below uses the imputed one.
combined_df = mq.impute_missing(combined_df)

# Column labels of the imputed table, passed to the PCA plot.
all_columns = combined_df.columns.values.tolist()

In [59]:
# Run PCA on the imputed combined table via the mq helper.
combined_pca, combined_pca_data = mq.do_pca(combined_df, 'protein')

# The scree-plot helper returns two values that are passed straight on to the
# PCA scatter plot (presumably per-component variance and axis labels - confirm).
combined_per_var, combined_labels = mq.make_scree_plot(combined_pca, combined_dir)
mq.draw_pca_graph(all_columns, combined_pca_data, combined_dir, combined_color_mapping, combined_per_var, combined_labels)

In [60]:
# Pearson matrix for the combined table, using the same output prefix as the other plots.
mq.make_pearson_matrix(combined_df, combined_dir, dimensions=(20,15))

## Classifiers 

In [61]:
# Split the combined table by species: mouse columns become the training set,
# human lung columns the test set. Both are transposed so that each original
# column becomes a row.
human_lung_cols = list(human_lung_iBAQ_df.columns)
mouse_cols = list(mouse_iBAQ_df.columns)

mouse_data = combined_df.loc[:, mouse_cols].transpose()
human_lung_data = combined_df.loc[:, human_lung_cols].transpose()

In [62]:
# Split the organ -> columns lookup by species, based on the key text.
mouse_organs_to_columns = {
    organ: columns
    for organ, columns in organs_to_columns.items()
    if 'Mouse' in organ
}
human_organs_to_columns = {
    organ: columns
    for organ, columns in organs_to_columns.items()
    if 'Human' in organ
}

In [63]:
# Training labels (mouse) and test labels (human), with the species prefix
# removed so both sets share the same organ names.
# str.replace is literal here: 'Mouse.*' is removed as plain text, not as a regex.
mouse_labels = [
    organ.replace('Mouse.*', '')
    for organ in cu.get_labels(mouse_data, mouse_cols, mouse_organs_to_columns)
]

human_lung_labels = [
    organ.replace('Human_', '')
    for organ in cu.get_labels(human_lung_data, human_lung_cols, human_organs_to_columns)
]

### Decision Tree

In [64]:
# Decision tree on the mouse samples; the third argument is 4 (presumably the
# number of cross-validation folds - four scores are printed).
dt = cu.decisiontree_model_crossval(mouse_data, mouse_labels, 4)

Scores: [ 0.9  0.9  1.   1. ]
Accuracy: 0.95 (+/- 0.10)


In [65]:
# Evaluate the decision tree on the human lung samples.
dt_pred = cu.make_test_prediction(dt, human_lung_data, human_lung_labels)

print("\n")
# Third argument 0: presumably the index of the sample to report - confirm in cu.
cu.show_prediction_probabilities(dt, human_lung_data, 0)

score 1.0
pred ['Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung'
 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung'
 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung']
actual ['Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung']


Prediction probabilities for sample:
Brain : 0.0
Heart : 0.0
Kidney : 0.0
Liver : 0.0
Lung : 1.0


### Random Forest

In [66]:
# Random forest on the mouse samples, with the same third argument (4) as the other classifier cells.
rf = cu.randomforest_model_crossval(mouse_data, mouse_labels, 4)

Scores: [ 1.  1.  1.  1.]
Accuracy: 1.00 (+/- 0.00)


In [67]:
# Evaluate the random forest on the human lung samples.
rf_pred = cu.make_test_prediction(rf, human_lung_data, human_lung_labels)

print("\n")
# Third argument 0: presumably the index of the sample to report - confirm in cu.
cu.show_prediction_probabilities(rf, human_lung_data, 0)

score 0.931034482759
pred ['Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Liver'
 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung'
 'Liver' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung']
actual ['Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung']


Prediction probabilities for sample:
Brain : 0.2
Heart : 0.2
Kidney : 0.1
Liver : 0.1
Lung : 0.4


### KNN

In [68]:
# K-nearest-neighbours on the mouse samples, with the same third argument (4) as the other classifier cells.
knn = cu.knn_model_crossval(mouse_data, mouse_labels, 4)

Scores: [ 1.  1.  1.  1.]
Accuracy: 1.00 (+/- 0.00)


In [69]:
# Evaluate the KNN model on the human lung samples.
knn_pred = cu.make_test_prediction(knn, human_lung_data, human_lung_labels)

print("\n")
# NOTE(review): this cell passes 4 as the third argument where the other
# classifier cells pass 0 - confirm whether that is intentional.
cu.show_prediction_probabilities(knn, human_lung_data, 4)

score 1.0
pred ['Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung'
 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung'
 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung']
actual ['Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung']


Prediction probabilities for sample:
Brain : 0.0
Heart : 0.0
Kidney : 0.0
Liver : 0.0
Lung : 1.0


### Naive Bayes

In [70]:
# Gaussian naive Bayes on the mouse samples, with the same third argument (4) as the other classifier cells.
gnb = cu.bayes_gaussian_model_crossval(mouse_data, mouse_labels, 4)

Scores: [ 0.6  0.6  0.6  0.4]
Accuracy: 0.55 (+/- 0.17)


In [71]:
# Evaluate the naive Bayes model on the human lung samples.
gnb_pred = cu.make_test_prediction(gnb, human_lung_data, human_lung_labels)

print("\n")
# Third argument 0: presumably the index of the sample to report - confirm in cu.
cu.show_prediction_probabilities(gnb, human_lung_data, 0)

score 1.0
pred ['Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung'
 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung'
 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung']
actual ['Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung']


Prediction probabilities for sample:
Brain : 0.0
Heart : 0.0
Kidney : 0.0
Liver : 0.0
Lung : 1.0


### SVC variations

In [72]:
# Returns an indexable collection of SVC models (the next cell uses models[0]);
# one Scores/Accuracy pair is printed per variant.
models = cu.SVC_models_crossval(mouse_data, mouse_labels, 4)

Scores: [ 1.  1.  1.  1.]
Accuracy: 1.00 (+/- 0.00)
Scores: [ 1.  1.  1.  1.]
Accuracy: 1.00 (+/- 0.00)
Scores: [ 0.2  0.2  0.2  0.2]
Accuracy: 0.20 (+/- 0.00)
Scores: [ 1.  1.  1.  1.]
Accuracy: 1.00 (+/- 0.00)


In [73]:
# Evaluate only the first SVC variant on the human lung samples.
svc_pred = cu.make_test_prediction(models[0], human_lung_data, human_lung_labels)

print("\n")
# Third argument 0: presumably the index of the sample to report - confirm in cu.
cu.show_prediction_probabilities(models[0], human_lung_data, 0)

score 1.0
pred ['Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung'
 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung'
 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung']
actual ['Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung']


Prediction probabilities for sample:
Brain : 0.106191798979
Heart : 0.0516533781619
Kidney : 0.17164102282
Liver : 0.0809096081987
Lung : 0.58960419184


## Feature Selection 

* SelectKBest
* SelectPercentile
* Recursive elimination
* SelectFromModel

* Feature selection + Transformation + Classifier --> Pipeline
* Grid Search for best hyperparameters

### SelectKBest, SelectPercentile

In [75]:
# Univariate feature selection on the mouse training data. No score function is
# passed, so scikit-learn's default (f_classif, the ANOVA F-test) is used.
from sklearn.feature_selection import SelectKBest, SelectPercentile

print('Original data:', mouse_data.shape)

# Keep the 25 highest-scoring features.
kbest_data = SelectKBest(k=25).fit_transform(mouse_data, mouse_labels)
print('SelectKBest:', kbest_data.shape)

# Keep the top 10 percent of features by score.
percentile_data = SelectPercentile(percentile=10).fit_transform(mouse_data, mouse_labels)
print('SelectPercentile:', percentile_data.shape)

Original data: (30, 2218)
SelectKBest: (30, 25)
SelectPercentile: (30, 222)


### Select From Model
* Classifier computes feature importances and discards irrelevant features

In [88]:
# Model-based feature selection using a fitted extra-trees classifier.
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# NOTE(review): no random_state is set, so the fitted trees (and therefore the
# selected features) can differ between runs.
etc = ExtraTreesClassifier()
etc = etc.fit(mouse_data, mouse_labels)

# prefit=True: reuse the classifier fitted above instead of refitting it.
model = SelectFromModel(etc, prefit=True)
from_model_data = model.transform(mouse_data)
print('Select From Model:', from_model_data.shape)

Select From Model: (30, 40)


### Pipelines
* Chain together feature elimination, reduction, and classification

In [41]:
# Pipeline: top-10% univariate feature filter -> PCA -> KNN, fitted on the
# mouse data and evaluated on the human lung data.
# SelectPercentile comes from the import in the SelectKBest/SelectPercentile cell.
anova_filter = SelectPercentile(percentile=10)
clf = KNeighborsClassifier()

anova_knn_pipeline = Pipeline([('anova', anova_filter), 
                               ('pca', PCA()),
                               ('knn', clf)])

anova_knn_pipeline.fit(mouse_data, mouse_labels)
pipeline_pred = cu.make_test_prediction(anova_knn_pipeline, human_lung_data, human_lung_labels)

score 1.0
pred ['Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung'
 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung'
 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung']
actual ['Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung']


In [94]:
# Pipeline: model-based feature filter -> PCA -> KNN.
# SelectFromModel is built without prefit=True here, so the selector is fitted
# as part of Pipeline.fit.
model_filter = SelectFromModel(etc)
clf = KNeighborsClassifier()

# NOTE(review): the first step is labelled 'anova' although it wraps
# SelectFromModel rather than an ANOVA filter.
model_knn_pipeline = Pipeline([('anova', model_filter), 
                               ('pca', PCA()),
                               ('knn', clf)])

model_knn_pipeline.fit(mouse_data, mouse_labels)
# Overwrites pipeline_pred from the previous pipeline cell.
pipeline_pred = cu.make_test_prediction(model_knn_pipeline, human_lung_data, human_lung_labels)

score 0.965517241379
pred ['Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung'
 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung'
 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Brain' 'Lung' 'Lung']
actual ['Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung']


### Grid Search for best parameters

In [27]:
# Exhaustive grid search over SVC kernel and C on the mouse training data.
#
# GridSearchCV is imported from sklearn.model_selection: the old
# sklearn.grid_search module was deprecated and later removed from
# scikit-learn. Another cell already reads cv_results_ / best_index_, which
# the model_selection implementation provides.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

parameters = {'kernel': ('linear', 'rbf', 'poly'),
              'C': [1.5, 10, 100, 1000]}
svr = SVC()  # an SVC classifier, despite the variable name
# Rebinds clf, which earlier pipeline cells used for a KNeighborsClassifier.
clf = GridSearchCV(svr, parameters)
clf.fit(mouse_data, mouse_labels)

# Bare expression: displayed as the cell output in the notebook.
clf.best_params_

{'C': 1.5, 'kernel': 'linear'}

#### SVC Grid Search

In [28]:
# Build the SVC grid search via the cu helper (the meaning of the arguments
# 4 and 1 is defined in cu - confirm there) and fit it on the mouse data.
SVC_grid = cu.svc_grid_search(4, 1)

SVC_grid.fit(mouse_data, mouse_labels)

print('Best SVC parameters:\n', SVC_grid.best_params_)
print('\nBest Cross-Validation score:\n', SVC_grid.best_score_)
#print('\nBest Estimator:\n', SVC_grid.best_estimator_)

Best SVC parameters:
 {'classify__C': 1, 'classify__kernel': 'linear', 'reduce_dim': PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'reduce_dim__n_components': 2}

Best Cross-Validation score:
 1.0


In [29]:
# Class probabilities from the fitted SVC grid search (third argument 0: presumably the sample index - confirm in cu).
cu.show_prediction_probabilities(SVC_grid, human_lung_data, 0)

Prediction probabilities for sample:
Brain : 0.0943720780692
Heart : 0.0350227142548
Kidney : 0.150501235509
Liver : 0.0524669319492
Lung : 0.667637040218


#### KNN Grid Search

In [30]:
# Build the KNN grid search via the cu helper (the meaning of the arguments
# 4 and 1 is defined in cu - confirm there) and fit it on the mouse data.
knn_grid = cu.knn_grid_search(4, 1)

knn_grid.fit(mouse_data, mouse_labels)

print('Best KNN parameters:\n', knn_grid.best_params_)
print('\nBest Cross-Validation score:\n', knn_grid.best_score_)

Best KNN parameters:
 {'classify__n_neighbors': 1, 'reduce_dim': PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'reduce_dim__n_components': 2}

Best Cross-Validation score:
 1.0


In [31]:
# Class probabilities from the fitted KNN grid search (third argument 0: presumably the sample index - confirm in cu).
cu.show_prediction_probabilities(knn_grid, human_lung_data, 0)

Prediction probabilities for sample:
Brain : 0.0
Heart : 0.0
Kidney : 0.0
Liver : 0.0
Lung : 1.0


#### Random Forest Grid Search

In [25]:
# Build the random forest grid search via the cu helper (the meaning of the
# arguments 4 and 1 is defined in cu - confirm there) and fit it on the mouse data.
rf_grid = cu.rf_grid_search(4, 1)

rf_grid.fit(mouse_data, mouse_labels)

print('Best Random Forest parameters:\n', rf_grid.best_params_)
print('\nBest Cross-Validation score:\n', rf_grid.best_score_)

Best Random Forest parameters:
 {'classify__min_samples_split': 2, 'classify__n_estimators': 25, 'reduce_dim': PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'reduce_dim__n_components': 2}

Best Cross-Validation score:
 1.0


In [29]:
# Standard deviation of the test score for the best parameter combination,
# read from cv_results_ at best_index_.
rf_grid.cv_results_['std_test_score'][rf_grid.best_index_]

0.0

In [33]:
# Class probabilities from the fitted random forest grid search (third argument 0: presumably the sample index - confirm in cu).
cu.show_prediction_probabilities(rf_grid, human_lung_data, 0)

Prediction probabilities for sample:
Brain : 0.24
Heart : 0.16
Kidney : 0.0
Liver : 0.0
Lung : 0.6


## Highly expressed proteins
* Top n proteins contributing to PCA
* Tukey test for each organ's top proteins
* Top proteins by mean abundance per organ

In [34]:
# mouse_data is transposed back before being handed to the Tukey helper,
# together with the per-sample organ labels.
tukeydict = mq.make_tukey_dict(mouse_data.T, mouse_labels)

In [35]:
# Top 5 entries returned by mq.top_n_enriched for 'Kidney'.
# The variable was previously called top_brain_proteins although the query
# (and the recorded output) is for kidney; the name now matches the query.
top_kidney_proteins = mq.top_n_enriched(5, 'Kidney', tukeydict)
# Print only the first element of each returned entry.
print([entry[0] for entry in top_kidney_proteins])

['FOLR1', 'PLCG2', 'CATC', 'TPPP', 'NDRG1']


In [36]:
# mouse_data is transposed back before being handed to the abundance helper.
test_dict = cu.get_descending_abundances(mouse_data.T, mouse_labels)

# Request 5 entries for 'Liver' from that result.
top_liver_proteins = cu.n_most_abundant(test_dict, 'Liver', 5)
print(top_liver_proteins)

['HBA' 'FABPL' 'SODC' 'ACBP' 'ASSY']


In [67]:
# Toy example: rank proteins by their mean abundance across the replicate
# columns of one organ.
data = {
    'Protein': ['A', 'B', 'C'],
    'Liver1': [1, 1, 5],
    'Liver2': [1, 2, 6],
    'Heart1': [10, 1, 1],
    'Heart2': [8, 2, 2],
    'Lung': [5, 4, 5],
}

test_df = pd.DataFrame(
    data,
    columns=['Protein', 'Liver1', 'Liver2', 'Heart1', 'Heart2', 'Lung'],
).set_index('Protein')
# Collapse replicate names to the organ name; duplicate labels are intentional.
test_df.columns = ['Liver', 'Liver', 'Heart', 'Heart', 'Lung']

# Selecting a duplicated label returns every column carrying it.
liver_df = test_df.loc[:, 'Liver']
per_protein_mean = liver_df.mean(axis='columns')
sorted_by_abundance = per_protein_mean.sort_values(ascending=False)

print(sorted_by_abundance)

# Top 2 most abundant:
sorted_by_abundance.index.values[:2]

Protein
C    5.5
B    1.5
A    1.0
dtype: float64


array(['C', 'B'], dtype=object)