In [1]:
import gzip
import scanpy as sc
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix



In [2]:
import matplotlib.pyplot as plt

In [3]:
expression_file = 'C:/Users/ssaba/OneDrive/Documents/Big Projects/scRNA Python/Datasets/GSE75688_GEO_processed_Breast_Cancer_raw_TPM_matrix.txt.gz'
sample_info_file = 'C:/Users/ssaba/OneDrive/Documents/Big Projects/scRNA Python/Datasets/GSE75688_final_sample_information.txt.gz'

In [4]:
with gzip.open(expression_file, 'rt') as f:
    expression_data = pd.read_csv(f, delimiter='\t', index_col=0)

In [5]:
pat = "BC03|BC07"
exp_data = expression_data.iloc[:,expression_data.columns.str.contains(pat)]
mask = ~exp_data.columns.str.contains("Pooled")
exp_data = exp_data.loc[:, mask]
target_col = {col:"TLN" if "LN" in col else "Tumour" for col in exp_data.columns}

In [6]:
adata = sc.AnnData(exp_data.T)

In [7]:
adata.obs["Target"] = target_col.values()

In [8]:
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.scale(adata)
sc.tl.pca(adata, n_comps=10)

In [9]:
sc.pp.highly_variable_genes(adata,subset=True, n_top_genes=2000)

In [10]:
adata.var_names

Index(['ENSG00000001561.6', 'ENSG00000001631.10', 'ENSG00000002330.9',
       'ENSG00000003056.3', 'ENSG00000003436.10', 'ENSG00000004799.7',
       'ENSG00000004975.7', 'ENSG00000005100.8', 'ENSG00000005243.5',
       'ENSG00000005700.10',
       ...
       'ENSG00000272966.1', 'ENSG00000272990.1', 'ENSG00000273221.1',
       'ENSG00000273259.1', 'ENSG00000273320.1', 'ENSG00000273345.1',
       'ENSG00000273424.1', 'ENSG00000273449.1', 'ENSG00000273451.1',
       'ENSG00000273489.1'],
      dtype='object', name='gene_id', length=2000)

In [11]:
adata.obs

Unnamed: 0,Target
BC07_Tumor,Tumour
BC03_03,Tumour
BC03_06,Tumour
BC03_09,Tumour
BC03_11,Tumour
...,...
BC07LN_90,TLN
BC07LN_91,TLN
BC07LN_94,TLN
BC07LN_95,TLN


In [12]:
adata.X

array([[ 9.11296546e-02,  5.77406412e-01,  3.65301351e-03, ...,
         4.89703847e+00,  1.02973417e-02,  1.45418851e-01],
       [-3.08470439e-01, -4.90250857e-01, -3.49352189e-01, ...,
        -9.20487245e-02, -1.78026939e-01, -3.54455370e-01],
       [-3.08470439e-01, -4.90250857e-01, -3.49352189e-01, ...,
        -9.20487245e-02, -1.78026939e-01,  2.59851812e+00],
       ...,
       [-3.08470439e-01, -4.80544696e-01, -2.81293095e-01, ...,
        -9.20487245e-02, -1.78026939e-01, -3.72283240e-01],
       [-3.08470439e-01,  2.77465463e-01, -3.49352189e-01, ...,
        -9.20487245e-02, -1.78026939e-01, -3.40935895e-01],
       [-3.08470439e-01, -4.90250857e-01, -3.49352189e-01, ...,
        -9.20487245e-02, -1.78026939e-01, -3.72283240e-01]])

In [13]:
df = pd.DataFrame(adata.X, columns = adata.var_names, index = adata.obs_names)

In [14]:
df.shape

(197, 2000)

In [15]:
import seaborn as sns
all_values = gene_expression.values.flatten()

# Plot the overall distribution
sns.histplot(all_values, bins=100, kde=True)
plt.title('Overall Distribution of Gene Expression Values')
plt.xlabel('Expression Level')
plt.ylabel('Frequency')
plt.show()

NameError: name 'gene_expression' is not defined

In [15]:
df

gene_id,ENSG00000001561.6,ENSG00000001631.10,ENSG00000002330.9,ENSG00000003056.3,ENSG00000003436.10,ENSG00000004799.7,ENSG00000004975.7,ENSG00000005100.8,ENSG00000005243.5,ENSG00000005700.10,...,ENSG00000272966.1,ENSG00000272990.1,ENSG00000273221.1,ENSG00000273259.1,ENSG00000273320.1,ENSG00000273345.1,ENSG00000273424.1,ENSG00000273449.1,ENSG00000273451.1,ENSG00000273489.1
BC07_Tumor,0.091130,0.577406,0.003653,1.370028,0.202626,0.176885,-0.124550,-0.114687,1.833266,0.283443,...,-0.09865,-0.124703,1.070900,-0.182148,-0.177956,-0.065038,-0.167344,4.897038,0.010297,0.145419
BC03_03,-0.308470,-0.490251,-0.349352,1.407542,-0.331565,-0.181651,-0.197900,0.455748,-0.106144,-0.608594,...,-0.09865,-0.224135,-0.203544,-0.182148,-0.177956,-0.282549,-0.167344,-0.092049,-0.178027,-0.354455
BC03_06,-0.308470,-0.490251,-0.349352,-0.034456,-0.336960,-0.181651,-0.197900,0.409725,-0.106144,-0.608594,...,-0.09865,-0.224135,-0.203544,-0.182148,-0.177956,-0.033273,-0.167344,-0.092049,-0.178027,2.598518
BC03_09,-0.308470,0.697497,-0.349352,-0.655066,2.768817,-0.181651,-0.197900,-0.311299,-0.106144,0.556501,...,-0.09865,-0.224135,-0.203544,-0.182148,-0.177956,-0.282549,-0.167344,-0.092049,-0.178027,1.762013
BC03_11,0.627934,-0.490251,-0.349352,0.006041,1.405165,-0.181651,-0.197900,-0.197178,-0.106144,3.994792,...,-0.09865,-0.224135,-0.086401,-0.182148,-0.177956,-0.282549,-0.167344,-0.092049,-0.178027,-0.372283
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BC07LN_90,-0.308470,0.425073,-0.349352,-0.655066,-0.335506,-0.181651,10.907255,-0.367174,-0.106144,-0.608594,...,-0.09865,-0.224135,-0.203544,-0.182148,-0.177956,-0.282549,-0.167344,-0.092049,-0.178027,-0.372283
BC07LN_91,0.638791,-0.316096,-0.286664,-0.083706,-0.332187,-0.181651,-0.112830,-0.120248,-0.106144,-0.580661,...,-0.09865,-0.224135,-0.037642,-0.182148,-0.177956,-0.282549,-0.167344,-0.092049,-0.178027,-0.372283
BC07LN_94,-0.308470,-0.480545,-0.281293,2.505363,-0.336960,-0.174234,-0.197900,1.848152,-0.106144,2.540791,...,-0.09865,-0.224135,-0.128659,-0.182148,-0.177956,-0.282549,-0.167344,-0.092049,-0.178027,-0.372283
BC07LN_95,-0.308470,0.277465,-0.349352,-0.655066,-0.336960,-0.181651,-0.145069,-0.367174,-0.106144,-0.233316,...,-0.09865,-0.224135,-0.203544,-0.182148,-0.177956,-0.233104,-0.167344,-0.092049,-0.178027,-0.340936


In [16]:
target = adata.obs.Target

In [None]:
#############################

In [17]:
rfc = RandomForestClassifier(n_jobs=10, n_estimators=100)
rfc.fit(adata.X,target)

RandomForestClassifier(n_jobs=10)

In [18]:
rfc.feature_importances_

array([0.00060548, 0.00033489, 0.        , ..., 0.        , 0.00020013,
       0.00018344])

In [19]:
importance_df = pd.DataFrame(rfc.feature_importances_, adata.var_names).sort_values(0,ascending=False)

In [24]:
importance_df.columns = ["importance"]

In [None]:
##########################

In [34]:
X_train, X_test, y_train, y_test = train_test_split(df,target,test_size=0.2,random_state=42)

In [35]:
RFC = RandomForestClassifier(n_jobs=10,n_estimators=100)
RFC.fit(X_train,y_train)

RandomForestClassifier(n_jobs=10)

In [37]:
y_preds = RFC.predict(X_test)

In [39]:
from sklearn.metrics import accuracy_score

In [40]:
accuracy = accuracy_score(y_test, y_preds)

In [41]:
accuracy

0.725

In [31]:
!pip3 install leidenalg

Defaulting to user installation because normal site-packages is not writeable
Collecting leidenalg
  Downloading leidenalg-0.10.2-cp38-abi3-win_amd64.whl (1.6 MB)
     ---------------------------------------- 1.6/1.6 MB 2.1 MB/s eta 0:00:00
Installing collected packages: leidenalg
Successfully installed leidenalg-0.10.2


