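# Feature selection using Lasso

Select features from `Brain_GSE50161.csv` by keeping the columns with a non-zero `LassoCV` coefficient, balance the classes with SMOTE, and write the result to `df_lasso.csv`.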

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from imblearn.over_sampling import SMOTE

In [2]:
# Load the data set from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="Brain_GSE50161.csv")

In [3]:
# Display the loaded DataFrame (pandas truncates the middle rows/columns in the rendered output).
df

Unnamed: 0,samples,type,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,834,ependymoma,12.498150,7.604868,6.880934,9.027128,4.176175,7.224920,6.085942,6.835999,...,9.979005,9.926470,12.719785,12.777792,5.403657,4.870548,4.047380,3.721936,4.516434,4.749940
1,835,ependymoma,13.067436,7.998090,7.209076,9.723322,4.826126,7.539381,6.250962,8.012549,...,11.924749,11.215930,13.605662,13.401342,5.224555,4.895315,3.786437,3.564481,4.430891,4.491416
2,836,ependymoma,13.068179,8.573674,8.647684,9.613002,4.396581,7.813101,6.007746,7.178156,...,12.154405,11.532460,13.764593,13.477800,5.303565,5.052184,4.005343,3.595382,4.563494,4.668827
3,837,ependymoma,12.456040,9.098977,6.628784,8.517677,4.154847,8.361843,6.596064,6.347285,...,11.969072,11.288801,13.600828,13.379029,4.953429,4.708371,3.892318,3.759429,4.748381,4.521275
4,838,ependymoma,12.699958,8.800721,11.556188,9.166309,4.165891,7.923826,6.212754,6.866387,...,11.411701,11.169317,13.751442,13.803646,4.892677,4.773806,3.796856,3.577544,4.504385,4.541450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,959,pilocytic_astrocytoma,12.658228,8.843270,7.672655,9.125912,5.495477,8.603892,7.747514,5.828978,...,13.170441,12.676080,14.124837,13.996436,4.913579,4.399176,3.878855,3.680103,4.726784,4.564637
126,960,pilocytic_astrocytoma,12.812823,8.510550,8.729699,9.104402,3.967228,7.719089,7.092496,6.504812,...,13.040267,12.403316,13.978009,13.812916,5.189600,4.912618,3.764800,3.664920,4.628355,4.761351
127,961,pilocytic_astrocytoma,12.706991,8.795721,7.772359,8.327273,6.329383,8.550471,6.613332,6.308945,...,12.825383,12.439265,14.328373,14.008693,4.931460,4.712895,3.913637,3.700964,4.764693,4.834952
128,962,pilocytic_astrocytoma,12.684593,8.293938,7.228186,8.494428,6.049414,8.214729,7.287758,5.732710,...,13.116581,12.657967,14.390346,14.194904,4.871092,4.739400,3.782980,3.920363,4.665584,4.613326


In [4]:
# Column 0 ("samples") holds the sample ids; column 1 ("type") holds the
# brain-cancer class label of each sample.
print(df['type'].value_counts())

# Encode the class labels as integer codes. An explicit label -> code mapping
# keeps each pair together (two parallel lists are easy to misalign).
type_codes = {
    'ependymoma': 0,
    'glioblastoma': 1,
    'medulloblastoma': 2,
    'pilocytic_astrocytoma': 3,
    'normal': 4,
}
# Values that are already codes (e.g. when this cell is re-run) pass through
# unchanged, so the cell stays safe to execute more than once.
encoded_type = df['type'].map(lambda label: type_codes.get(label, label))

# Fail loudly on anything that is neither a known label nor a known code,
# instead of letting a stray string survive into the model's target column.
unknown_labels = sorted(set(encoded_type.unique()) - set(type_codes.values()), key=str)
if unknown_labels:
    raise ValueError(f"Unexpected values in 'type' column: {unknown_labels}")

# Cast explicitly rather than relying on replace() silently downcasting an
# object column to integers (deprecated in recent pandas releases).
df['type'] = encoded_type.astype('int64')

ependymoma               46
glioblastoma             34
medulloblastoma          22
pilocytic_astrocytoma    15
normal                   13
Name: type, dtype: int64


In [5]:
# Split by position: column 1 is the class label, columns 2 onward are the features
# (column 0, the sample id, is left out of both).
y, X = df.iloc[:, 1], df.iloc[:, 2:]

In [6]:
# Fit a cross-validated Lasso on all feature columns. fit() returns the
# estimator itself, so construction and fitting are chained.
# NOTE(review): y is an integer class code that is regressed on as a numeric
# target here -- confirm this is the intended use.
reg = LassoCV().fit(X, y)
print("lasso done")

lasso done


In [7]:
# One fitted coefficient per feature, labelled with the feature's column name.
# Features with a non-zero coefficient are kept; zero-coefficient ones are dropped.
coef = pd.Series(data=reg.coef_, index=X.columns)
is_kept = coef.ne(0)
print("Number of kept features:", is_kept.sum())
print("Number of eliminated features:", (~is_kept).sum())


Number of kept features: 76
Number of eliminated features: 54599


In [8]:
# Keep only the feature columns whose Lasso coefficient is non-zero.
feature_columns = df.iloc[:, 2:]
x_lasso = feature_columns.loc[:, coef != 0]

# Build the reduced frame with the class label as its first column.
df_lasso = x_lasso.copy()
df_lasso.insert(loc=0, column="type", value=y)

In [9]:
# Run SMOTE to oversample the minority classes.
# The first column of df_lasso is the class label; the remaining columns are
# the Lasso-selected features.
# NOTE(review): oversampling is applied to the whole data set here; if the
# written file is later split into train/test sets, synthetic rows may leak
# across the split -- confirm downstream usage.
SMOTE_SEED = 42  # fixed seed so the synthetic samples are reproducible across runs
x_smote = df_lasso.iloc[:,1:]
y_smote = df_lasso.iloc[:,0]
oversample = SMOTE(random_state=SMOTE_SEED)
x_smote, y_smote = oversample.fit_resample(x_smote, y_smote)

In [10]:
# Write out the DataFrame: class label first, then the resampled features.
df_smote = x_smote.copy()
df_smote.insert(loc=0, column="type", value=y_smote)
df_smote.to_csv(path_or_buf="df_lasso.csv", index=False)

In [11]:
# Display the oversampled DataFrame that was written to df_lasso.csv.
df_smote

Unnamed: 0,type,1552365_at,1553613_s_at,1554997_a_at,1555778_a_at,1557395_at,1558009_at,1560263_at,1569110_x_at,1569191_at,...,230865_at,231192_at,231930_at,232099_at,232164_s_at,237939_at,239591_at,240317_at,242344_at,32502_at
0,0,7.027073,6.095727,4.934580,5.574216,6.616401,4.636203,6.979441,10.165542,4.576461,...,10.112137,4.935503,7.593587,5.532449,8.502690,4.204577,3.387150,6.837423,5.202238,6.788211
1,0,5.889348,5.954544,5.117363,5.213474,5.268000,6.957785,6.701198,6.826481,4.242329,...,6.903574,10.912595,7.466224,4.418672,5.213158,4.862734,3.398312,6.969566,4.315842,9.612831
2,0,8.135750,6.747295,9.510451,5.691222,7.572080,9.039582,6.122982,10.966592,5.721972,...,9.993307,7.559065,7.654780,6.860873,6.093020,4.227184,4.741416,6.733526,5.095745,8.210775
3,0,7.230814,6.399526,4.571579,4.739206,8.738774,4.759947,5.477097,10.522992,7.146460,...,7.378329,3.964027,9.059476,5.838273,11.229693,9.017996,3.371509,6.646348,7.077832,8.208695
4,0,6.295379,7.000134,5.478176,5.127104,8.812909,5.536381,5.827314,11.049280,5.568936,...,7.805245,4.160490,5.383418,6.232034,6.794552,4.236953,3.316260,6.651637,5.385728,7.713718
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,4,5.212287,6.854842,8.493602,5.756567,3.918706,10.296596,6.202231,4.959866,3.911274,...,8.249442,4.220634,12.095948,6.062197,5.362149,9.232988,4.224714,8.649788,12.582284,9.147635
226,4,4.784385,6.131198,4.679684,5.294106,3.599941,8.084886,7.245531,5.986035,4.416267,...,8.095490,4.427455,11.237634,6.578784,7.371972,6.288613,6.675697,8.768033,12.239048,10.344947
227,4,5.194054,6.947100,8.712198,5.728755,3.916517,9.518869,6.345069,4.963255,4.038738,...,8.468976,4.323393,12.064123,6.129072,5.365718,9.394807,4.690194,8.581144,12.622493,9.124505
228,4,6.155152,6.916116,4.975128,5.483650,3.840623,8.259994,7.515524,5.835737,4.844067,...,9.209904,4.297876,11.303939,5.742675,5.246025,5.834841,5.275173,8.181706,11.913431,10.691907
