In [1]:
"""
Filter methods are generally used as a preprocessing step. The selection of
attributes/features is independent of any machine learning algorithms. Instead, 
attributes are selected on the basis of their scores in various statistical 
tests for their correlation with the outcome variable. 

Wrapper/Embedded methods for attribute selection were considered but could 
not be utilized due to runtime. These methods create and test multiple models,
and the work grows as the number of attributes increases, so they would only
be viable for datasets of around 20 attributes. 
"""

import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.feature_selection import SelectPercentile, f_regression

# Load the labeled dataset; every column except 'class' is a feature.
df = pd.read_csv("Dataset_Github_Labeled.csv")
x = df.drop(['class'], axis=1)

# Collapse the detailed labels in the 'class' column into one of three
# classes: High-grade, Low-grade, Normal. Any label that starts with one of
# these prefixes is replaced by the prefix itself; other labels are left as-is.
# The assignment is vectorized with .loc so it covers every row regardless of
# the dataset size and writes to df directly (no chained indexing).
for label in ('High-grade', 'Low-grade', 'Normal'):
    # na=False: missing labels simply do not match instead of breaking the mask.
    df.loc[df['class'].str.startswith(label, na=False), 'class'] = label

# Encode target variable (y)
from sklearn.preprocessing import LabelEncoder
lbl_encoder = LabelEncoder()
# LabelEncoder assigns integer codes in sorted label order, i.e. 0, 1, 2 when
# only the three classes above are present.
y = lbl_encoder.fit_transform(df['class'])

In [2]:
# Select only the top `feature_percentile` percent of features, ranked by
# their univariate F-statistic against the encoded target variable.
# NOTE(review): f_regression treats the label-encoded classes as a continuous
# target; f_classif is the usual scorer for a categorical target. Confirm which
# is intended before switching, since it would change the selected features.
feature_percentile = 1
selector = SelectPercentile(f_regression, percentile=feature_percentile).fit(x, y)
# get_support(True) returns integer column positions (relative to x) instead
# of a boolean mask.
indices = selector.get_support(True)
print("Column positions (in x) of the features in the top", feature_percentile, "percent:")
print(indices)
# Keep the fitted selector under its own name; top_x_percent_features only
# ever holds the reduced data (rows x selected features).
top_x_percent_features = selector.transform(x)
print("Values of the selected features (rows x selected features):")
print(top_x_percent_features)
print("top_x_percent_features.shape: ", top_x_percent_features.shape)

These are the indices of the words in the top percentage for correlation of IC: 1x300
[43 44 45 46 48 49 50 51 52 53 54 55 56 57]
This is the data accociated with each word: 148x300
[[ 0.00923459 -0.00503947 -0.01090122 ...  0.03525511  0.03882811
   0.0410134 ]
 [ 0.05830147  0.05247059  0.04463594 ...  0.04197936  0.04066504
   0.03818964]
 [ 0.03734988  0.03838834  0.03837622 ...  0.02968341  0.03284243
   0.03635363]
 ...
 [-0.17358487 -0.18647616 -0.19930839 ... -0.07645029 -0.08237044
  -0.08957736]
 [-0.13591013 -0.17940675 -0.21859271 ... -0.15767263 -0.13891516
  -0.10323525]
 [-0.08044461 -0.08512978 -0.08807086 ...  0.01298033  0.01124201
   0.01155755]]
top_x_percent_features.shape:  (324, 14)


In [3]:
# Names of the feature columns kept by the percentile filter. After manual
# filtering, these features will be input into the KNIME prediction algorithms.
# `indices` holds integer column positions relative to x (the feature-only
# frame the selector was fitted on), so the names are looked up in x.columns;
# looking them up in df.columns is only correct while 'class' is the last column.
useful_feature_columns = list(x.columns[indices])

In [4]:
# Gather the selected feature columns and the 'class' label into a single
# dataframe, which is later exported as a csv file.
# `indices` holds column positions relative to x (the feature-only frame), so
# all selected columns are taken from x in one positional selection; .copy()
# keeps the new frame independent of x.
useful_features_df = x.iloc[:, indices].copy()
# Fetch the label column by name rather than by a hard-coded column position.
useful_features_df['class'] = df['class']
# Sanity check: one column per selected feature plus the label, all rows kept.
assert useful_features_df.shape == (len(df), len(indices) + 1)
useful_features_df.shape

(324, 15)

In [5]:
# Display the table that will be exported: the selected feature columns
# followed by the 'class' label.
useful_features_df

Unnamed: 0,44,45,46,47,49,50,51,52,53,54,55,56,57,58,class
0,0.009235,-0.005039,-0.010901,-0.011113,-0.012657,-0.008263,-0.001059,0.006600,0.014492,0.022633,0.029907,0.035255,0.038828,0.041013,High-grade
1,0.058301,0.052471,0.044636,0.038812,0.031327,0.029738,0.029778,0.031626,0.034681,0.038481,0.041422,0.041979,0.040665,0.038190,High-grade
2,0.037350,0.038388,0.038376,0.037082,0.031165,0.027429,0.024327,0.022960,0.023200,0.024617,0.026905,0.029683,0.032842,0.036354,High-grade
3,0.042067,0.042728,0.043539,0.044145,0.043726,0.042965,0.042016,0.041044,0.040001,0.038802,0.037506,0.036095,0.034710,0.033477,High-grade
4,0.038427,0.030893,0.028235,0.030835,0.047326,0.059380,0.069034,0.071380,0.068403,0.063301,0.058752,0.057699,0.059073,0.060811,High-grade
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319,-0.029223,-0.047347,-0.060314,-0.063441,-0.033074,-0.001810,0.030006,0.049800,0.060637,0.069310,0.075353,0.078785,0.080648,0.080989,Normal
320,-0.070879,-0.051121,-0.024590,0.001054,0.026383,0.026772,0.023327,0.022308,0.022672,0.021214,0.020468,0.023199,0.028329,0.034440,Normal
321,-0.173585,-0.186476,-0.199308,-0.202696,-0.157468,-0.123757,-0.093601,-0.075357,-0.068062,-0.067869,-0.071674,-0.076450,-0.082370,-0.089577,Normal
322,-0.135910,-0.179407,-0.218593,-0.221635,-0.051289,0.030249,0.068650,0.045785,-0.017418,-0.088737,-0.143319,-0.157673,-0.138915,-0.103235,Normal


In [6]:
# Export the selected features (plus the 'class' label) as a csv file.
# The original destination is a machine-specific absolute folder. It is still
# used whenever that folder exists; otherwise the file is written to the
# current working directory so the notebook also runs on other machines.
from pathlib import Path

output_dir = Path(r'C:\Users\R-k-l\AppData\Local\Programs\Python\Python37\Scripts\Capstone')
if not output_dir.is_dir():
    output_dir = Path.cwd()
output_path = output_dir / 'useful_features_1.csv'
useful_features_df.to_csv(output_path, index=False)
# Report the destination, since it depends on the machine.
print("Saved selected features to:", output_path)