Resources: 
- https://www.kaggle.com/ferneutron/feature-extraction-with-different-methods
- https://towardsdatascience.com/why-how-and-when-to-apply-feature-selection-e9c69adfabf2

## Setting up

In [122]:
import pandas as pd # To handle the data set.
import seaborn as sb # To display visualizations.
import matplotlib.pyplot as plt # To plot
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier # Random Forest
from sklearn.metrics import confusion_matrix # To calculate the confusion matrix
from sklearn.metrics import accuracy_score # To calculate the score
from sklearn.feature_selection import SelectKBest # Univariate Feature Selection
from sklearn.feature_selection import chi2 # To apply Univariate Feature Selection
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFE # Recursive Feature Selection
from sklearn.feature_selection import RFECV # Recursive Feature Selection with Cross Validation
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA # To apply PCA
from sklearn import preprocessing # To get MinMax Scaler function

# To plot inline
%matplotlib inline

# Evaluation
from sklearn.metrics import accuracy_score
from yellowbrick.classifier import ClassificationReport
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# enable multiple outputs per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# hide warnings
import warnings
#warnings.filterwarnings('ignore')

In [123]:
df = pd.read_csv('./datasets/feature_extracted.csv'
                 , sep=',', encoding='utf-8')
df.dropna();

In [124]:
# change label into binary classification
#df_raw = df.copy() # copy of df before manipulation just in case
df = df.drop('statement', axis=1)
df.label.loc[(df['label'] >= 0.5)] = 1;
df.label.loc[(df['label'] < 0.5)] = 0;
df_target = df['label']
df_feature = df.drop('label', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [125]:
df_feature.head(1)

Unnamed: 0,num_-,num_?,num_!,num_%,num_;,num_:,"num_""",num_(,num_$,"num_,",...,count_home,count_money,count_relig,count_death,count_informal,count_swear,count_netspeak,count_assent,count_nonflu,count_filler
0,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


In [126]:
# remove low variance data
def variance_threshold(df, threshold=0.0):
    selector = VarianceThreshold(threshold)
    selector.fit_transform(df)
    return df[df.columns[selector.get_support(indices=True)]]

# any column with a probability of having 0 variance above 0.8 will be eliminated
df_feature = variance_threshold(df_feature, (.8 * (1 - .8)))

In [127]:
# Initializing the MinMaxScaler function
min_max_scaler = preprocessing.MinMaxScaler()
#standard_scaler = preprocessing.StandardScaler()

# Scaling dataset keeping the columns name
df_feature_scaled = pd.DataFrame(min_max_scaler.fit_transform(df_feature), columns = df_feature.columns)
#X_scaled = pd.DataFrame(standard_scaler.fit_transform(X), columns = X.columns)

In [128]:
# Splitting  up data, seting 80% for train and 20% for test.
x_train, x_test, y_train, y_test = train_test_split(
    df_feature_scaled, df_target, test_size=0.2, random_state=42)

## Univariate selection

### Chi2

In [130]:
# Initialize SelectKBest function
selector_chi2 = SelectKBest(chi2,k=5).fit(x_train, y_train)

In [137]:
# Creating a dict to visualize which features were selected with the highest score
feature_dict_chi2 = {key:value for (key, value) in zip(
    selector_chi2.scores_, x_train.columns)}
sorted(feature_dict_chi2.items())

[(1.2561427591519733e-06, 'count_sad'),
 (1.0586202379453372e-05, 'count_adj'),
 (3.094525689537155e-05, 'count_cause'),
 (0.00012039136330320091, 'count_affect'),
 (0.00019276562762737028, 'count_article'),
 (0.00023214441170993618, 'count_reward'),
 (0.00042209711873857905, 'count_adverb'),
 (0.0004275441351266885, 'count_social'),
 (0.0006069260109999961, 'count_space'),
 (0.0007593804549486459, 'count_male'),
 (0.000870663332631151, 'count_achieve'),
 (0.001067686579636149, 'count_female'),
 (0.0011658144806705743, 'count_auxverb'),
 (0.001877910075846298, 'count_sexual'),
 (0.0023784128469482076, 'count_health'),
 (0.002976607834950838, 'count_netspeak'),
 (0.004998076678569283, 'count_anger'),
 (0.005960674692300825, 'count_assent'),
 (0.006632621203852159, 'num_"'),
 (0.011057737597873514, 'count_focuspast'),
 (0.011955816387660047, 'count_family'),
 (0.020469267388504343, 'count_death'),
 (0.023961462101278627, 'count_negate'),
 (0.02675259732160064, 'count_relativ'),
 (0.03643

In [132]:
x_train_chi2 = selector_chi2.transform(x_train)
x_test_chi2 = selector_chi2.transform(x_test)

In [133]:
x_train_chi2.shape
x_test_chi2.shape

(10192, 5)

(2549, 5)

## Anova

In [135]:
# Initialize SelectKBest function
selector_anova = SelectKBest(f_classif, k=5).fit(x_train, y_train)

In [138]:
feature_dict_f = {key:value for (key, value) in zip(
    selector_anova.scores_, x_train.columns)}
sorted(feature_dict_f.items())

[(1.9273336249599213e-05, 'count_sad'),
 (0.0001624677108456478, 'count_adj'),
 (0.0004791378201838556, 'count_cause'),
 (0.0016859824547826785, 'count_affect'),
 (0.0026973667698854797, 'count_article'),
 (0.0032463351023782248, 'count_reward'),
 (0.00592403196874872, 'count_social'),
 (0.006481893010642654, 'count_adverb'),
 (0.0084901870301539, 'count_space'),
 (0.010629461448599921, 'count_male'),
 (0.012188371638993374, 'count_achieve'),
 (0.014935982670329594, 'count_female'),
 (0.016324477069274878, 'count_auxverb'),
 (0.023598578134297336, 'num_"'),
 (0.026089187918419517, 'count_sexual'),
 (0.0349192809935412, 'count_health'),
 (0.04159502993888372, 'count_netspeak'),
 (0.0825721451646238, 'count_anger'),
 (0.0868048873512304, 'count_assent'),
 (0.16493204236397943, 'count_family'),
 (0.1714377637619494, 'count_focuspast'),
 (0.24832751486118695, 'count_pronoun'),
 (0.2612080973489306, 'count_discrep'),
 (0.27627567267906356, 'count_nonflu'),
 (0.3117855063686625, 'count_death

In [139]:
x_train_anova = selector_anova.transform(x_train)
x_test_anova = selector_anova.transform(x_test)

## Mutual information

In [None]:
sel_mutual = SelectKBest(mutual_info_classif, k=5)
X_train_mutual = sel_mutual.fit_transform(x_train, y_train)

In [None]:
x_train_k_best_m = sel_f.transform(x_train)
x_test_k_best_m = sel_f.transform(x_test)

In [None]:
RandForest_K_best_m = RandomForestClassifier()      
RandForest_K_best_m = RandForest_K_best.fit(x_train_k_best_m, y_train)
y_pred = RandForest_K_best.predict(x_test_k_best_m)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: ',accuracy)

## Recursive Feature Elimination

In [None]:
# Initializing Random Forest Classifier
RandForest_RFE = RandomForestClassifier() 
# Initializing the RFE object, one of the most important arguments is the estimator, in this case is RandomForest
rfe = RFE(estimator=RandForest_RFE, n_features_to_select=5, step=1)
# Fit the origial dataset
rfe = rfe.fit(x_train, y_train)

In [None]:
print("Best features chosen by RFE: \n")
for i in x_train.columns[rfe.support_]:
    print(i)

In [None]:
x_train_RFE = rfe.transform(x_train)
x_test_RFE = rfe.transform(x_test)
RandForest_RFE = RandForest_RFE.fit(x_train_RFE, y_train)
y_pred = RandForest_RFE.predict(x_test_RFE)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: ',accuracy)

In [None]:
confMatrix = confusion_matrix(y_test, y_pred)
sb.heatmap(confMatrix, annot=True, fmt="d")

## Recursive Feature Elimination with Cross-Validation

In [None]:
# Initialize the Random Forest Classifier
RandForest_RFECV = RandomForestClassifier() 
# Initialize the RFECV function setting 3-fold cross validation
rfecv = RFECV(estimator=RandForest_RFECV, step=1, cv=3, scoring='accuracy')
# Fit data
rfecv = rfecv.fit(x_train, y_train)

print('Best number of features :', rfecv.n_features_)
print('Features :\n')
for i in x_train.columns[rfecv.support_]:
    print(i)

In [None]:
plt.figure()
plt.xlabel("Number of Features")
plt.ylabel("Score of Selected Features")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

## Tree based Feature Selection

In [None]:
# Initialize the Random Forest Classifier
RandForest_Tree = RandomForestClassifier()  
# Fit the random forest with the original data
RandForest_Tree = RandForest_Tree.fit(x_train, y_train)
# Getting the relevance between features
relevants = RandForest_Tree.feature_importances_

In [None]:
# Apply the tree based on importance for the random forest classifier and indexing it
std = np.std([tree.feature_importances_ for tree in RandForest_Tree.estimators_], axis=0)
indices = np.argsort(relevants)[::-1]

In [None]:
# Printting the ranking of importance
print("Feature Rank:")

for i in range(x_train.shape[1]):
    print("%d. Feature %d (%f)" 
          % (i + 1, indices[i], relevants[indices[i]]))

In [None]:
# Plotting the feature importances
plt.figure(1, figsize=(15, 5))
plt.title("Feature Importances")
plt.bar(range(x_train.shape[1]), relevants[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(x_train.shape[1]), x_train.columns[indices],rotation=90)
plt.xlim([-1, x_train.shape[1]])
plt.show();

## Feature Extraction through PCA

In [None]:
# Initializing PCA and fitting
pca = PCA()
pca.fit(x_train)

In [None]:
# Plotting to visualize the best number of elements
plt.figure(1, figsize=(9, 8))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_ratio_, linewidth=2)
plt.axis('tight')
plt.xlabel('Number of Feautres')
plt.ylabel('Variance Ratio')
plt.show();

## Before and after feature selection, accuracy comparison

### Random Forest

In [146]:
RandForest = RandomForestClassifier()      
RandForest = RandForest.fit(x_train, y_train)
y_pred_RF = RandForest.predict(x_test)
accuracy = accuracy_score(y_test, y_pred_RF)
print('RF with all features Accuracy: ',accuracy)

RF with all features Accuracy:  0.5202040015692428




In [147]:
RandForest_chi2 = RandomForestClassifier()      
RandForest_chi2 = RandForest_chi2.fit(x_train_chi2, y_train)
y_pred_chi2 = RandForest_K_best.predict(x_test_chi2)
accuracy = accuracy_score(y_test, y_pred_chi2)
print('RF with Chi2 Accuracy: ',accuracy)

RF with Chi2 Accuracy:  0.5221655551196548




In [148]:
RandForest_anova = RandomForestClassifier()      
RandForest_anova = RandForest_K_best.fit(x_train_anova, y_train)
y_pred_anova = RandForest_K_best.predict(x_test_anova)
accuracy = accuracy_score(y_test, y_pred_anova)
print('RF with Anova Accuracy: ',accuracy)

RF with Anova Accuracy:  0.5476657512750098


In [149]:
model_logistic = LogisticRegression(solver='saga', multi_class='multinomial', max_iter=10000)
model_logistic.fit(x_train, y_train)
predict = model_logistic.predict(x_test)
print("confusion matrix", confusion_matrix(y_test, predict))
print("accuracy score:", accuracy_score(y_test, predict))
print("f1 score:", f1_score(y_test, predict))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

confusion matrix [[ 224  903]
 [ 167 1255]]
accuracy score: 0.5802275402118477
f1 score: 0.7011173184357543
