# Feature selection and Dimensionality reduction

The goal of this investigation is to obtain the best predictions and to select the smallest possible
subset of relevant input variables (features).

The dataset is already divided into train and validation sets (both in 2 forms - unbalanced and balanced). It was done for both **teenagers** and **amItheAsshole** subreddits.

On train and validation sets, there will be used the same dimensionality reduction technique at the same moment (to make the rsults fair) -- the **accuracy** metric value will be calculated on validation data set.

In [1]:
# importing packages

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import sklearn
import pandas as pd, numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# pca
from sklearn.decomposition import PCA, KernelPCA

In [3]:
# importing train and validation AITA datasets:

X_train = pd.read_csv("data/aita_train.csv", index_col=0, dtype = {'flair': 'string', 'title': 'string', 'text': 'string'})
X_trainb = pd.read_csv("data/aita_train_balanced.csv", index_col=0, dtype = {'flair': 'string', 'title': 'string', 'text': 'string'})
X_test = pd.read_csv("data/aita_test.csv", index_col=0, dtype = {'flair': 'string', 'title': 'string', 'text': 'string'})
X_testb = pd.read_csv("data/aita_test_balanced.csv", index_col=0, dtype = {'flair': 'string', 'title': 'string', 'text': 'string'})


In [5]:
# chosing columns

X_train = X_train[["flair", "title", "text", "title_wc", "selftext_wc", "title_cc", "selftext_cc"]]
X_trainb = X_trainb[["flair", "title", "text", "title_wc", "selftext_wc", "title_cc", "selftext_cc"]]

X_test = X_test[["flair", "title", "text", "title_wc", "selftext_wc", "title_cc", "selftext_cc"]]
X_testb = X_testb[["flair", "title", "text", "title_wc", "selftext_wc", "title_cc", "selftext_cc"]]




In [6]:
vectorizer = TfidfVectorizer(stop_words = 'english', max_features=500)


In [7]:
def replaceThisInsaneNa(x):
    if pd.isna(x):
        return " "
    else:
        return x

In [7]:
X_test.isna().sum()

flair           0
title           0
text           15
title_wc        0
selftext_wc     0
title_cc        0
selftext_cc     0
dtype: int64

In [8]:

X_train.text = X_train.text.apply(replaceThisInsaneNa)
X_trainb.text = X_trainb.text.apply(replaceThisInsaneNa)

X_test.text = X_test.text.apply(replaceThisInsaneNa)
X_testb.text = X_testb.text.apply(replaceThisInsaneNa)

In [9]:

def datasetToTfIdfPredictorsAndLabels(df, vectorizer):
    
    print("STEP 1")
    
    text_bgw = pd.DataFrame(vectorizer.fit_transform(df.text).toarray(), dtype = 'float32').reset_index(drop=True)
    text_bgw.columns = vectorizer.get_feature_names()
    new_columns = []

    for column in text_bgw.columns:
        new_columns.append("text_" + column)

    text_bgw.columns = new_columns   
    
    print("STEP 2")
    
    title_bgw = pd.DataFrame(vectorizer.fit_transform(df.title).toarray(), dtype = 'float32').reset_index(drop=True)
    title_bgw.columns = vectorizer.get_feature_names()
    
    new_columns = []

    for column in title_bgw.columns:
        new_columns.append("title_" + column)

    title_bgw.columns = new_columns
    
    print("STEP 3 - concat")
    
    X = pd.concat([df[["title_wc", "selftext_wc", "title_cc", "selftext_cc"]].reset_index(drop=True), text_bgw, title_bgw], axis=1, join="inner")
    
    return X, df[["flair"]]



In [10]:
X_train, y_train = datasetToTfIdfPredictorsAndLabels(X_train, vectorizer)

STEP 1
STEP 2
STEP 3 - concat


In [11]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337074 entries, 0 to 337073
Columns: 1004 entries, title_wc to title_younger
dtypes: float32(1000), int64(4)
memory usage: 1.3 GB


In [12]:
X_test, y_test = datasetToTfIdfPredictorsAndLabels(X_test, vectorizer)

STEP 1
STEP 2
STEP 3 - concat


In [13]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37275 entries, 0 to 37274
Columns: 1004 entries, title_wc to title_younger
dtypes: float32(1000), int64(4)
memory usage: 143.3 MB


In [14]:
X_trainb, y_trainb = datasetToTfIdfPredictorsAndLabels(X_trainb, vectorizer)

STEP 1
STEP 2
STEP 3 - concat


In [15]:
X_trainb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Columns: 1004 entries, title_wc to title_yta
dtypes: float32(1000), int64(4)
memory usage: 61.5 MB


In [16]:
X_testb, y_testb = datasetToTfIdfPredictorsAndLabels(X_testb, vectorizer)

STEP 1
STEP 2
STEP 3 - concat


In [17]:
X_testb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Columns: 1004 entries, title_wc to title_yta
dtypes: float32(1000), int64(4)
memory usage: 6.2 MB


In [18]:
from collections import OrderedDict
from functools import partial
from time import time

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter

from sklearn import manifold

# Feature selection using chi squared




In [44]:
from sklearn.preprocessing import LabelEncoder

lb_make = LabelEncoder()

In [37]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [45]:
X_trainb = SelectKBest(chi2, k=500).fit_transform(X_trainb, lb_make.fit_transform(y_trainb))

X_testb = SelectKBest(chi2, k=500).fit_transform(X_testb, lb_make.fit_transform(y_testb))


In [43]:
X_trainb.shape

(16000, 1004)

# Fitting simple methods - for initial insight plots - do not use

In [38]:
# do not use

# Next line to silence pyflakes. This import is needed.
Axes3D

X, color = X_trainb, lb_make.fit_transform(y_trainb)
n_neighbors = 10
n_components = 2

# Set-up manifold methods
LLE = partial(
    manifold.LocallyLinearEmbedding,
    n_neighbors=n_neighbors,
    n_components=n_components,
    eigen_solver="auto",
)



methods = OrderedDict()
#methods["LLE"] = LLE(method="standard")
#methods["LTSA"] = LLE(method="ltsa")
#methods["Hessian LLE"] = LLE(method="hessian")
#methods["Modified LLE"] = LLE(method="modified")
#methods["Isomap"] = manifold.Isomap(n_neighbors=n_neighbors, n_components=n_components)
#methods["MDS"] = manifold.MDS(n_components, max_iter=100, n_init=1)
methods["SE"] = manifold.SpectralEmbedding(
    n_components=n_components, n_neighbors=n_neighbors
)
methods["t-SNE"] = manifold.TSNE(n_components=n_components, init="pca", random_state=44)



  return f(*args, **kwargs)


# Machine learning models for dimensionality techniques reduction comparison - linear SVM, Random Forest and Naive Bayes





In [19]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [27]:
model = LinearSVC()
model2 = RandomForestClassifier()
model3 = GaussianNB()

model.fit(X_trainb, y_trainb)

LinearSVC()

In [30]:
#pred = model.predict(X_pca_test)

model.score(X_testb, y_testb)

0.13875

# Searching the best dimensionalty reduction method - 1 up to 50 components for each method ( nearest neighbors = 2, 10 or 20)

In [49]:
df = pd.DataFrame(columns=['method', 'model', 'n_components', 'n_neighbors', 'accuracy', 'train', 'test'])

In [35]:
import warnings
warnings.filterwarnings("ignore")

# PCA

In [55]:
#neighbors = [3,10,30]

neighbors = [5]

method_name = "PCA"

for i in range(3,100, 10):
    
    if(i % 10 ) == 3:
        print("n_components: ", i)
    
    for n in neighbors:
        
        method = PCA(n_components=i)
        
        t0 = time()
        print("fitting DR method - train")
        X_pca = method.fit(X_train).transform(X_train)
        X_pcab = method.fit(X_trainb).transform(X_trainb)
        t1 = time()
        print("%s: %.2g sec" % ('Time: ', t1 - t0))
        
        print("fitting DR method - test")
        t0 = time()
        X_pca_test = method.fit(X_test).transform(X_test)
        X_pca_testb = method.fit(X_testb).transform(X_testb)
        t1 = time()
        print("%s: %.2g sec" % ('Time: ', t1 - t0))
        
        print("fitting SVM - balanced")
        t0 = time()
        # trained on balanced dataset
        model.fit(X_pcab, y_trainb)
        acc = model.score(X_pca_testb, y_testb)
        df = df.append({'method': method_name, 'model':'SVC', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'balanced', 'test': 'balanced'}, ignore_index=True)
        acc = model.score(X_pca_test, y_test)
        df = df.append({'method': method_name, 'model':'SVC', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'balanced', 'test': 'unbalanced'}, ignore_index=True)
        t1 = time()
        print("%s: %.2g sec" % ('Time: ', t1 - t0))
        
        print("fitting RF - balanced")
        t0 = time()
        model2.fit(X_pcab, y_trainb)
        acc = model2.score(X_pca_testb, y_testb)
        df = df.append({'method': method_name,'model':'RF', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'balanced', 'test': 'balanced'}, ignore_index=True)
        acc = model.score(X_pca_test, y_test)
        df = df.append({'method': method_name, 'model':'RF', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'balanced', 'test': 'unbalanced'}, ignore_index=True)
        t1 = time()
        print("%s: %.2g sec" % ('Time: ', t1 - t0))
        
        print("fitting NB - balanced")
        t0 = time()
        model3.fit(X_pcab, y_trainb)
        acc = model3.score(X_pca_testb, y_testb)
        df = df.append({'method': method_name, 'model':'NB', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'balanced', 'test': 'balanced'}, ignore_index=True)
        acc = model.score(X_pca_test, y_test)
        df = df.append({'method': method_name, 'model':'NB', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'balanced', 'test': 'unbalanced'}, ignore_index=True)
        t1 = time()
        print("%s: %.2g sec" % ('Time: ', t1 - t0))
        # trained on unbalanced dataset
        
        print("fitting SVM - unbalanced")
        t0 = time()
        model.fit(X_pca, y_train)
        acc = model.score(X_pca_testb, y_testb)
        df = df.append({'method': method_name, 'model':'SVC', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'unbalanced', 'test': 'balanced'}, ignore_index=True)
        acc = model.score(X_pca_test, y_test)
        df = df.append({'method': method_name, 'model':'SVC', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'unbalanced', 'test': 'unbalanced'}, ignore_index=True)
        t1 = time()
        print("%s: %.2g sec" % ('Time: ', t1 - t0))
        
        print("fitting RF - unbalanced")
        t0 = time()
        model2.fit(X_pca, y_train)
        acc = model2.score(X_pca_testb, y_testb)
        df = df.append({'method': method_name,'model':'RF', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'unbalanced', 'test': 'balanced'}, ignore_index=True)
        acc = model.score(X_pca_test, y_test)
        df = df.append({'method': method_name, 'model':'RF', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'unbalanced', 'test': 'unbalanced'}, ignore_index=True)
        t1 = time()
        print("%s: %.2g sec" % ('Time: ', t1 - t0))
        
        print("fitting NB - unbalanced")
        t0 = time()
        model3.fit(X_pca, y_train)
        acc = model3.score(X_pca_testb, y_testb)
        df = df.append({'method': method_name, 'model':'NB', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'unbalanced', 'test': 'balanced'}, ignore_index=True)
        acc = model.score(X_pca_test, y_test)
        df = df.append({'method': method_name, 'model':'NB', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'unbalanced', 'test': 'unbalanced'}, ignore_index=True)
        t1 = time()
        print("%s: %.2g sec" % ('Time: ', t1 - t0))
        
        
        

n_components:  3
fitting DR method - train
Time: : 46 sec
fitting DR method - test
Time: : 2.6 sec
fitting SVM - balanced
Time: : 7.1 sec
fitting RF - balanced
Time: : 3.4 sec
fitting NB - balanced
Time: : 0.082 sec
fitting SVM - unbalanced
Time: : 6.4e+02 sec
fitting RF - unbalanced
Time: : 1.2e+02 sec
fitting NB - unbalanced
Time: : 0.69 sec
n_components:  13
fitting DR method - train
Time: : 52 sec
fitting DR method - test
Time: : 3 sec
fitting SVM - balanced
Time: : 17 sec
fitting RF - balanced
Time: : 6.9 sec
fitting NB - balanced
Time: : 0.087 sec
fitting SVM - unbalanced
Time: : 1.1e+03 sec
fitting RF - unbalanced
Time: : 3e+02 sec
fitting NB - unbalanced
Time: : 1.3 sec
n_components:  23
fitting DR method - train
Time: : 78 sec
fitting DR method - test
Time: : 5.7 sec
fitting SVM - balanced
Time: : 29 sec
fitting RF - balanced
Time: : 9.4 sec
fitting NB - balanced
Time: : 0.11 sec
fitting SVM - unbalanced
Time: : 1.4e+03 sec
fitting RF - unbalanced
Time: : 4.1e+02 sec
fitting N

KeyboardInterrupt: 

In [64]:
df.sort_values("accuracy", ascending = False).head(20)

Unnamed: 0,method,model,n_components,n_neighbors,accuracy,train,test
49,PCA,SVC,33,5,0.544145,unbalanced,unbalanced
51,PCA,RF,33,5,0.544145,unbalanced,unbalanced
53,PCA,NB,33,5,0.544145,unbalanced,unbalanced
37,PCA,SVC,23,5,0.500791,unbalanced,unbalanced
41,PCA,NB,23,5,0.500791,unbalanced,unbalanced
39,PCA,RF,23,5,0.500791,unbalanced,unbalanced
29,PCA,NB,13,5,0.438524,unbalanced,unbalanced
27,PCA,RF,13,5,0.438524,unbalanced,unbalanced
25,PCA,SVC,13,5,0.438524,unbalanced,unbalanced
13,PCA,SVC,3,5,0.414809,unbalanced,unbalanced


In [82]:
from sklearn.metrics import classification_report

from sklearn.metrics import precision_recall_fscore_support

In [77]:
y_pred = model2.predict(X_pca_test)

print(classification_report(y_test, y_pred))

repp = classification_report(y_test, y_pred, output_dict = True)

                 precision    recall  f1-score   support

        asshole       0.22      0.14      0.17      7212
 everyone sucks       0.06      0.18      0.10      2186
           meta       0.00      0.03      0.01       327
no a-holes here       0.11      0.16      0.13      3789
not enough info       0.03      0.15      0.05      1122
 not the a-hole       0.63      0.22      0.33     21833
          tl;dr       0.01      0.02      0.01       319
         update       0.08      0.74      0.15       487

       accuracy                           0.20     37275
      macro avg       0.14      0.20      0.12     37275
   weighted avg       0.43      0.20      0.25     37275



In [70]:
np.unique(y_test, return_counts=True)

(array(['asshole', 'everyone sucks', 'meta', 'no a-holes here',
        'not enough info', 'not the a-hole', 'tl;dr', 'update'],
       dtype=object),
 array([ 7212,  2186,   327,  3789,  1122, 21833,   319,   487],
       dtype=int64))

In [86]:
#model2.fit(X_pcab, y_trainb)

y_pred = model2.predict(X_pca_testb)

print(precision_recall_fscore_support(y_testb, y_pred, average = 'weighted'))

print(precision_recall_fscore_support(y_testb, y_pred, average = 'weighted')[2])

(0.3459845139550296, 0.355625, 0.3395067320665801, None)
0.3395067320665801


# PCA - Naive Bayes

In [89]:
df = pd.DataFrame(columns=['method', 'model', 'n_components', 'n_neighbors', 'accuracy', 'train', 'test', 'precision', 'recall', 'f1'])

In [95]:
#neighbors = [3,10,30]

neighbors = [5]

method_name = "PCA"

for i in range(50,150, 50):
    
    if(i % 10 ) == 0:
        print("n_components: ", i)
    
    for n in neighbors:
        
        method = PCA(n_components=i)
        
        t0 = time()
        print("fitting DR method - train")
        X_pca = method.fit(X_train).transform(X_train)
        X_pcab = method.fit(X_trainb).transform(X_trainb)
        t1 = time()
        print("%s: %.2g sec" % ('Time: ', t1 - t0))
        
        print("fitting DR method - test")
        t0 = time()
        X_pca_test = method.fit(X_test).transform(X_test)
        X_pca_testb = method.fit(X_testb).transform(X_testb)
        t1 = time()
        print("%s: %.2g sec" % ('Time: ', t1 - t0))
        
        
        print("fitting NB - balanced")
        t0 = time()
        model3.fit(X_pcab, y_trainb)
        acc = model3.score(X_pca_testb, y_testb)
        y_pred = model3.predict(X_pca_testb)
        prf = precision_recall_fscore_support(y_testb, y_pred, average = 'weighted')
        
        
        df = df.append({'method': method_name, 'model':'NB', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'balanced', 'test': 'balanced', 'precision': prf[0], 'recall': prf[1], 'f1': prf[2]}, ignore_index=True)
        acc = model3.score(X_pca_test, y_test)
        y_pred = model3.predict(X_pca_test)
        prf = precision_recall_fscore_support(y_test, y_pred, average = 'weighted')
        df = df.append({'method': method_name, 'model':'NB', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'balanced', 'test': 'unbalanced', 'precision': prf[0], 'recall': prf[1], 'f1': prf[2]}, ignore_index=True)
        t1 = time()
        print("%s: %.2g sec" % ('Time: ', t1 - t0))
        # trained on unbalanced dataset
        
        print("fitting NB - unbalanced")
        t0 = time()
        model3.fit(X_pca, y_train)
        acc = model3.score(X_pca_testb, y_testb)
        
                
        y_pred = model3.predict(X_pca_testb)
        prf = precision_recall_fscore_support(y_testb, y_pred, average = 'weighted')
        df = df.append({'method': method_name, 'model':'NB', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'unbalanced', 'test': 'balanced', 'precision': prf[0], 'recall': prf[1], 'f1': prf[2]}, ignore_index=True)
        
        acc = model3.score(X_pca_test, y_test)
        y_pred = model3.predict(X_pca_test)
        prf = precision_recall_fscore_support(y_test, y_pred, average = 'weighted')
        df = df.append({'method': method_name, 'model':'NB', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'unbalanced', 'test': 'unbalanced', 'precision': prf[0], 'recall': prf[1], 'f1': prf[2]}, ignore_index=True)
        t1 = time()
        print("%s: %.2g sec" % ('Time: ', t1 - t0))
        
        

n_components:  50
fitting DR method - train
Time: : 92 sec
fitting DR method - test
Time: : 7.1 sec
fitting NB - balanced
Time: : 1.4 sec
fitting NB - unbalanced
Time: : 2.5 sec
n_components:  100
fitting DR method - train
Time: : 1.1e+02 sec
fitting DR method - test
Time: : 11 sec
fitting NB - balanced
Time: : 2.1 sec
fitting NB - unbalanced
Time: : 3.5 sec
n_components:  150
fitting DR method - train
Time: : 1.1e+02 sec
fitting DR method - test
Time: : 8.8 sec
fitting NB - balanced
Time: : 2.3 sec
fitting NB - unbalanced
Time: : 4.2 sec
n_components:  200
fitting DR method - train
Time: : 1.4e+02 sec
fitting DR method - test
Time: : 13 sec
fitting NB - balanced
Time: : 2.8 sec
fitting NB - unbalanced
Time: : 5 sec
n_components:  250
fitting DR method - train
Time: : 1.7e+02 sec
fitting DR method - test
Time: : 16 sec
fitting NB - balanced
Time: : 3.6 sec
fitting NB - unbalanced
Time: : 6.4 sec
n_components:  300
fitting DR method - train
Time: : 2e+02 sec
fitting DR method - test
Tim

In [99]:
df[df["test"] == "unbalanced"].sort_values("f1", ascending=False).head(10)

Unnamed: 0,method,model,n_components,n_neighbors,accuracy,train,test,precision,recall,f1
59,PCA,NB,31,5,0.5255,unbalanced,unbalanced,0.443489,0.5255,0.469534
63,PCA,NB,33,5,0.52448,unbalanced,unbalanced,0.444224,0.52448,0.468273
51,PCA,NB,27,5,0.557022,unbalanced,unbalanced,0.438795,0.557022,0.466103
75,PCA,NB,39,5,0.514608,unbalanced,unbalanced,0.447631,0.514608,0.465848
67,PCA,NB,35,5,0.517612,unbalanced,unbalanced,0.443373,0.517612,0.465203
43,PCA,NB,23,5,0.56939,unbalanced,unbalanced,0.442898,0.56939,0.465182
71,PCA,NB,37,5,0.517693,unbalanced,unbalanced,0.443321,0.517693,0.464365
55,PCA,NB,29,5,0.545781,unbalanced,unbalanced,0.434269,0.545781,0.461341
47,PCA,NB,25,5,0.568343,unbalanced,unbalanced,0.438262,0.568343,0.460831
39,PCA,NB,21,5,0.57218,unbalanced,unbalanced,0.436685,0.57218,0.458899


In [100]:
df[df["test"] == "balanced"].sort_values("f1", ascending=False).head(10)

Unnamed: 0,method,model,n_components,n_neighbors,accuracy,train,test,precision,recall,f1
12,PCA,NB,9,5,0.385625,balanced,balanced,0.389076,0.385625,0.379426
16,PCA,NB,11,5,0.361875,balanced,balanced,0.372316,0.361875,0.361474
28,PCA,NB,17,5,0.35625,balanced,balanced,0.36946,0.35625,0.352461
44,PCA,NB,25,5,0.3575,balanced,balanced,0.359837,0.3575,0.352055
24,PCA,NB,15,5,0.354375,balanced,balanced,0.367979,0.354375,0.351877
40,PCA,NB,23,5,0.35,balanced,balanced,0.360687,0.35,0.346482
20,PCA,NB,13,5,0.348125,balanced,balanced,0.359462,0.348125,0.345319
32,PCA,NB,19,5,0.3475,balanced,balanced,0.359275,0.3475,0.343617
48,PCA,NB,27,5,0.34125,balanced,balanced,0.349432,0.34125,0.339215
36,PCA,NB,21,5,0.343125,balanced,balanced,0.351198,0.343125,0.338337


In [101]:
df.to_csv("nb_results1.csv")

# Further Random Forest

In [102]:
#neighbors = [3,10,30]

neighbors = [5]

method_name = "PCA"

for i in range(50,150, 25):
    
    print("n_components: ", i)
    
    for n in neighbors:
        
        method = PCA(n_components=i)
        
        t0 = time()
        print("fitting DR method - train")
        X_pca = method.fit(X_train).transform(X_train)
        X_pcab = method.fit(X_trainb).transform(X_trainb)
        t1 = time()
        print("%s: %.2g sec" % ('Time: ', t1 - t0))
        
        print("fitting DR method - test")
        t0 = time()
        X_pca_test = method.fit(X_test).transform(X_test)
        X_pca_testb = method.fit(X_testb).transform(X_testb)
        t1 = time()
        print("%s: %.2g sec" % ('Time: ', t1 - t0))
        
        
        print("fitting RF - balanced")
        t0 = time()
        model2.fit(X_pcab, y_trainb)
        acc = model2.score(X_pca_testb, y_testb)
        y_pred = model2.predict(X_pca_testb)
        prf = precision_recall_fscore_support(y_testb, y_pred, average = 'weighted')
        
        
        df = df.append({'method': method_name, 'model':'RF', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'balanced', 'test': 'balanced', 'precision': prf[0], 'recall': prf[1], 'f1': prf[2]}, ignore_index=True)
        acc = model2.score(X_pca_test, y_test)
        y_pred = model2.predict(X_pca_test)
        prf = precision_recall_fscore_support(y_test, y_pred, average = 'weighted')
        df = df.append({'method': method_name, 'model':'RF', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'balanced', 'test': 'unbalanced', 'precision': prf[0], 'recall': prf[1], 'f1': prf[2]}, ignore_index=True)
        t1 = time()
        print("%s: %.2g sec" % ('Time: ', t1 - t0))
        # trained on unbalanced dataset
        
        print("fitting RF - unbalanced")
        t0 = time()
        model2.fit(X_pca, y_train)
        acc = model2.score(X_pca_testb, y_testb)
        
                
        y_pred = model2.predict(X_pca_testb)
        prf = precision_recall_fscore_support(y_testb, y_pred, average = 'weighted')
        df = df.append({'method': method_name, 'model':'RF', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'unbalanced', 'test': 'balanced', 'precision': prf[0], 'recall': prf[1], 'f1': prf[2]}, ignore_index=True)
        
        acc = model2.score(X_pca_test, y_test)
        y_pred = model2.predict(X_pca_test)
        prf = precision_recall_fscore_support(y_test, y_pred, average = 'weighted')
        df = df.append({'method': method_name, 'model':'RF', 'n_components': i, 'n_neighbors': n, 'accuracy': acc, 'train': 'unbalanced', 'test': 'unbalanced', 'precision': prf[0], 'recall': prf[1], 'f1': prf[2]}, ignore_index=True)
        t1 = time()
        print("%s: %.2g sec" % ('Time: ', t1 - t0))
        
        

n_components:  50
fitting DR method - train
Time: : 1e+02 sec
fitting DR method - test
Time: : 7.1 sec
fitting RF - balanced
Time: : 27 sec
fitting RF - unbalanced
Time: : 1.1e+03 sec
n_components:  75
fitting DR method - train
Time: : 1.1e+02 sec
fitting DR method - test
Time: : 9.7 sec
fitting RF - balanced
Time: : 31 sec
fitting RF - unbalanced
Time: : 1.3e+03 sec
n_components:  100
fitting DR method - train


KeyboardInterrupt: 

In [103]:
df[df["model"] == "RF"].sort_values("f1", ascending=False).head(10)

Unnamed: 0,method,model,n_components,n_neighbors,accuracy,train,test,precision,recall,f1
107,PCA,RF,50,5,0.595252,unbalanced,unbalanced,0.444287,0.595252,0.459178
111,PCA,RF,75,5,0.594956,unbalanced,unbalanced,0.472996,0.594956,0.454187
104,PCA,RF,50,5,0.375625,balanced,balanced,0.376427,0.375625,0.360278
108,PCA,RF,75,5,0.343125,balanced,balanced,0.33306,0.343125,0.327052
105,PCA,RF,50,5,0.192998,balanced,unbalanced,0.432471,0.192998,0.240425
109,PCA,RF,75,5,0.192059,balanced,unbalanced,0.429381,0.192059,0.237321
106,PCA,RF,50,5,0.1325,unbalanced,balanced,0.153806,0.1325,0.044201
110,PCA,RF,75,5,0.128125,unbalanced,balanced,0.140925,0.128125,0.035517


In [104]:
df.to_csv("rfnb_results1.csv")

# Conclusions

Increasing number of components more than 50 led to decrease in performance of ml models. The models succeeded in learning 'somehow' the traits of the data -- but mostly *meta* and *update* flairs. Maybe it would be good to try out to create delegator saying if the flair is meta, update or something else delegated to another model.