In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

All Those features with missing values with various reasons are excluded

In [2]:
DATA_PATH = '../data/'
FEATURES_DATA = "Selected_features.csv"
RANDOM_STATE = 42

In [3]:
LF = pd.read_csv(DATA_PATH + FEATURES_DATA, low_memory=False)

In [4]:
LF.head()


Unnamed: 0,Letters_per_Word,Number_of_Types,Number_of_Words,Number_of_Words_per_Sentence,TTR,Determiners,Pronouns,First_Person_Pronouns,Negations,Positive_Words,...,reason_cause,time_sequence,Flesch_Reading_Ease,Flesch_Kincaid_Grade_Level,Gunning_Fog_Index,SMOG_Index,Coleman_Liau_Index,Dale_chall_readability,difficult_words,Linsear_write_formula
0,3.0,4,4,4,1.0,0,0,0,0,0,...,0,0,118.18,-2.3,1.6,0,-5.81,0.2,0,1.0
1,3.0,11,12,12,0.916667,2,0,0,0,0,...,0,0,110.06,0.9,4.8,0,-0.78,9.5,0,5.0
2,3.076923,12,13,13,0.923077,2,0,0,0,0,...,0,0,100.58,2.5,5.2,0,-0.31,7.93,1,5.5
3,3.076923,12,13,13,0.923077,2,0,0,0,0,...,0,0,100.58,2.5,5.2,0,-0.31,7.93,1,5.5
4,3.090909,9,11,11,0.818182,3,0,0,0,0,...,0,0,111.07,0.5,4.4,0,-0.55,7.05,0,4.5


In [5]:
LF.columns

Index(['Letters_per_Word', 'Number_of_Types', 'Number_of_Words',
       'Number_of_Words_per_Sentence', 'TTR', 'Determiners', 'Pronouns',
       'First_Person_Pronouns', 'Negations', 'Positive_Words',
       'Negative_Words', 'ARI', 'CLI', 'SPP', 'TPP', 'Swear', 'Key_conectors',
       'LIWC_pronouns', 'LIWC_psychological', 'NNP', 'VBZ', 'RP', 'NN', 'DT',
       'VBG', 'IN', 'TO', 'RB', 'JJ', 'PRP', 'VBN', 'PRP_', 'VBD', 'CC', 'VBP',
       'CD', 'RBS', 'WRB', 'VB', 'MD', 'JJR', 'EX', 'RBR', 'WP', 'WDT', 'UH',
       'PDT', 'label', 'add_info', 'contrast', 'emphasis', 'expressing_facts',
       'expressing_opinion', 'reason_cause', 'time_sequence',
       'Flesch_Reading_Ease', 'Flesch_Kincaid_Grade_Level',
       'Gunning_Fog_Index', 'SMOG_Index', 'Coleman_Liau_Index',
       'Dale_chall_readability', 'difficult_words', 'Linsear_write_formula'],
      dtype='object')

#### Features having little variability (standarddeviation close to zero) are discarded

In [6]:
numeric_data = LF.select_dtypes(include=[np.number])

In [7]:
std_devs = numeric_data.std()

In [8]:
threshold_for_std_dev = 0.01

In [9]:
exclude_features_1 = std_devs[std_devs <= threshold_for_std_dev].index.tolist()

In [10]:
exclude_features_1

['SMOG_Index']

In [11]:
mean_data = numeric_data.mean()

In [12]:
mean_threshold = 0.01

In [13]:
exclude_features_2 = mean_data[mean_data <= mean_threshold].index.tolist()

In [14]:
exclude_features_2

['Negative_Words',
 'Swear',
 'RBS',
 'EX',
 'RBR',
 'WDT',
 'UH',
 'PDT',
 'emphasis',
 'expressing_facts',
 'expressing_opinion',
 'SMOG_Index']

In [15]:
data_cleaned = LF.drop(columns=exclude_features_1 + exclude_features_2 )

In [16]:
non_numeric_columns = data_cleaned.select_dtypes(include=['object']).columns
non_numeric_columns


Index(['label'], dtype='object')

In [17]:
scaler = StandardScaler()
pca = PCA()

In [18]:
data_scaled = scaler.fit_transform(numeric_data)

In [19]:
pca.fit(data_scaled)

In [20]:
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = explained_variance_ratio.cumsum()

In [21]:
pca_summary = pd.DataFrame({
    'Principal Component': [f'PC{i+1}' for i in range(len(explained_variance_ratio))],
    'Explained Variance Ratio': explained_variance_ratio,
    'Cumulative Variance': cumulative_variance
})

In [22]:
feature_loadings = pd.DataFrame(pca.components_, columns=numeric_data.columns).iloc[:5]

In [23]:
pca_summary.head(), feature_loadings

(  Principal Component  Explained Variance Ratio  Cumulative Variance
 0                 PC1                  0.117656             0.117656
 1                 PC2                  0.103192             0.220848
 2                 PC3                  0.046975             0.267823
 3                 PC4                  0.038618             0.306441
 4                 PC5                  0.036223             0.342664,
    Letters_per_Word  Number_of_Types  Number_of_Words  \
 0          0.212649        -0.311835        -0.316733   
 1          0.296144         0.155484         0.154887   
 2          0.102878        -0.120970        -0.115794   
 3          0.007548         0.036643         0.042831   
 4          0.076939        -0.106392        -0.110153   
 
    Number_of_Words_per_Sentence       TTR  Determiners  Pronouns  \
 0                     -0.316783  0.119613    -0.148462 -0.225757   
 1                      0.154839 -0.021773    -0.001789  0.045273   
 2                    

In [24]:
loadings = pd.DataFrame(pca.components_, columns=numeric_data.columns)
important_features = pd.DataFrame({
    'PC1': loadings.iloc[0].abs().nlargest(5).index,
    'PC2': loadings.iloc[1].abs().nlargest(5).index,
    'PC3': loadings.iloc[2].abs().nlargest(5).index,
    'PC4': loadings.iloc[3].abs().nlargest(5).index,
    'PC5': loadings.iloc[4].abs().nlargest(5).index
})

important_features

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
0,Number_of_Words_per_Sentence,Flesch_Kincaid_Grade_Level,Pronouns,Negations,CC
1,Number_of_Words,ARI,LIWC_pronouns,Positive_Words,add_info
2,Number_of_Types,Flesch_Reading_Ease,PRP,RB,Key_conectors
3,Linsear_write_formula,CLI,NNP,Determiners,TO
4,NNP,Coleman_Liau_Index,TPP,DT,Negations


In [25]:
unique_features = set(important_features.PC1) | set(important_features.PC2) | \
                  set(important_features.PC3) | set(important_features.PC4) | \
                  set(important_features.PC5)


In [26]:
reduced_data = numeric_data[list(unique_features)]

In [27]:
reduced_data.head()

Unnamed: 0,Negations,Pronouns,Key_conectors,NNP,Number_of_Words,Number_of_Types,add_info,ARI,LIWC_pronouns,TPP,...,PRP,Number_of_Words_per_Sentence,Flesch_Reading_Ease,CC,DT,Positive_Words,CLI,TO,Coleman_Liau_Index,Linsear_write_formula
0,0,0,0,3,4,4,0,-5.4,0,0,...,0,4,118.18,0,0,0,-5.81,0,-5.81,1.0
1,0,0,0,4,12,11,0,-1.4,0,0,...,0,12,110.06,0,2,0,-0.78,2,-0.78,5.0
2,0,0,1,3,13,12,1,-0.5,0,0,...,0,13,100.58,1,2,0,-0.31,0,-0.31,5.5
3,0,0,1,3,13,12,1,-0.5,0,0,...,0,13,100.58,1,2,0,-0.31,0,-0.31,5.5
4,0,0,0,2,11,9,0,-1.5,0,0,...,0,11,111.07,0,3,0,-0.55,0,-0.55,4.5


In [28]:
Significant_features = reduced_data.copy()
Significant_features['label'] = LF['label']

Significant_features.head()

Unnamed: 0,Negations,Pronouns,Key_conectors,NNP,Number_of_Words,Number_of_Types,add_info,ARI,LIWC_pronouns,TPP,...,Number_of_Words_per_Sentence,Flesch_Reading_Ease,CC,DT,Positive_Words,CLI,TO,Coleman_Liau_Index,Linsear_write_formula,label
0,0,0,0,3,4,4,0,-5.4,0,0,...,4,118.18,0,0,0,-5.81,0,-5.81,1.0,fake
1,0,0,0,4,12,11,0,-1.4,0,0,...,12,110.06,0,2,0,-0.78,2,-0.78,5.0,real
2,0,0,1,3,13,12,1,-0.5,0,0,...,13,100.58,1,2,0,-0.31,0,-0.31,5.5,fake
3,0,0,1,3,13,12,1,-0.5,0,0,...,13,100.58,1,2,0,-0.31,0,-0.31,5.5,fake
4,0,0,0,2,11,9,0,-1.5,0,0,...,11,111.07,0,3,0,-0.55,0,-0.55,4.5,real


In [33]:
Significant_features.columns

Index(['Negations', 'Pronouns', 'Key_conectors', 'NNP', 'Number_of_Words',
       'Number_of_Types', 'add_info', 'ARI', 'LIWC_pronouns', 'TPP',
       'Determiners', 'Flesch_Kincaid_Grade_Level', 'RB', 'PRP',
       'Number_of_Words_per_Sentence', 'Flesch_Reading_Ease', 'CC', 'DT',
       'Positive_Words', 'CLI', 'TO', 'Coleman_Liau_Index',
       'Linsear_write_formula', 'label'],
      dtype='object')

In [29]:
DATA_PATH = "../data/"
FILENAME = 'Significant_features.csv'

In [30]:
file_path = DATA_PATH + FILENAME

In [31]:
Significant_features.to_csv(DATA_PATH + FILENAME, index=False)