In [None]:
# Mount Google Drive at /content/drive (Colab-only); the CSV read below and
# the CSV writes at the end of the notebook all use paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

# 1. Load the Preprocessed Data

In [None]:
# Read the preprocessed statements table from the mounted Drive folder
preprocessed_csv_path = '/content/drive/MyDrive/SMA_Project/raw/combined_data_preprocessed.csv'
data = pd.read_csv(preprocessed_csv_path)

# Preview the first five rows as the cell output
data.head()

Unnamed: 0,label,statement,subject,speaker,speaker_job,speaker_state,speaker_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,cleaned_statement,tokenized_statement
0,False,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,say anies list political group suports thirdtr...,"['say', 'anies', 'list', 'political', 'group',..."
1,True,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,decline coal start started natural gas tok sta...,"['decline', 'coal', 'start', 'started', 'natur..."
2,True,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,hilary clinton agres john mcain voting give ge...,"['hilary', 'clinton', 'agres', 'john', 'mcain'..."
3,False,Health care reform legislation is likely to ma...,health-care,blog-posting,Unknown,Unknown,none,7.0,19.0,3.0,5.0,44.0,a news release,health care reform legislation likely mandate ...,"['health', 'care', 'reform', 'legislation', 'l..."
4,True,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,Unknown,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,economic turnaround started end term,"['economic', 'turnaround', 'started', 'end', '..."


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Metadata columns whose spread we want to inspect
columns_to_analyze = ['subject', 'speaker', 'speaker_job', 'speaker_state', 'speaker_affiliation']

# Report the variance of every listed column.
# Non-numeric columns are label-encoded into a *temporary* Series first;
# `data` itself is never overwritten, so the frame shown by `data.head()`
# stays valid and this cell gives the same result every time it is re-run.
for col in columns_to_analyze:
    column_values = data[col]
    if not pd.api.types.is_numeric_dtype(column_values):
        # LabelEncoder numbers the categories in sorted order of their names,
        # so the variance printed below depends on that arbitrary ordering.
        # NOTE(review): treat it as a rough cardinality signal, not as a
        # meaningful statistic for nominal data.
        le = LabelEncoder()
        column_values = pd.Series(le.fit_transform(column_values), index=data.index)

    # Sample variance (pandas default, ddof=1) of the numeric codes/values
    variance = column_values.var()
    print(f"Variance of {col}: {variance}")

Variance of subject: 1722129.0859813183
Variance of speaker: 924580.4444260062
Variance of speaker_job: 106306.0625097932
Variance of speaker_state: 525.9215218365573
Variance of speaker_affiliation: 40.080497637387644


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Metadata columns to screen for multicollinearity
columns_to_analyze = ['subject', 'speaker', 'speaker_job', 'speaker_state', 'speaker_affiliation']

# Columns for VIF analysis
columns_for_vif = ['subject', 'speaker', 'speaker_job', 'speaker_state', 'speaker_affiliation']

# Features at or above this VIF are treated as highly collinear
VIF_THRESHOLD = 3

# Work on a copy so that `data` is not overwritten by the integer codes;
# the cell is then idempotent and earlier displays of `data` stay accurate.
X = data[columns_for_vif].copy()
for col in columns_for_vif:
    if not pd.api.types.is_numeric_dtype(X[col]):
        # Label Encoding for non-numeric columns
        # NOTE(review): the codes follow the sorted category names, so the VIF
        # of nominal columns is only a rough screen - confirm this is intended.
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])

# variance_inflation_factor regresses column i on the other columns of the
# matrix it receives and does not add an intercept itself. Without a constant
# column the R^2 is uncentred and every VIF is inflated (previously all five
# features landed in the "high" group), so append one here. It is placed last
# so that positions 0..len(X.columns)-1 still map onto the real features.
vif_design = X.assign(const=1.0).to_numpy(dtype=float)

# Calculate VIF for each real feature (the constant itself is not scored)
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [variance_inflation_factor(vif_design, i) for i in range(len(X.columns))],
})
print(vif_data)

# Split the features on the VIF threshold
is_low_vif = vif_data["VIF"] < VIF_THRESHOLD
low_vif_features = vif_data.loc[is_low_vif, "feature"].tolist()
high_vif_features = vif_data.loc[~is_low_vif & vif_data["VIF"].notna(), "feature"].tolist()

# Create new dataframes with low and high vif features
X_low_vif = X[low_vif_features]
X_high_vif = X[high_vif_features]

print("\nFeatures with low multicollinearity (VIF < 3):")
print(X_low_vif.head())

print("\nFeatures with high multicollinearity (VIF >= 3):")
print(X_high_vif.head())


Features with low multicollinearity (VIF < 3):
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]

Features with high multicollinearity (VIF >= 3):
   subject  speaker  speaker_job  speaker_state  speaker_affiliation
0       35      915          993             62                   21
1     3057     2829          990             69                    7
2     3425      211          732             20                    7
3     3667      310         1094             64                   18
4     2603      487         1094             13                    7


# 2. Feature Representation

## 2.1 Bag of Words (BoW)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a document-term count matrix from the cleaned statements.
# The fitted vectorizer is kept because its vocabulary is looked up later
# when the selected feature names are reported.
bow_vectorizer = CountVectorizer()
bow_features = bow_vectorizer.fit_transform(data['cleaned_statement'])

# Report the matrix dimensions: (number of statements, vocabulary size)
bow_shape = bow_features.shape
print("Bag of Words Features Shape:", bow_shape)


Bag of Words Features Shape: (12790, 12196)


## 2.2 Term Frequency-Inverse Document Frequency (TF-IDF)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF weighted document-term matrix from the cleaned statements.
# The fitted vectorizer is kept because its vocabulary is looked up later
# when the selected feature names are reported.
tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(data['cleaned_statement'])

# Report the matrix dimensions: (number of statements, vocabulary size)
tfidf_shape = tfidf_features.shape
print("TF-IDF Features Shape:", tfidf_shape)

TF-IDF Features Shape: (12790, 12196)


## 2.3 GloVe Embeddings

In [None]:
import gensim.downloader as api

# Load the pretrained "glove-twitter-100" vectors through gensim's downloader
# (imported above as `api`). The result is stored in `glove_model`, which the
# embedding function below uses for per-word lookups.
print("Loading GloVe model...")
glove_model = api.load("glove-twitter-100")
print("GloVe model loaded.")

# Function to get GloVe embeddings
def get_glove_embedding(text, model=None):
    """Return the mean word vector of a whitespace-tokenised statement.

    Parameters
    ----------
    text : str
        Statement to embed. Anything that is not a string (for example a NaN
        from an empty CSV cell) is treated as having no tokens.
    model : optional
        Word-vector lookup supporting ``word in model``, ``model[word]`` and a
        ``vector_size`` attribute. Defaults to the notebook-level
        ``glove_model``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the element-wise mean of
        the vectors of all tokens found in the model, or an all-zero vector
        when no token is found.
    """
    if model is None:
        model = glove_model

    # Take the dimensionality from the model instead of hard-coding 100, so a
    # different pretrained model cannot produce zero vectors of the wrong size.
    dim = model.vector_size

    if not isinstance(text, str):
        return np.zeros(dim)

    embeddings = [model[word] for word in text.split() if word in model]
    if embeddings:
        return np.mean(embeddings, axis=0)
    return np.zeros(dim)

# Embed every cleaned statement and keep the per-row vectors on `data`
statement_vectors = data['cleaned_statement'].apply(get_glove_embedding)
data['glove_embedding'] = statement_vectors

# Turn the column of 1-D vectors into one 2-D matrix (one row per statement)
glove_features = np.vstack(data['glove_embedding'].to_numpy())

# Report the matrix dimensions
glove_shape = glove_features.shape
print("GloVe Features Shape:", glove_shape)


Loading GloVe model...
GloVe model loaded.
GloVe Features Shape: (12790, 100)


# 3. Feature Selection

## 3.1. Preparing the Target Variable

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the `label` column into integer class ids; the fitted encoder is kept
# in `label_encoder` so the ids can be mapped back to the original labels.
label_column = data['label']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(label_column)


## 3.2. Selecting Top 100 Features Using Chi-Square Test

### 3.2.1. For Bag of Words

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Score every BoW term against the target with chi-square and keep the 100
# highest-scoring columns.
# NOTE(review): the selector is fitted on all rows; if a train/test split
# happens later, confirm this does not leak test information.
chi2_selector = SelectKBest(score_func=chi2, k=100)
chi2_selector.fit(bow_features, y)
bow_kbest = chi2_selector.transform(bow_features)

# Map the retained column positions back to vocabulary terms
bow_feature_names = bow_vectorizer.get_feature_names_out()
bow_selected_indices = chi2_selector.get_support(indices=True)
bow_selected_features = bow_feature_names[bow_selected_indices].tolist()

print("Top 100 features selected from Bag of Words:")
print(bow_selected_features)

Top 100 features selected from Bag of Words:
['almost', 'american', 'among', 'average', 'barack', 'benghazi', 'bety', 'botom', 'bulb', 'bureaucrat', 'care', 'check', 'christian', 'clinton', 'colege', 'coruption', 'country', 'day', 'debt', 'decade', 'deciding', 'dufy', 'every', 'face', 'family', 'firearm', 'georgia', 'going', 'gov', 'graduation', 'group', 'grown', 'half', 'health', 'highest', 'hilary', 'hurt', 'income', 'increased', 'last', 'le', 'light', 'loses', 'lowest', 'manufacturing', 'milion', 'month', 'murphy', 'muslim', 'nation', 'nearly', 'obama', 'obamacare', 'obamas', 'percent', 'plan', 'poverty', 'prayer', 'president', 'price', 'radical', 'rate', 'rep', 'richest', 'say', 'scheme', 'scot', 'sean', 'security', 'senior', 'since', 'social', 'socialist', 'spending', 'spends', 'stadium', 'state', 'stil', 'stimulus', 'story', 'suports', 'suton', 'takeover', 'tery', 'texas', 'thre', 'time', 'today', 'told', 'top', 'wage', 'walker', 'weve', 'white', 'wil', 'wisconsin', 'world', 'wor

### 3.2.2. For TF-IDF

In [None]:
# Score every TF-IDF term against the target with chi-square and keep the 100
# highest-scoring columns.
# NOTE(review): the selector is fitted on all rows; if a train/test split
# happens later, confirm this does not leak test information.
chi2_selector = SelectKBest(score_func=chi2, k=100)
chi2_selector.fit(tfidf_features, y)
tfidf_kbest = chi2_selector.transform(tfidf_features)

# Map the retained column positions back to vocabulary terms
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_selected_indices = chi2_selector.get_support(indices=True)
tfidf_selected_features = tfidf_feature_names[tfidf_selected_indices].tolist()

print("Top 100 features selected from TF-IDF:")
print(tfidf_selected_features)


Top 100 features selected from TF-IDF:
['almost', 'american', 'average', 'barack', 'benghazi', 'bety', 'botom', 'bulb', 'bureaucrat', 'capandtrade', 'care', 'check', 'clinton', 'colege', 'congresman', 'coruption', 'country', 'cucineli', 'day', 'debt', 'decade', 'deciding', 'destroyed', 'doctor', 'dufy', 'every', 'everybody', 'face', 'firearm', 'georgia', 'going', 'governmentrun', 'graduation', 'group', 'grown', 'half', 'health', 'highest', 'hilary', 'hurt', 'husein', 'ilegals', 'incarceration', 'income', 'increased', 'last', 'loses', 'lowest', 'manufacturing', 'martin', 'mcaulife', 'mental', 'milion', 'month', 'murphy', 'muslim', 'nation', 'obama', 'obamacare', 'obamas', 'ohio', 'percent', 'plan', 'poverty', 'president', 'rate', 'rep', 'richest', 'say', 'scheme', 'scot', 'sean', 'security', 'since', 'social', 'socialist', 'spending', 'spends', 'stadium', 'state', 'stil', 'stimulus', 'story', 'suports', 'suton', 'takeover', 'tery', 'th', 'thre', 'time', 'today', 'top', 'torture', 'victo

## 3.3 Add Low-VIF Metadata Features After Chi-Square Feature Selection

### 3.3.1 Prepare VIF Features

In [None]:
# Metadata block to append to the text features: only the low-VIF columns are
# used (the high-VIF frame is not included). Taking a fresh frame with a
# 0..n-1 index lets it line up row-by-row with the feature tables built next.
X_vif_combined = X_low_vif.reset_index(drop=True)


### 3.3.2 Add VIF Features to Selected Features

In [None]:
def _append_vif_features(features_df, vif_df):
    """Return `features_df` with the columns of `vif_df` appended on the right.

    Rows are matched by position: both frames get a fresh 0..n-1 index before
    the column-wise concat. A row-count mismatch raises ValueError, because
    `pd.concat(axis=1)` would otherwise pad the shorter frame with NaN rows
    without any warning.
    """
    if len(features_df) != len(vif_df):
        raise ValueError(
            f"Row count mismatch: {len(features_df)} feature rows vs "
            f"{len(vif_df)} VIF rows"
        )
    return pd.concat(
        [features_df.reset_index(drop=True), vif_df.reset_index(drop=True)],
        axis=1,
    )


# For BoW Selected Features: densify the selected sparse columns, name them
# after their vocabulary terms, then append the VIF metadata columns.
bow_selected_df = _append_vif_features(
    pd.DataFrame(bow_kbest.toarray(), columns=bow_selected_features),
    X_vif_combined,
)
print("BoW Features Shape after adding VIF features:", bow_selected_df.shape)

# For TF-IDF Selected Features: same treatment as BoW.
tfidf_selected_df = _append_vif_features(
    pd.DataFrame(tfidf_kbest.toarray(), columns=tfidf_selected_features),
    X_vif_combined,
)
print("TF-IDF Features Shape after adding VIF features:", tfidf_selected_df.shape)

# For GloVe Features: embedding dimensions keep their default integer column
# names (0, 1, 2, ...), followed by the VIF metadata columns.
glove_df = _append_vif_features(pd.DataFrame(glove_features), X_vif_combined)
print("GloVe Features Shape after adding VIF features:", glove_df.shape)
glove_df.shape


BoW Features Shape after adding VIF features: (12790, 100)
TF-IDF Features Shape after adding VIF features: (12790, 100)
GloVe Features Shape after adding VIF features: (12790, 100)


  glove_df = pd.concat([glove_df, X_vif_combined], axis=1)


(12790, 100)

In [None]:
# Sanity check: display the (rows, columns) shape of the GloVe feature table.
glove_df.shape

(12790, 100)

# 4. Save Processed Data for Model Training

## 4.1 Saving BoW Selected Features

In [None]:
# Write the BoW feature table to Drive as CSV, without the row index
bow_output_path = '/content/drive/MyDrive/SMA_Project/raw/bow_selected_features.csv'
bow_selected_df.to_csv(bow_output_path, index=False)


## 4.2 Saving TF-IDF Selected Features

In [None]:
# Write the TF-IDF feature table to Drive as CSV, without the row index
tfidf_output_path = '/content/drive/MyDrive/SMA_Project/raw/tfidf_selected_features.csv'
tfidf_selected_df.to_csv(tfidf_output_path, index=False)


## 4.3 Saving GloVe Embeddings

In [None]:
# Write the GloVe feature table to Drive as CSV, without the row index
glove_output_path = '/content/drive/MyDrive/SMA_Project/raw/glove_features.csv'
glove_df.to_csv(glove_output_path, index=False)


## 4.4 Saving Labels

In [None]:
# Wrap the encoded target in a single-column frame named `label`
labels_df = pd.Series(y, name='label').to_frame()

# Write it to Drive as CSV, without the row index
labels_output_path = '/content/drive/MyDrive/SMA_Project/raw/labels.csv'
labels_df.to_csv(labels_output_path, index=False)


In [None]:
# Preview the first rows of the encoded label table.
labels_df.head()

Unnamed: 0,label
0,0
1,1
2,1
3,0
4,1
