In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Read the feature and label files, then write a reproducible 70/30 train/test split.
x = pd.read_csv('x.csv')
y = pd.read_csv('y.csv')

# A fixed random_state makes the shuffle deterministic: re-running this cell
# regenerates exactly the same partition instead of a new random one each time.
xTrain, xTest, yTrain, yTest = train_test_split(
    x, y, train_size=0.7, shuffle=True, random_state=42
)

# Persist every partition; index=False keeps the shuffled row index out of the files.
xTrain.to_csv('xTrain.csv', index=False)
xTest.to_csv('xTest.csv', index=False)
yTrain.to_csv('yTrain.csv', index=False)
yTest.to_csv('yTest.csv', index=False)

In [6]:
# Transform data using bag of words (term counts, vocabulary capped at 500 terms)
from sklearn.feature_extraction.text import CountVectorizer

xTrain = pd.read_csv('xTrain.csv')
xTest = pd.read_csv('xTest.csv')

# Blank documents are stored as empty CSV fields and come back from read_csv as NaN,
# which CountVectorizer refuses to process. Mapping them to '' keeps every row (so the
# features stay aligned with the saved labels) and yields an all-zero feature vector.
train_text = xTrain['cleaned_content'].fillna('')
test_text = xTest['cleaned_content'].fillna('')

vectorizer = CountVectorizer(max_features=500)
# The vocabulary is learned from the training split only and reused for the test split.
bow_train_matrix = vectorizer.fit_transform(train_text)
bow_test_matrix = vectorizer.transform(test_text)
feature_names = vectorizer.get_feature_names_out()

# Densify and label the columns with the vocabulary terms before saving.
bow_df_train = pd.DataFrame(bow_train_matrix.toarray(), columns=feature_names)
bow_df_test = pd.DataFrame(bow_test_matrix.toarray(), columns=feature_names)
bow_df_train.to_csv('bow_df_train.csv', index=False)
bow_df_test.to_csv('bow_df_test.csv', index=False)

In [None]:
# Transform data using binary term-presence features (vocabulary capped at 500 terms)
from sklearn.feature_extraction.text import CountVectorizer

xTrain = pd.read_csv('xTrain.csv')
xTest = pd.read_csv('xTest.csv')

# Blank documents are stored as empty CSV fields and come back from read_csv as NaN,
# which CountVectorizer refuses to process. Mapping them to '' keeps every row (so the
# features stay aligned with the saved labels) and yields an all-zero feature vector.
train_text = xTrain['cleaned_content'].fillna('')
test_text = xTest['cleaned_content'].fillna('')

# binary=True is passed so the vectorizer records term presence rather than counts.
vectorizer = CountVectorizer(max_features=500, binary=True)
# The vocabulary is learned from the training split only and reused for the test split.
binary_train_matrix = vectorizer.fit_transform(train_text)
binary_test_matrix = vectorizer.transform(test_text)
feature_names = vectorizer.get_feature_names_out()

# Densify and label the columns with the vocabulary terms before saving.
binary_df_train = pd.DataFrame(binary_train_matrix.toarray(), columns=feature_names)
binary_df_test = pd.DataFrame(binary_test_matrix.toarray(), columns=feature_names)
binary_df_train.to_csv('binary_df_train.csv', index=False)
binary_df_test.to_csv('binary_df_test.csv', index=False)

In [5]:
# Transform data using tf-idf (vocabulary capped at 500 terms)
from sklearn.feature_extraction.text import TfidfVectorizer

xTrain = pd.read_csv('xTrain.csv')
xTest = pd.read_csv('xTest.csv')

# Blank documents are stored as empty CSV fields and come back from read_csv as NaN,
# which TfidfVectorizer refuses to process. Mapping them to '' keeps every row (so the
# features stay aligned with the saved labels) and yields an all-zero feature vector.
train_text = xTrain['cleaned_content'].fillna('')
test_text = xTest['cleaned_content'].fillna('')

vectorizer = TfidfVectorizer(max_features=500)
# Vocabulary and IDF weights are learned from the training split only, then applied
# unchanged to the test split.
tfidf_train_matrix = vectorizer.fit_transform(train_text)
tfidf_test_matrix = vectorizer.transform(test_text)
feature_names = vectorizer.get_feature_names_out()

# Densify and label the columns with the vocabulary terms before saving.
tfidf_df_train = pd.DataFrame(tfidf_train_matrix.toarray(), columns=feature_names)
tfidf_df_test = pd.DataFrame(tfidf_test_matrix.toarray(), columns=feature_names)
tfidf_df_train.to_csv('tfidf_df_train.csv', index=False)
tfidf_df_test.to_csv('tfidf_df_test.csv', index=False)

In [5]:
# Transform data using hashing vectorizer (500 hashed feature columns)
from sklearn.feature_extraction.text import HashingVectorizer

xTrain = pd.read_csv('xTrain.csv')
xTest = pd.read_csv('xTest.csv')

# Blank documents are stored as empty CSV fields and come back from read_csv as NaN,
# which the vectorizer cannot tokenize. Mapping them to '' keeps every row (so the
# features stay aligned with the saved labels) and yields an all-zero feature vector.
train_text = xTrain['cleaned_content'].fillna('')
test_text = xTest['cleaned_content'].fillna('')

vectorizer = HashingVectorizer(n_features=500)
# The hashing vectorizer keeps no vocabulary, so fitting learns nothing from the data;
# fit_transform is used only to keep the call pattern uniform with the other cells.
hash_train_matrix = vectorizer.fit_transform(train_text)
hash_test_matrix = vectorizer.transform(test_text)

# No feature names exist for hashed columns, so the frames keep integer column labels.
hash_df_train = pd.DataFrame(hash_train_matrix.toarray())
hash_df_test = pd.DataFrame(hash_test_matrix.toarray())
hash_df_train.to_csv('hash_df_train.csv', index=False)
hash_df_test.to_csv('hash_df_test.csv', index=False)

In [None]:
# Non-negative matrix factorization (NMF) features derived from the saved TF-IDF matrices
from sklearn.decomposition import NMF

tfidf_df_train, tfidf_df_test = (
    pd.read_csv(csv_path) for csv_path in ('tfidf_df_train.csv', 'tfidf_df_test.csv')
)

# Fit a 10-component model on the training split only (seeded via random_state).
nmf = NMF(n_components=10, random_state=100).fit(tfidf_df_train)

# Project both splits through the fitted model and wrap the results in DataFrames.
nmf_df_train = pd.DataFrame(nmf.transform(tfidf_df_train))
nmf_df_test = pd.DataFrame(nmf.transform(tfidf_df_test))

# Write the component activations out, train first and then test.
for nmf_frame, csv_path in (
    (nmf_df_train, 'nmf_df_train.csv'),
    (nmf_df_test, 'nmf_df_test.csv'),
):
    nmf_frame.to_csv(csv_path, index=False)

In [6]:
# Extracting topic features using LDA

import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation

# LDA is a generative model over term counts, so it is fitted on the bag-of-words
# count matrices ('bow_df_train.csv' / 'bow_df_test.csv', written by the bag-of-words
# cell) rather than on TF-IDF weights, whose fractional values break that assumption.
counts_train = pd.read_csv('bow_df_train.csv')
counts_test = pd.read_csv('bow_df_test.csv')

# 10 topics, seeded so the fitted topics are repeatable between runs.
lda = LatentDirichletAllocation(n_components=10, random_state=42)

# Fit on the training split only and obtain its document-topic matrix.
X_lda_train = lda.fit_transform(counts_train)

# Apply the already fitted model to the test split.
X_lda_test = lda.transform(counts_test)

# Wrap the arrays in DataFrames (integer column labels) so they can be saved as CSV.
lda_df_train = pd.DataFrame(X_lda_train)
lda_df_test = pd.DataFrame(X_lda_test)

# Save the LDA features to CSV files
lda_df_train.to_csv('lda_df_train.csv', index=False)
lda_df_test.to_csv('lda_df_test.csv', index=False)



In [7]:
# PCA features derived from the saved TF-IDF matrices

import pandas as pd
from sklearn.decomposition import PCA

# Load the TF-IDF matrices that were written to disk earlier.
tfidf_df_train, tfidf_df_test = (
    pd.read_csv(csv_path) for csv_path in ('tfidf_df_train.csv', 'tfidf_df_test.csv')
)

# Keep 10 principal components; random_state is passed for repeatability.
pca = PCA(n_components=10, random_state=42)

# The projection is fitted on the training split and then reused as-is for the
# test split; both results are wrapped in DataFrames with integer column labels.
pca_df_train = pd.DataFrame(pca.fit_transform(tfidf_df_train))
pca_df_test = pd.DataFrame(pca.transform(tfidf_df_test))

# Write the component scores out, train first and then test.
for pca_frame, csv_path in (
    (pca_df_train, 'pca_df_train.csv'),
    (pca_df_test, 'pca_df_test.csv'),
):
    pca_frame.to_csv(csv_path, index=False)
