In [None]:
# %pip install --upgrade pip setuptools wheel
# %pip install word2vec
# %pip install gensim
# %pip install tfidf
# %pip install cleantext


In [None]:
import gensim
import pprint
import csv
from gensim import corpora
from collections import defaultdict
import spacy

In [None]:
# Load the aggregated CV corpus: every CSV row becomes one document (a list of fields).
with open('agg-cv-v2.csv', newline='') as f:
    text_corpus = list(csv.reader(f))

# Drop rows whose fields are all empty or whitespace-only.
# NOTE(review): the labels are read from a separate file ('agg-cv-labels.csv') later on;
# any row dropped here must also be absent there, otherwise documents and labels
# drift out of alignment -- confirm.
text_corpus_p0 = [row for row in text_corpus if any(field.strip() for field in row)]

# Preview only the first few documents: printing the whole corpus floods the notebook
# output and stores the full CV text inside the saved notebook.
for row in text_corpus_p0[:5]:
    print(", ".join(row))


In [None]:
# Number of documents left after the blank CSV rows were filtered out.
print(len(text_corpus_p0))

In [None]:
# Filled by later cells (frequency filter / lemmatization); kept so the name always exists.
text_corpus_p3 = []

# Anonymisation placeholders that occur in the raw CV text. They must be removed from
# the raw string: simple_preprocess() keeps alphabetic characters only, so '<company>'
# is tokenized to 'company' and a token-level stoplist entry '<company>' never matches.
# Removing the placeholder itself (rather than the bare word) keeps genuine uses of
# words such as "company" in the corpus.
placeholders = ('<email>', '<url>', '<phonenumber>', '<company>')

# Tokenize every document: lower-case, strip accents, keep tokens of 6..50 characters.
text_corpus_p1 = []
for line in text_corpus_p0:
    # csv.reader yields one list of fields per row; merge the fields into one string.
    if isinstance(line, list):
        line = ' '.join(line)
    # Lower-case first so the placeholder match is case-insensitive
    # (simple_preprocess lower-cases the text anyway).
    line = line.lower()
    for placeholder in placeholders:
        line = line.replace(placeholder, ' ')
    text = gensim.utils.simple_preprocess(line, deacc=True, min_len=6, max_len=50)
    text_corpus_p1.append(text)

# Token-level stoplist. With min_len=6 above, the short function words are already
# dropped by the tokenizer; the list only becomes effective if min_len is lowered.
stoplist = set('for a of the and to in <email> <url> <phonenumber> <company>'.split(' '))

# Filter out stopwords
text_corpus_p2 = [[word for word in doc if word not in stoplist] for doc in text_corpus_p1]




In [None]:
# Drop rare tokens: keep a token only if it occurs more than MIN_TOKEN_COUNT times in
# the whole corpus (every occurrence counts, not just one per document).
# Open question from the author: does this filtering help? Check against prediction results.
# NOTE(review): the lemmatization cell further down rebuilds text_corpus_p3 from
# text_corpus_p2, so on a top-to-bottom run the result of this cell is overwritten.
MIN_TOKEN_COUNT = 3

# Count corpus-wide token frequencies.
frequency = defaultdict(int)
for text in text_corpus_p2:
    for token in text:
        frequency[token] += 1

# Keep only the tokens above the threshold.
text_corpus_p3 = [[token for token in text if frequency[token] > MIN_TOKEN_COUNT] for text in text_corpus_p2]

In [None]:
# Lemmatization: replace every token by its spaCy lemma.

# Load the SpaCy model. The dependency parser and the named-entity recognizer are
# switched off because only lemmas are read below.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Re-join each token list into one string and stream the strings through nlp.pipe,
# which processes documents in batches instead of one nlp() call per document.
# NOTE(review): the input is text_corpus_p2, so the frequency-filtered text_corpus_p3
# produced by the earlier cell is replaced here -- confirm that this is intended.
joined_docs = (' '.join(sublist) for sublist in text_corpus_p2)
text_corpus_p3 = [[token.lemma_ for token in doc] for doc in nlp.pipe(joined_docs)]

In [None]:
# Preview the processed corpus. Only the first few documents are shown: printing
# every document floods the notebook output.
for doc in text_corpus_p3[:5]:
    print(doc)


In [None]:
# Build the gensim Dictionary from the processed corpus; it maps every distinct
# token in text_corpus_p3 to an integer id.
dictionary = corpora.Dictionary(text_corpus_p3)
print(dictionary)

In [None]:
# Show a sample of the token -> id mapping rather than the complete vocabulary.
pprint.pprint(dict(list(dictionary.token2id.items())[:20]))

In [None]:
# # test a phrase and generate a vector

# new_doc = "excellent student physics university student bananas"
# new_vec = dictionary.doc2bow(new_doc.lower().split())
# print(new_vec)

In [None]:
# Convert every document into its sparse bag-of-words vector of (token_id, count) pairs.
bow_corpus = [dictionary.doc2bow(text) for text in text_corpus_p3]
# Preview the first few vectors only; the full corpus is too large to print usefully.
pprint.pprint(bow_corpus[:5])

In [None]:
# # train model to convert bow to vectors

from gensim import models

# # train the model
# tfidf = models.TfidfModel(bow_corpus)

# # transform the "system minors" string
# words = "physics student university".lower().split()
# print(tfidf[dictionary.doc2bow(words)])

In [None]:
# # generate a similarity index

# from gensim import similarities

# index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=13373)

In [None]:
# # query the similarity of a document to given string
# query_document = 'system engineering'.split()
# query_bow = dictionary.doc2bow(query_document)
# sims = index[tfidf[query_bow]]
# print(list(enumerate(sims)))

In [None]:
# # same as above, but ordered

# for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
#     print(document_number, score)

# Corpora and Vector Spaces

In [None]:
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Persist the token -> id dictionary to disk, then print its summary.
dictionary.save('all_applicants.dict')  # store the dictionary, for future reference
print(dictionary)

In [None]:
# print(dictionary.token2id)

In [None]:
# # generate vectors with tf-idf for every line in the corpus
# vector_corpus_tfidf = []

# for line in text_corpus_p3:
#     vector_corpus_tfidf.append(tfidf[dictionary.doc2bow(line)])

#     # corpus = [dictionary.doc2bow(text) for text in texts] # shorter, without tf-idf

In [None]:
# # Print the processed corpus
# for vec in vector_corpus_tfidf:
#     print(vec)


Everything above keeps the whole corpus in memory. If you have many documents, you can instead stream them and process one document at a time:

from smart_open import open  # for transparently opening remote files


class MyCorpus:
    """Streamed corpus: yields one bag-of-words vector per line of the remote example file.

    Iterating reads the file line by line, so the whole corpus never has to be held
    in memory at once. Relies on the module-level ``dictionary`` for the token ids.
    """

    def __iter__(self):
        # The context manager closes the stream as soon as iteration finishes or is
        # abandoned, instead of leaving the handle open until garbage collection.
        with open('https://radimrehurek.com/mycorpus.txt') as stream:
            for line in stream:
                # assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(line.lower().split())

The full power of Gensim comes from the fact that a corpus doesn’t have to be a list, or a NumPy array, or a Pandas dataframe, or whatever. Gensim accepts any object that, when iterated over, successively yields documents.

https://radimrehurek.com/gensim/auto_examples/core/run_corpora_and_vector_spaces.html

## LDA Conversion

In [None]:
# Train the LDA topic model on the bag-of-words corpus.
# Author's timing note: about 1 minute per 1000 topics.
# random_state fixes the model's random initialisation so that a re-run yields the same
# topics; it uses the same seed value (42) as the other stochastic steps in this notebook.
lda = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=6500, random_state=42)


In [None]:
# Save the model. The topic count in the file name is read from the model itself so
# the name cannot drift from the model that is actually stored
# (6500 topics -> 'lda_model_6500topics.model').
lda.save(f'lda_model_{lda.num_topics}topics.model')

In [None]:
#load the model
# lda = models.LdaModel.load('lda_model_2500topics.model')

In [None]:
# Infer the sparse LDA topic distribution of every document in the processed corpus.
# Author's timing note: 70 seconds for 1000 topics.
vector_corpus_lda = [lda[dictionary.doc2bow(tokens)] for tokens in text_corpus_p3]


In [None]:
# Preview the LDA vectors of the first few documents only; printing all of them
# floods the notebook output.
for vec in vector_corpus_lda[:5]:
    print(vec)


# Convert Sparse to Dense Vectors

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

In [None]:
# Topic count of the trained model; used below as the length of each dense vector.
num_topics = lda.num_topics
print(num_topics)


In [None]:
def sparse_to_dense(sparse_vec, num_topics):
    """Expand a sparse topic vector into a dense NumPy array.

    Args:
        sparse_vec: iterable of ``(topic_id, topic_prob)`` pairs.
        num_topics: length of the returned array.

    Returns:
        Array of ``num_topics`` floats holding ``topic_prob`` at index ``topic_id``
        and 0.0 everywhere else. If a topic id occurs more than once, the last
        pair wins.
    """
    pairs = list(sparse_vec)
    dense = np.zeros(num_topics)
    if pairs:
        # Split the pairs into parallel id / probability sequences and assign in one go.
        topic_ids, topic_probs = zip(*pairs)
        dense[list(topic_ids)] = topic_probs
    return dense

# One dense row per document, stacked into a single array.
dense_rows = [sparse_to_dense(topic_dist, num_topics) for topic_dist in vector_corpus_lda]
dense_vectors = np.array(dense_rows)


In [None]:
# Read the class labels: the first column of the header-less label file.
df = pd.read_csv('agg-cv-labels.csv', header=None)
labels = df.iloc[:,0].tolist()

# The labels are paired with the document vectors purely by position, so the two must
# have the same length. Fail here with a clear message instead of deep inside
# train_test_split.
assert len(labels) == len(dense_vectors), (
    f'{len(labels)} labels but {len(dense_vectors)} document vectors - '
    'label file and corpus are out of sync'
)

In [None]:
# Preview the first labels instead of printing the complete list.
print(labels[:20])

In [None]:
# Number of labels read from the label file.
print(len(labels))

# Classifier

In [None]:
# Hold out 20% of the dense LDA vectors (and their labels) as the test set; fixed seed.
X_train, X_test, y_train, y_test = train_test_split(dense_vectors, labels, test_size=0.2, random_state=42)

In [None]:
# Random forest with 1000 trees, fitted on the training split of the dense LDA vectors.
clf = RandomForestClassifier(n_estimators=1000, random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Score the fitted forest on the held-out test split.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Search space for the random forest: 4 tree counts x 4 depth limits = 16 candidates.
param_grid = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[None, 10, 20, 30],
)

# Base estimator; the seed is fixed so every candidate is repeatable.
rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated grid search, scored on accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its mean cross-validation accuracy.
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f'Best Parameters: {best_params}')
print(f'Best CV Accuracy: {best_score}')


# Output recorded from an earlier run:
# Fitting 5 folds for each of 16 candidates, totalling 80 fits
# Best Parameters: {'max_depth': 10, 'n_estimators': 300}
# Best CV Accuracy: 0.6451469848421881

In [None]:
# # Scale the features
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# dense_vectors = scaler.fit_transform(dense_vectors)

# # Train-Test Split
# X_train, X_test, y_train, y_test = train_test_split(dense_vectors, labels, test_size=0.2, random_state=42)

# # Define SVM model and hyperparameter grid
# svm = SVC()
# param_grid = {
#     'C': [0.1, 1], # 10,100
#     'gamma': [1, 0.1, 0.01, 0.001],
#     'kernel': ['rbf', 'linear']
# }

# # Perform Grid Search
# grid = GridSearchCV(svm, param_grid, refit=True, verbose=2, cv=5)
# grid.fit(X_train, y_train)

# # Make Predictions and Evaluate
# y_pred = grid.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print(f'Best Parameters: {grid.best_params_}')
# print(f'Accuracy: {accuracy}')

# # Best Parameters: {'C': 1, 'gamma': 1, 'kernel': 'linear'}
# # Accuracy: 0.6427480916030535


# PCA

In [None]:
from sklearn.decomposition import PCA
import plotly.express as px
import nbformat
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# Seed shared by the PCA and KMeans steps below.
randomseed=42

In [None]:
# Project the dense LDA vectors onto their top 100 principal components.
# NOTE(review): the projection is fitted on all documents, i.e. before the train/test
# split that is applied to vector_corpus_lda_pca further down, so held-out rows
# influence the components -- confirm this is acceptable for the reported accuracies.
pca1 = PCA(n_components=100, svd_solver='randomized', random_state=randomseed) # Initialize with n_components parameter to only find the top eigenvectors
vector_corpus_lda_pca = pca1.fit_transform(dense_vectors)

In [None]:
# Split the PCA-reduced documents into 2 KMeans clusters (seeded); the resulting
# cluster ids colour the 3D scatter plot below.
kmeans = KMeans(n_clusters=2,n_init=100, random_state=randomseed, init='k-means++')
kmeanclusters = kmeans.fit_predict(vector_corpus_lda_pca)

In [None]:
# First three principal components of every document.
df = pd.DataFrame(vector_corpus_lda_pca[:,0:3], columns=['PC1', 'PC2', 'PC3'])
# Cluster ids are category labels, not quantities. Casting them to strings makes
# plotly use discrete colours with a legend instead of a continuous colour scale.
df['cluster'] = kmeanclusters.astype(str)

fig = px.scatter_3d(df, x='PC1', y='PC2', z='PC3', title='3D Visualization of Top 3 Principal Components',
                    width=1000,
                    height=800,
                    color='cluster'
)

# Small markers keep the dense point cloud readable.
fig.update_traces(marker=dict(size=2))

fig.show()

In [None]:
# Train-Test Split of the PCA-reduced vectors: 20% held out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(vector_corpus_lda_pca, labels, test_size=0.2, random_state=42)

In [None]:
# Baseline on the PCA-reduced features: plain logistic regression.
# Initialize the Logistic Regression model (up to 1000 solver iterations)
model_LR = LogisticRegression(max_iter=1000)

# Fit the model to the training data
model_LR.fit(X_train, y_train)

# Predict labels for the test set
y_pred = model_LR.predict(X_test)

# Calculate accuracy on the held-out split
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

In [None]:
# Logistic regression on the PCA-reduced features, with the regularisation strength C
# picked from Cs by 5-fold cross-validation.
# multi_class is deliberately not passed: the liblinear solver only supports the
# one-vs-rest scheme and already uses it by default, and the argument is deprecated
# in recent scikit-learn releases.
log_reg_c = LogisticRegressionCV(cv=5,Cs=[0.001,0.01,0.1,1,10],max_iter=5000,penalty="l2",solver="liblinear")
log_reg_c.fit(X_train, y_train)

In [None]:
# Score of the cross-validated model on the training split
print(log_reg_c.score(X_train, y_train))

# Score on the held-out test split
print(log_reg_c.score(X_test, y_test))

# Classifier: XGBoost

In [None]:
# %pip install xgboost

In [None]:
import xgboost as xgb

In [None]:
# 80/20 split of the PCA-reduced vectors with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    vector_corpus_lda_pca, labels, test_size=0.2, random_state=42
)

# Hyper-parameters of the gradient-boosted classifier, collected in one place.
xgb_params = {
    'objective': 'binary:logistic',  # binary classification
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0,
    'scale_pos_weight': 1,
    'random_state': 42,
}
xgb_model = xgb.XGBClassifier(**xgb_params)

# Fit on the training split, then predict the held-out rows.
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# Report test accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')