# Install All necessary Packages

In [1]:
!pip install gensim




In [2]:
pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
from numpy import triu  # Correct import for triu
from scipy.linalg import lu
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

print("Numpy, Scipy, Gensim, Matplotlib, and Scikit-learn imported successfully.")


Numpy, Scipy, Gensim, Matplotlib, and Scikit-learn imported successfully.


In [4]:
import pandas as pd
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report


# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

print("All imports successful.")


All imports successful.


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/harshikhaagarwal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/harshikhaagarwal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/harshikhaagarwal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Set Up Directory 

In [5]:
# Define the file path
file_path = 'final_sample.csv'

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)
num_rows = df.shape[0]

print(f'The number of rows excluding the column names is: {num_rows}')
print(df.columns)


The number of rows excluding the column names is: 5000
Index(['pgpub_id', 'application_id', 'filing_date', 'patent_type',
       'filing_type', 'published_date', 'wipo_kind', 'series_code',
       'application_title', 'application_abstract', 'rule_47_flag', 'filename',
       'rel_app_text', 'patent_id', 'current_pgpub_id_flag',
       'current_patent_id_flag', 'uspc_sequence', 'uspc_mainclass_id',
       'uspc_mainclass_title', 'uspc_subclass_id', 'uspc_subclass_title',
       'published_or_filed_date', 'pct_371_date', 'pct_102_date',
       'filed_country', 'application_kind', 'pct_doc_number', 'pct_doc_type',
       'wipo_field_sequence', 'wipo_field_id', 'wipo_sector_title',
       'wipo_field_title', 'priority_claim_sequence', 'priority_claim_kind',
       'foreign_application_id', 'foreign_filing_date',
       'foreign_country_filed', 'cpc_sequence', 'cpc_section', 'cpc_class',
       'cpc_subclass', 'cpc_group', 'cpc_type', 'cpc_version_indicator',
       'cpc_subclass_title', '

In [6]:
pgpub_id_unique = df['pgpub_id'].is_unique

# Print the result
if pgpub_id_unique:
    print("All pgpub_id values are unique.")
else:
    print("There are duplicate pgpub_id values.")

All pgpub_id values are unique.


In [7]:
df['approval_status'] = np.where(df['patent_id'].isnull(), 0, 1)
print(df['approval_status'].head())

0    1
1    0
2    1
3    0
4    1
Name: approval_status, dtype: int64


# Clean and Pre-Process the Text Variable

In [8]:
# Preprocessing function
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove punctuation and convert to lowercase
    tokens = [word.lower() for word in tokens if word.isalpha()]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Lemmatization
    #lemmatizer = WordNetLemmatizer()
    #tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Apply preprocessing to application_abstract column
df['application_abstract'] = df['application_abstract'].apply(preprocess_text)

# Apply preprocessing to rel_app_text column
df['rel_app_text'] = df['rel_app_text'].apply(preprocess_text)

# Calculate total number of unique words in the entire dataset
total_unique_words = len(set(' '.join(df['application_abstract']).split()))

# Calculate average number of unique words per patent application
average_unique_words_per_application = total_unique_words / len(df)

# Tokenize each abstract and count the number of words for each patent application
df['word_count'] = df['application_abstract'].apply(lambda x: len(x.split()))

# Calculate average number of words for every patent application
average_words_per_application = df['word_count'].mean()

print("Total unique words in the entire dataset:", total_unique_words)
print("Average number of words in patent application abstract:", average_words_per_application)
print("Average unique words per patent application abstract:", average_unique_words_per_application)

Total unique words in the entire dataset: 8829
Average number of words in patent application abstract: 61.994
Average unique words per patent application abstract: 1.7658


# Split Data into Train and Test

In [9]:
# Splitting the data into features (X) and target (y)
X = df.drop(['approval_status'], axis=1)  
y = df['approval_status'] 

# Performing a universal split on the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=50)

In [10]:
from sklearn.utils import resample

# Upsample minority class
X_minority_upsampled, y_minority_upsampled = resample(X_train[y_train == 0],  # Selecting minority class samples
                                                      y_train[y_train == 0],  # Selecting corresponding labels
                                                      replace=True,           # Sampling with replacement
                                                      n_samples=sum(y_train == 1),  # Number of samples from majority class
                                                      random_state=25)        # Set random state for reproducibility

# Combine majority class with upsampled minority class
X_train_upsampled = pd.concat([X_train[y_train == 1], X_minority_upsampled])
y_train_upsampled = pd.concat([y_train[y_train == 1], y_minority_upsampled])

In [11]:
X_train = X_train_upsampled
y_train = y_train_upsampled

# Part 1

# Base Line Model: BOW and TFIDF Vectorized Text Features and Cosine-Similarity Similarity Score with Logistic Regression

# Create a Unified Dictionary for Vector Representation of Text Variable

# Bag of Words Vectorization Approach

In [12]:
# Combine the abstract and related application text for vocabulary fitting
combined_text = X_train['application_abstract'] + ' ' + X_train['rel_app_text']
   
# Fit the vectorizer on the combined text
vectorizer0 = CountVectorizer()
    
# Step 1: Bag of Words Vectorization
X_train_abstract_bow = vectorizer0.fit_transform(X_train['application_abstract'])
X_train_rel_app_text_bow = vectorizer0.transform(X_train['rel_app_text'])
X_test_abstract_bow = vectorizer0.transform(X_test['application_abstract'])
X_test_rel_app_text_bow = vectorizer0.transform(X_test['rel_app_text'])

    

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
# Step 2: Calculate Cosine Similarity
cosine_similarity_train_bow = [cosine_similarity(X_train_abstract_bow[i], X_train_rel_app_text_bow[i])[0, 0] for i in range(X_train_abstract_bow.shape[0])]
cosine_similarity_test_bow = [cosine_similarity(X_test_abstract_bow[i], X_test_rel_app_text_bow[i])[0, 0] for i in range(X_test_abstract_bow.shape[0])]

# Add cosine similarity to the dataframes
X_train['cosine_similarity_bow'] = cosine_similarity_train_bow
X_test['cosine_similarity_bow'] = cosine_similarity_test_bow

print(X_train['cosine_similarity_bow'].head())
print(X_test['cosine_similarity_bow'].head())

1253    0.033113
3725    0.000000
116     0.000000
1199    0.039757
1523    0.000000
Name: cosine_similarity_bow, dtype: float64
3431    0.104524
2042    0.000000
79      0.000000
4663    0.101710
3640    0.751960
Name: cosine_similarity_bow, dtype: float64


In [14]:

  # Apply Fisher transformation
def fisher_transform(cosine_similarities_bow):
    return [0.5 * np.log((1 + sim) / (1 - sim)) for sim in cosine_similarities_bow]

fisher_cosine_similarity_train_bow = fisher_transform(cosine_similarity_train_bow)
fisher_cosine_similarity_test_bow = fisher_transform(cosine_similarity_test_bow)

# Add Fisher-transformed cosine similarity to the dataframes
X_train['fisher_cosine_similarity_bow'] = fisher_cosine_similarity_train_bow
X_test['fisher_cosine_similarity_bow'] = fisher_cosine_similarity_test_bow

print(X_train['fisher_cosine_similarity_bow'].head())
print(X_test['fisher_cosine_similarity_bow'].head())


1253    0.033125
3725    0.000000
116     0.000000
1199    0.039778
1523    0.000000
Name: fisher_cosine_similarity_bow, dtype: float64
3431    0.104907
2042    0.000000
79      0.000000
4663    0.102062
3640    0.977450
Name: fisher_cosine_similarity_bow, dtype: float64


# TFIDF Vector Representation Approach

In [15]:
# Combine the abstract and related application text for vocabulary fitting
combined_text = X_train['application_abstract'] + ' ' + X_train['rel_app_text']

# Fit the vectorizer on the combined text
vectorizer1 = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)
vectorizer1.fit(combined_text)

# Transform the abstract and related application text separately using the fitted vectorizer
X_train_abstract_tfidf = vectorizer1.transform(X_train['application_abstract'])
X_test_abstract_tfidf = vectorizer1.transform(X_test['application_abstract'])
X_train_rel_app_text_tfidf = vectorizer1.transform(X_train['rel_app_text'])
X_test_rel_app_text_tfidf = vectorizer1.transform(X_test['rel_app_text'])


# Base Line Model: TFIDF Vectorized Text Features and Cosine-Similarity Similarity Score with Logistic Regression

## Calculate Cosine Similarity

In [16]:
# Calculate cosine similarity for training data
cosine_similarity_train_tfidf = [cosine_similarity(X_train_abstract_tfidf[i], X_train_rel_app_text_tfidf[i])[0, 0] for i in range(X_train_abstract_tfidf.shape[0])]

# Calculate cosine similarity for test data
cosine_similarity_test_tfidf = [cosine_similarity(X_test_abstract_tfidf[i], X_test_rel_app_text_tfidf[i])[0, 0] for i in range(X_test_abstract_tfidf.shape[0])]

# Add cosine similarity to the dataframes
X_train['cosine_similarity_tfidf'] = cosine_similarity_train_tfidf
X_test['cosine_similarity_tfidf'] = cosine_similarity_test_tfidf

print(X_train['cosine_similarity_tfidf'].head())
print(X_test['cosine_similarity_tfidf'].head())

1253    0.006685
3725    0.000000
116     0.000000
1199    0.003859
1523    0.000000
Name: cosine_similarity_tfidf, dtype: float64
3431    0.005460
2042    0.000000
79      0.000000
4663    0.129988
3640    0.708633
Name: cosine_similarity_tfidf, dtype: float64


In [17]:
# Apply Fisher transformation to cosine similarities
def fisher_transform(cosine_similarities_tfidf):
    return [0.5 * np.log((1 + sim) / (1 - sim)) for sim in cosine_similarities_tfidf]

fisher_cosine_similarity_train_tfidf = fisher_transform(cosine_similarity_train_tfidf)
fisher_cosine_similarity_test_tfidf= fisher_transform(cosine_similarity_test_tfidf)

# Add Fisher-transformed cosine similarity to the dataframes
X_train['fisher_cosine_similarity_tfidf'] = fisher_cosine_similarity_train_tfidf
X_test['fisher_cosine_similarity_tfidf'] = fisher_cosine_similarity_test_tfidf

print(X_train['fisher_cosine_similarity_tfidf'].head())
print(X_test['fisher_cosine_similarity_tfidf'].head())


1253    0.006685
3725    0.000000
116     0.000000
1199    0.003859
1523    0.000000
Name: fisher_cosine_similarity_tfidf, dtype: float64
3431    0.005460
2042    0.000000
79      0.000000
4663    0.130728
3640    0.884433
Name: fisher_cosine_similarity_tfidf, dtype: float64


# Part 2: Modelling 

# Define Model Features

In [18]:
# Model 0: BOW Abstract as the only feature
X_train0 = X_train_abstract_bow.toarray()
X_test0 = X_test_abstract_bow.toarray()

# Model 1: Combine BOW features from Abstract and Related Application Text
X_train1 = np.hstack((X_train_abstract_bow.toarray(), X_train_rel_app_text_bow.toarray()))
X_test1 = np.hstack((X_test_abstract_bow.toarray(), X_test_rel_app_text_bow.toarray()))

# Model 2: BOW abstract, BOW Related Application text, and Cosine Similarity
# Combine TF-IDF features with cosine similarity
X_train2 = np.hstack((X_train_abstract_bow.toarray(), X_train_rel_app_text_bow.toarray(), np.array(cosine_similarity_train_bow).reshape(-1, 1)))
X_test2 = np.hstack((X_test_abstract_bow.toarray(), X_test_rel_app_text_bow.toarray(), np.array(cosine_similarity_test_bow).reshape(-1, 1)))

# Model 3: BOW abstract, BOW Related Application text, and Fisher Cosine Similarity
# I just choose Fisher Transformation Randomly because im an Economist, duh!
X_train3 = np.hstack((X_train_abstract_bow.toarray(), X_train_rel_app_text_bow.toarray(), np.array(fisher_cosine_similarity_train_bow).reshape(-1, 1)))
X_test3 = np.hstack((X_test_abstract_bow.toarray(), X_test_rel_app_text_bow.toarray(), np.array(fisher_cosine_similarity_test_bow).reshape(-1, 1)))

# Model 4: TFIDF Abstract as the only feature
X_train4 = X_train_abstract_tfidf.toarray()
X_test4 = X_test_abstract_tfidf.toarray()

# Model 5: Combine TF-IDF features from Abstract and Related Application Text
X_train5 = np.hstack((X_train_abstract_tfidf.toarray(), X_train_rel_app_text_tfidf.toarray()))
X_test5 = np.hstack((X_test_abstract_tfidf.toarray(), X_test_rel_app_text_tfidf.toarray()))

# Model 6: TFIDF abstract, TFIDF Related Application text, and Cosine Similarity
# Combine TF-IDF features with cosine similarity
X_train6 = np.hstack((X_train_abstract_tfidf.toarray(), X_train_rel_app_text_tfidf.toarray(), np.array(cosine_similarity_train_tfidf).reshape(-1, 1)))
X_test6 = np.hstack((X_test_abstract_tfidf.toarray(), X_test_rel_app_text_tfidf.toarray(), np.array(cosine_similarity_test_tfidf).reshape(-1, 1)))

# Model 7: TFIDF abstract, TFIDF Related Application text, and Fisher Cosine Similarity
# I just choose Fisher Transformation Randomly because im an Economist, duh!
X_train7 = np.hstack((X_train_abstract_tfidf.toarray(), X_train_rel_app_text_tfidf.toarray(), np.array(fisher_cosine_similarity_train_tfidf).reshape(-1, 1)))
X_test7 = np.hstack((X_test_abstract_tfidf.toarray(), X_test_rel_app_text_tfidf.toarray(), np.array(fisher_cosine_similarity_test_tfidf).reshape(-1, 1)))


# Logistic Regression Classifier

In [19]:
def train_logistic_regression_classifier(X_train, X_test, y_train, y_test):
    # Define Logistic Regression Model and Parameter Grid
    model = LogisticRegression()
    param_grid = {
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear'],
        'max_iter': [100, 200, 300]
    }

    # Perform GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Get the Best Model and Evaluate
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Evaluate the Model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)

    return best_model, accuracy, report

# Example usage
# X_train, X_test, y_train, y_test should be defined with your actual data
# best_model, accuracy, report = train_logistic_regression_classifier(X_train, X_test, y_train, y_test)


In [20]:
best_modelL0, accuracyL0, reportL0 = train_logistic_regression_classifier(X_train0, X_test0, y_train, y_test)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Parameters: {'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.571
Classification Report:
              precision    recall  f1-score   support

           0       0.37      0.39      0.38       669
           1       0.68      0.66      0.67      1331

    accuracy                           0.57      2000
   macro avg       0.53      0.53      0.53      2000
weighted avg       0.58      0.57      0.57      2000



In [21]:
best_modelL1, accuracyL1, reportL1 = train_logistic_regression_classifier(X_train1, X_test1, y_train, y_test)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Parameters: {'C': 1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.578
Classification Report:
              precision    recall  f1-score   support

           0       0.37      0.39      0.38       669
           1       0.69      0.67      0.68      1331

    accuracy                           0.58      2000
   macro avg       0.53      0.53      0.53      2000
weighted avg       0.58      0.58      0.58      2000



In [22]:
best_modelL2, accuracyL2, reportL2 = train_logistic_regression_classifier(X_train2, X_test2, y_train, y_test)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Parameters: {'C': 1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.5775
Classification Report:
              precision    recall  f1-score   support

           0       0.37      0.39      0.38       669
           1       0.69      0.67      0.68      1331

    accuracy                           0.58      2000
   macro avg       0.53      0.53      0.53      2000
weighted avg       0.58      0.58      0.58      2000



In [23]:
best_modelL3, accuracyL3, reportL3 = train_logistic_regression_classifier(X_train3, X_test3, y_train, y_test)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Parameters: {'C': 1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.5775
Classification Report:
              precision    recall  f1-score   support

           0       0.37      0.39      0.38       669
           1       0.69      0.67      0.68      1331

    accuracy                           0.58      2000
   macro avg       0.53      0.53      0.53      2000
weighted avg       0.58      0.58      0.58      2000



In [24]:
best_modelL4, accuracyL4, reportL4 = train_logistic_regression_classifier(X_train4, X_test4, y_train, y_test)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Parameters: {'C': 100, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.6175
Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.31      0.35       669
           1       0.69      0.77      0.73      1331

    accuracy                           0.62      2000
   macro avg       0.55      0.54      0.54      2000
weighted avg       0.60      0.62      0.60      2000



In [25]:
best_modelL5, accuracyL5, reportL5 = train_logistic_regression_classifier(X_train5, X_test5, y_train, y_test)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Parameters: {'C': 100, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.622
Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.33      0.37       669
           1       0.70      0.77      0.73      1331

    accuracy                           0.62      2000
   macro avg       0.56      0.55      0.55      2000
weighted avg       0.60      0.62      0.61      2000



In [26]:
best_modelL6, accuracyL6, reportL6 = train_logistic_regression_classifier(X_train6, X_test6, y_train, y_test)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Parameters: {'C': 100, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.62
Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.33      0.37       669
           1       0.70      0.76      0.73      1331

    accuracy                           0.62      2000
   macro avg       0.56      0.55      0.55      2000
weighted avg       0.60      0.62      0.61      2000



In [27]:
best_modelL7, accuracyL7, reportL7 = train_logistic_regression_classifier(X_train7, X_test7, y_train, y_test)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Parameters: {'C': 100, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.6205
Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.33      0.37       669
           1       0.70      0.76      0.73      1331

    accuracy                           0.62      2000
   macro avg       0.56      0.55      0.55      2000
weighted avg       0.60      0.62      0.61      2000



# Random Forest Classifier Implementation

In [28]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import randint

def train_random_forest_classifier(X_train, X_test, y_train, y_test):
    
    # Define Random Forest Model and Parameter Grid
    model = RandomForestClassifier(random_state=42)
    param_dist = {
        'n_estimators': randint(10, 25, 50),
        'max_depth': randint(5, 10, 15),
        'min_samples_split': randint(2, 3, 6),
        'min_samples_leaf': randint(1, 2, 3)
    }

    # Cross-validation strategy
    cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    
    # Perform RandomizedSearchCV
    random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, 
                                       n_iter=50, cv=cv_strategy, scoring='accuracy', 
                                       verbose=2, n_jobs=-1, random_state=42)
    random_search.fit(X_train, y_train)

    # Get the Best Model and Evaluate
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Evaluate the Model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print(f"Best Parameters: {random_search.best_params_}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)
    
    return best_model, accuracy, report

# Example usage
# best_model, accuracy, report = train_random_forest_classifier(X_train, X_test, y_train, y_test)


In [29]:
# Random Forest Model 0
best_modelR0, accuracyR0, reportR0 = train_random_forest_classifier(X_train0, X_test0, y_train, y_test)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Parameters: {'max_depth': 24, 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 64}
Accuracy: 0.602
Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.44      0.42       669
           1       0.71      0.68      0.70      1331

    accuracy                           0.60      2000
   macro avg       0.56      0.56      0.56      2000
weighted avg       0.61      0.60      0.60      2000



In [30]:
# Random Forest Model 1
best_modelR1, accuracyR1, reportR1 = train_random_forest_classifier(X_train1, X_test1, y_train, y_test)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Parameters: {'max_depth': 23, 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 72}
Accuracy: 0.615
Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.43      0.43       669
           1       0.71      0.71      0.71      1331

    accuracy                           0.61      2000
   macro avg       0.57      0.57      0.57      2000
weighted avg       0.62      0.61      0.62      2000



In [31]:
# Random Forest Model 2
best_modelR2, accuracyR2, reportR2 = train_random_forest_classifier(X_train2, X_test2, y_train, y_test)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Parameters: {'max_depth': 23, 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 74}
Accuracy: 0.6015
Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.39      0.40       669
           1       0.70      0.71      0.70      1331

    accuracy                           0.60      2000
   macro avg       0.55      0.55      0.55      2000
weighted avg       0.60      0.60      0.60      2000



In [32]:
# Random Forest Model 0
best_modelR3, accuracyR3, reportR3 = train_random_forest_classifier(X_train3, X_test3, y_train, y_test)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Parameters: {'max_depth': 23, 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 74}
Accuracy: 0.6015
Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.39      0.40       669
           1       0.70      0.71      0.70      1331

    accuracy                           0.60      2000
   macro avg       0.55      0.55      0.55      2000
weighted avg       0.60      0.60      0.60      2000



In [33]:
best_modelR4, accuracyR4, reportR4 = train_random_forest_classifier(X_train4, X_test4, y_train, y_test)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Parameters: {'max_depth': 23, 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 66}
Accuracy: 0.614
Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.35      0.38       669
           1       0.70      0.75      0.72      1331

    accuracy                           0.61      2000
   macro avg       0.55      0.55      0.55      2000
weighted avg       0.60      0.61      0.61      2000



In [34]:
best_modelR5, accuracyR5, reportR5 = train_random_forest_classifier(X_train5, X_test5, y_train, y_test)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Parameters: {'max_depth': 23, 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 73}
Accuracy: 0.6205
Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.35      0.38       669
           1       0.70      0.76      0.73      1331

    accuracy                           0.62      2000
   macro avg       0.56      0.55      0.55      2000
weighted avg       0.61      0.62      0.61      2000



In [37]:
best_modelR6, accuracyR6, reportR6 = train_random_forest_classifier(X_train6, X_test6, y_train, y_test)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Parameters: {'max_depth': 24, 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 61}
Accuracy: 0.614
Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.31      0.35       669
           1       0.69      0.77      0.73      1331

    accuracy                           0.61      2000
   macro avg       0.54      0.54      0.54      2000
weighted avg       0.59      0.61      0.60      2000



In [36]:
best_modelR7, accuracyR7, reportR7 = train_random_forest_classifier(X_train7, X_test7, y_train, y_test)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Parameters: {'max_depth': 24, 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 61}
Accuracy: 0.614
Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.31      0.35       669
           1       0.69      0.77      0.73      1331

    accuracy                           0.61      2000
   macro avg       0.54      0.54      0.54      2000
weighted avg       0.59      0.61      0.60      2000



# XGBoost Implementation

In [19]:
!pip install xgboost



In [20]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import randint, uniform

def train_xgboost_classifier(X_train, X_test, y_train, y_test):
    
    # Define XGBoost Model and Parameter Distribution
    model = XGBClassifier(eval_metric='mlogloss', random_state=42)
    param_dist = {
        'n_estimators': randint(45, 75),
        'max_depth': randint(3, 10),
        'learning_rate': uniform(0.01, 0.2),
        'subsample': uniform(0.3, 0.6),
        'colsample_bytree': uniform(0.3, 0.6),
        'min_child_weight': randint(1, 10)
    }

    # Cross-validation strategy
    cv_strategy = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    
    # Perform RandomizedSearchCV
    random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, 
                                       n_iter=50, cv=cv_strategy, scoring='accuracy', 
                                       verbose=2, n_jobs=-1, random_state=42)
    random_search.fit(X_train, y_train)

    # Get the Best Model and Evaluate
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)

     # Evaluate the Model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"Best Parameters: {random_search.best_params_}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)
    
    return best_model, accuracy, report


In [40]:
best_modelX0, accuracyX0, reportX0 = train_xgboost_classifier(X_train0, X_test0, y_train, y_test)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
Best Parameters: {'colsample_bytree': 0.30795897669591993, 'learning_rate': 0.19844035113697056, 'max_depth': 8, 'min_child_weight': 2, 'n_estimators': 70, 'subsample': 0.4827682615040224}
Accuracy: 0.5985
Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.45      0.43       669
           1       0.71      0.67      0.69      1331

    accuracy                           0.60      2000
   macro avg       0.56      0.56      0.56      2000
weighted avg       0.61      0.60      0.60      2000



In [41]:
best_modelX1, accuracyX1, reportX1 = train_xgboost_classifier(X_train1, X_test1, y_train, y_test)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
Best Parameters: {'colsample_bytree': 0.4218367348408616, 'learning_rate': 0.1985707141115962, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 66, 'subsample': 0.8282807034091546}
Accuracy: 0.5975
Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.42      0.41       669
           1       0.70      0.69      0.69      1331

    accuracy                           0.60      2000
   macro avg       0.55      0.55      0.55      2000
weighted avg       0.60      0.60      0.60      2000



In [42]:
best_modelX2, accuracyX2, reportX2 = train_xgboost_classifier(X_train2, X_test2, y_train, y_test)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
Best Parameters: {'colsample_bytree': 0.6642205486120107, 'learning_rate': 0.06519983640450867, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 69, 'subsample': 0.8921321619603104}
Accuracy: 0.59
Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.47      0.43       669
           1       0.71      0.65      0.68      1331

    accuracy                           0.59      2000
   macro avg       0.56      0.56      0.56      2000
weighted avg       0.61      0.59      0.60      2000



In [43]:
best_modelX3, accuracyX3, reportX3 = train_xgboost_classifier(X_train3, X_test3, y_train, y_test)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
Best Parameters: {'colsample_bytree': 0.6642205486120107, 'learning_rate': 0.06519983640450867, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 69, 'subsample': 0.8921321619603104}
Accuracy: 0.59
Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.47      0.43       669
           1       0.71      0.65      0.68      1331

    accuracy                           0.59      2000
   macro avg       0.56      0.56      0.56      2000
weighted avg       0.61      0.59      0.60      2000



In [44]:
best_modelX4, accuracyX4, reportX4 = train_xgboost_classifier(X_train4, X_test4, y_train, y_test)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
Best Parameters: {'colsample_bytree': 0.6642205486120107, 'learning_rate': 0.06519983640450867, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 69, 'subsample': 0.8921321619603104}
Accuracy: 0.588
Classification Report:
              precision    recall  f1-score   support

           0       0.39      0.39      0.39       669
           1       0.69      0.69      0.69      1331

    accuracy                           0.59      2000
   macro avg       0.54      0.54      0.54      2000
weighted avg       0.59      0.59      0.59      2000



In [21]:
best_modelX5, accuracyX5, reportX5 = train_xgboost_classifier(X_train5, X_test5, y_train, y_test)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
Best Parameters: {'colsample_bytree': 0.4218367348408616, 'learning_rate': 0.1985707141115962, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 66, 'subsample': 0.8282807034091546}
Accuracy: 0.6165
Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.32      0.36       669
           1       0.69      0.77      0.73      1331

    accuracy                           0.62      2000
   macro avg       0.55      0.54      0.54      2000
weighted avg       0.60      0.62      0.60      2000



In [22]:
best_modelX6, accuracyX6, reportX6 = train_xgboost_classifier(X_train6, X_test6, y_train, y_test)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
Best Parameters: {'colsample_bytree': 0.4218367348408616, 'learning_rate': 0.1985707141115962, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 66, 'subsample': 0.8282807034091546}
Accuracy: 0.619
Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.33      0.37       669
           1       0.69      0.76      0.73      1331

    accuracy                           0.62      2000
   macro avg       0.55      0.55      0.55      2000
weighted avg       0.60      0.62      0.61      2000



In [23]:
best_modelX7, accuracyX7, reportX7 = train_xgboost_classifier(X_train7, X_test7, y_train, y_test)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
Best Parameters: {'colsample_bytree': 0.4218367348408616, 'learning_rate': 0.1985707141115962, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 66, 'subsample': 0.8282807034091546}
Accuracy: 0.619
Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.33      0.37       669
           1       0.69      0.76      0.73      1331

    accuracy                           0.62      2000
   macro avg       0.55      0.55      0.55      2000
weighted avg       0.60      0.62      0.61      2000



# Support Vector Macine (SVM)

In [33]:
from sklearn.svm import SVC

def train_svm_classifier(X_train, X_test, y_train, y_test):
    # Set up logging
    
    # Define SVM Model and Parameter Distribution
    model = SVC(probability=True, random_state=42)
    param_dist = {
        'C': uniform(0.1, 10),
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'gamma': ['scale', 'auto'],
        'degree': randint(2, 5)  # Used only if kernel='poly'
    }

    # Cross-validation strategy
    cv_strategy = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    
    # Perform RandomizedSearchCV
    random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, 
                                       n_iter=50, cv=cv_strategy, scoring='accuracy', 
                                       verbose=2, n_jobs=-1, random_state=42)
    random_search.fit(X_train, y_train)

    # Get the Best Model and Evaluate
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)

   # Evaluate the Model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print(f"Best Parameters: {random_search.best_params_}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)
    
    return best_model, accuracy, report

# Example usage
# best_model, accuracy, report, confusion = train_svm_classifier(X_train, X_test, y_train, y_test)

In [None]:
best_modelS0, accuracyS0, reportS0 = train_svm_classifier(X_train0, X_test0, y_train, y_test)

Fitting 2 folds for each of 50 candidates, totalling 100 fits


In [None]:
best_modelS1, accuracyS1, reportS1 = train_svm_classifier(X_train1, X_test1, y_train, y_test)

In [None]:
best_modelS2, accuracyS2, reportS2 = train_svm_classifier(X_train2, X_test2, y_train, y_test)

In [None]:
best_modelS3, accuracyS3, reportS3 = train_svm_classifier(X_train3, X_test3, y_train, y_test)

In [None]:
best_modelS4, accuracyS4, reportS4 = train_svm_classifier(X_train4, X_test4, y_train, y_test)

In [None]:
best_modelS5, accuracyS5, reportS5 = train_svm_classifier(X_train5, X_test5, y_train, y_test)

In [None]:
best_modelS6, accuracyS6, reportS6 = train_svm_classifier(X_train6, X_test6, y_train, y_test)

In [None]:
best_modelS7, accuracyS7, reportS7 = train_svm_classifier(X_train7, X_test7, y_train, y_test)

# Part 2: TFIDF Vectorized Text and LDA Similarity Score with Logistic Regression 

# Feature Engineering Similarity Score using LDA

# Tune and Train LDA Model based on Unifed Corpus for both Abstract and Related Application Text

In [24]:
from gensim import corpora
from gensim.models import CoherenceModel, LdaModel

def tune_lda_model(texts, topic_range, passes=15):
    """
    Tune LDA model by selecting the number of topics with the best coherence score.

    Parameters:
    texts (list of list of str): List of tokenized documents
    topic_range (list of int): List of topic numbers to try
    passes (int): Number of passes through the corpus during training

    Returns:
    best_lda_model (gensim.models.LdaModel): Best LDA model
    best_num_topics (int): Number of topics in the best model
    best_coherence (float): Coherence score of the best model
    """
    # Create a dictionary and corpus from the tokenized texts
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    best_lda_model = None
    best_num_topics = 0
    best_coherence = -1
    
    # Iterate over the topic range to find the best number of topics
    for num_topics in topic_range:
        # Train LDA model with the current number of topics
        lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes)
        
        # Calculate coherence score for the model
        coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_score = coherence_model.get_coherence()
        
        # Update the best model if the current model has a better coherence score
        if coherence_score > best_coherence:
            best_coherence = coherence_score
            best_num_topics = num_topics
            best_lda_model = lda_model
            
    return best_lda_model, best_num_topics, best_coherence


In [25]:
# This is different from the previously implemented preprocess function,
# This one does not join the tokens after setemming
def preprocess_text1(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove punctuation and convert to lowercase
    tokens = [word.lower() for word in tokens if word.isalpha()]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    return tokens


In [26]:

# Apply preprocessing to application_abstract column
df['application_abstract'] = df['application_abstract'].apply(preprocess_text1)

# Apply preprocessing to rel_app_text column
df['rel_app_text'] = df['rel_app_text'].apply(preprocess_text1)

# Combine the abstract and related application text for vocabulary fitting
combined_text = df['application_abstract'] + df['rel_app_text']

# Tokenize each document in combined_text
tokenized_combined_text = combined_text.tolist()

In [32]:
tokenized_unified_corpus = tokenized_combined_text
topic_range = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
passes = 20

tune_lda_model(tokenized_unified_corpus, topic_range, passes=passes) 


[CV] END .C=0.01, max_iter=100, penalty=l1, solver=liblinear; total time=   1.5s
[CV] END .C=0.01, max_iter=100, penalty=l2, solver=liblinear; total time=   0.4s
[CV] END .C=0.01, max_iter=200, penalty=l2, solver=liblinear; total time=   0.5s
[CV] END .C=0.01, max_iter=300, penalty=l2, solver=liblinear; total time=   0.6s
[CV] END ..C=0.1, max_iter=100, penalty=l1, solver=liblinear; total time=   0.5s
[CV] END ..C=0.1, max_iter=200, penalty=l1, solver=liblinear; total time=   0.5s
[CV] END ..C=0.1, max_iter=200, penalty=l2, solver=liblinear; total time=   0.5s
[CV] END ..C=0.1, max_iter=300, penalty=l2, solver=liblinear; total time=   0.6s
[CV] END ....C=1, max_iter=100, penalty=l1, solver=liblinear; total time=   0.7s
[CV] END ....C=1, max_iter=200, penalty=l1, solver=liblinear; total time=   0.5s
[CV] END ....C=1, max_iter=300, penalty=l1, solver=liblinear; total time=   0.6s
[CV] END ....C=1, max_iter=300, penalty=l2, solver=liblinear; total time=   0.6s
[CV] END ...C=10, max_iter=1

(<gensim.models.ldamodel.LdaModel at 0x3349f2310>, 5, 0.5511164061267257)

# Define a function that extract the LDA topic distribution

In [28]:
def lda_topic_distribution(documents1, documents2, num_topics, passes=20):
    """
    Generate LDA topic distributions for two sets of documents using a unified dictionary.

    Parameters:
    documents1 (list of str): List of documents for the first text variable (application_abstact)
    documents2 (list of str): List of documents for the second text variable (rel_app_text)
    num_topics (int): Number of topics for the LDA model
    passes (int): Number of passes through the corpus during training

    Returns:
    lda_model (gensim.models.LdaModel): LDA model
    dictionary (gensim.corpora.Dictionary): Dictionary of the combined corpus
    corpus1 (list of list of (int, int)): Corpus for documents1 in bag-of-words format
    corpus2 (list of list of (int, int)): Corpus for documents2 in bag-of-words format
    doc_topic_matrix1 (numpy.ndarray): Dense matrix of topic distributions for documents1
    doc_topic_matrix2 (numpy.ndarray): Dense matrix of topic distributions for documents2
    """
    # Preprocess documents
    documents1 = [preprocess_text1(doc) for doc in documents1]
    documents2 = [preprocess_text1(doc) for doc in documents2]

    # Combine the documents for creating a unified dictionary
    unified_dictionary = documents1 + documents2

    # Create a dictionary and corpus from the combined texts
    dictionary = corpora.Dictionary(unified_dictionary)
    corpus_combined = [dictionary.doc2bow(text) for text in unified_dictionary]

    # Transform each document into the bag-of-words format using the unified dictionary
    corpus1 = [dictionary.doc2bow(text) for text in documents1]
    corpus2 = [dictionary.doc2bow(text) for text in documents2]

    # Train LDA model with the specified number of topics
    lda_model = LdaModel(corpus_combined, num_topics=num_topics, id2word=dictionary, passes=passes)

    # Get topic distributions for each document
    doc_topics1 = [lda_model.get_document_topics(bow, minimum_probability=0) for bow in corpus1]
    doc_topics2 = [lda_model.get_document_topics(bow, minimum_probability=0) for bow in corpus2]

    # Convert topic distributions to dense matrices
    num_terms = num_topics
    doc_topic_matrix1 = np.zeros((len(documents1), num_terms))
    for i, doc in enumerate(doc_topics1):
        for topic_num, prob in doc:
            doc_topic_matrix1[i, topic_num] = prob

    doc_topic_matrix2 = np.zeros((len(documents2), num_terms))
    for i, doc in enumerate(doc_topics2):
        for topic_num, prob in doc:
            doc_topic_matrix2[i, topic_num] = prob

    return lda_model, dictionary, corpus1, corpus2, doc_topic_matrix1, doc_topic_matrix2


In [29]:
# Call the lda_topic_distribution function on the train and 
X_train_abstract = X_train['application_abstract']
X_test_abstract = X_test['application_abstract']
X_train_rel_app_text = X_train['rel_app_text']
X_test_rel_app_text = X_test['rel_app_text']

# Define the number of topics and passes
num_topics = 5
passes = 20

# Call lda_topic_distribution for training set
lda_model_train, dictionary_train, corpus1_train, corpus2_train, train_abstract_topic_matrix1, train_rel_app_text_topic_matrix2 = lda_topic_distribution(X_train_abstract, X_train_rel_app_text, num_topics, passes)

# Save the topic matrices for the training set
np.save('train_abstract_topic_matrix1.npy',train_abstract_topic_matrix1 )
np.save('train_rel_app_text_topic_matrix2.npy', train_rel_app_text_topic_matrix2)

# Call lda_topic_distribution for testing set
lda_model_test, dictionary_test, corpus1_test, corpus2_test, test_abstract_topic_matrix1, test_rel_app_texttopic_matrix2 = lda_topic_distribution(X_test_abstract, X_test_rel_app_text, num_topics, passes)

# Save the topic matrices for the testing set
np.save('test_abstract_topic_matrix1.npy', test_abstract_topic_matrix1)
np.save('test_rel_app_texttopic_matrix2.npy', test_rel_app_texttopic_matrix2)

print("Training Set - Abstract Topic Matrix 1:\n", train_abstract_topic_matrix1 )
print("Training Set - Abstract Topic Matrix 2:\n", train_rel_app_text_topic_matrix2)
print("Testing Set - Related Document Topic Matrix 1:\n", test_abstract_topic_matrix1)
print("Testing Set - Related Document Topic Matrix 2:\n", test_rel_app_texttopic_matrix2)


Training Set - Abstract Topic Matrix 1:
 [[0.00233762 0.00233482 0.04670783 0.94627637 0.00234336]
 [0.11893915 0.34684557 0.00322854 0.52767819 0.00330857]
 [0.33140531 0.00220492 0.00214086 0.24563509 0.41861382]
 ...
 [0.00258584 0.19112583 0.00260732 0.62884307 0.17483792]
 [0.0033356  0.08346131 0.00332106 0.83913767 0.07074437]
 [0.74918604 0.00812212 0.00806491 0.2263454  0.00828149]]
Training Set - Abstract Topic Matrix 2:
 [[0.00714521 0.00715331 0.97140872 0.00714733 0.00714542]
 [0.0250001  0.02501273 0.89995837 0.02500156 0.02502721]
 [0.01055222 0.01053963 0.9578371  0.01053708 0.010534  ]
 ...
 [0.01429315 0.01429741 0.94283158 0.01428931 0.01428854]
 [0.00870284 0.00871434 0.96518445 0.00869831 0.0087001 ]
 [0.00833515 0.00833537 0.96665937 0.00833455 0.00833558]]
Testing Set - Related Document Topic Matrix 1:
 [[0.82592517 0.00248983 0.13519804 0.00243863 0.03394832]
 [0.16339615 0.40428215 0.0059554  0.42043769 0.00592861]
 [0.86302471 0.00217689 0.00217067 0.13046749 

# Calculate Cosine Similarity

In [30]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Calculate cosine similarity for training data
lda_cosine_similarity_train = [
    cosine_similarity(
        train_abstract_topic_matrix1[i].reshape(1, -1),
        train_rel_app_text_topic_matrix2[i].reshape(1, -1)
    )[0, 0]
    for i in range(train_abstract_topic_matrix1.shape[0])
]

# Calculate cosine similarity for test data
lda_cosine_similarity_test = [
    cosine_similarity(
        test_abstract_topic_matrix1[i].reshape(1, -1),
        test_rel_app_texttopic_matrix2[i].reshape(1, -1)
    )[0, 0]
    for i in range(test_abstract_topic_matrix1.shape[0])
]

# Add cosine similarity to the dataframes
X_train['lda_cosine_similarity'] = lda_cosine_similarity_train
X_test['lda_cosine_similarity'] = lda_cosine_similarity_test

print(X_train['lda_cosine_similarity'].head())
print(X_test['lda_cosine_similarity'].head())


1253    0.056696
3725    0.048050
116     0.022322
1199    0.014405
1523    0.014921
Name: lda_cosine_similarity, dtype: float64
3431    0.054762
2042    0.032899
79      0.015785
4663    0.009433
3640    0.999941
Name: lda_cosine_similarity, dtype: float64


# Finally, We run a Baseline Logistic Regression Model

In [31]:
# Combine TF-IDF features with lda cosine similarity
X_train_combined_lda = np.hstack((X_train_abstract_tfidf.toarray(), np.array(lda_cosine_similarity_train).reshape(-1, 1)))
X_test_combined_lda = np.hstack((X_test_abstract_tfidf.toarray(), np.array(lda_cosine_similarity_test).reshape(-1, 1)))

# Step 5: Define the model and parameter grid for GridSearch
model = LogisticRegression()
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear'],
    'max_iter': [100, 200, 300]
}

# Step 6: Perform GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train_combined_lda, y_train)

# Step 7: Get the best model and evaluate
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_combined_lda)

# Step 8: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Parameters: {'C': 100, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.619
Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.32      0.36       669
           1       0.69      0.77      0.73      1331

    accuracy                           0.62      2000
   macro avg       0.55      0.54      0.54      2000
weighted avg       0.60      0.62      0.60      2000

