In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score

import numpy as np

from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read the train / validation / test splits from CSV files in the working directory.
train_data, val_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'val.csv', 'test.csv')
)

# Number of rows in the test split; used at the end of the notebook to turn the
# accuracy fraction back into a count.
lengthTestData = test_data.shape[0]

In [3]:
# Stack the train and validation text columns into a single Series with a fresh
# 0..n-1 index; the test text is kept separate.
text = pd.concat(
    [split['text'] for split in (train_data, val_data)],
    ignore_index=True,
)
text_test = test_data['text']

In [4]:
# Shared resources read by preprocess_text: a WordNet lemmatizer and the English
# stopword list held as a set for membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [5]:
def preprocess_text(text):
    """Normalize one raw document into a space-separated string of lemmas.

    The text is lowercased and tokenized with ``word_tokenize``; tokens found in
    the module-level ``stop_words`` set are dropped and the remaining tokens are
    passed through the module-level ``lemmatizer``.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example the
            float NaN pandas uses for an empty CSV cell, or ``None``) is treated
            as an empty document.

    Returns:
        The surviving lemmatized tokens joined by single spaces; ``''`` when the
        input is missing or not a string.
    """
    if not isinstance(text, str):
        # Missing values have no .lower(); return an empty document instead of
        # raising AttributeError in the middle of Series.apply.
        return ''

    tokens = word_tokenize(text.lower())  # Tokenization and lowercase
    # Remove stopwords and lemmatize what is left
    kept_lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(kept_lemmas)

In [6]:
# Clean every train+validation document with preprocess_text; the result is a
# Series of cleaned strings aligned with `text`.
text_preprocessed = text.apply(preprocess_text)

In [7]:
# Print the cleaned corpus (pandas shows a truncated view) to eyeball the effect
# of the preprocessing step.
print(text_preprocessed)

0        beirut ( reuters ) - iran military chief met s...
1        hanoi ( reuters ) - top u.s. envoy began two-d...
2        ( reuters ) - four u.s. senator asked senate j...
3        first read morning briefing meet press nbc pol...
4        cairo ( reuters ) - six month egypt election ,...
                               ...                        
54714    lack oversight prof donald trump totally unfit...
54715    tucker carlson responded espn anchor calling p...
54716    getting something nothing rage president profe...
54717    black emanuelle fixed 1976. attila speaking eu...
54718    chaos broke legal american illegal alien clash...
Name: text, Length: 54719, dtype: object


In [8]:
# Print the raw corpus for comparison with the cleaned version.
print(text)

0        beirut (reuters) - iran s military chief met w...
1        hanoi (reuters) - a top u.s. envoy began a two...
2        (reuters) - four u.s. senators have asked the ...
3        first read is a morning briefing from meet the...
4        cairo (reuters) - six months before egypt s el...
                               ...                        
54714    this lack of oversight proves that donald trum...
54715    tucker carlson responded to an espn anchor cal...
54716    because getting something for nothing is all t...
54717    black emanuelle fixed all that in 1976. attila...
54718    chaos broke out after legal americans and ille...
Name: text, Length: 54719, dtype: object


In [9]:
# First pass: a TF-IDF vectorizer with default settings (no max_features cap),
# used below to measure the size of the full vocabulary.
tfidf_vectorizer = TfidfVectorizer()

In [10]:
# Fit the vectorizer on the cleaned train+validation corpus and build the
# corresponding TF-IDF document-term matrix in one step.
tfidf_matrix = tfidf_vectorizer.fit_transform(text_preprocessed)

In [11]:
# Size of the vocabulary learned by the fitted vectorizer (one entry per distinct term).
num_unique_tokens = len(tfidf_vectorizer.vocabulary_)
print(num_unique_tokens)

169079


In [12]:
# Re-create the TF-IDF vectorizer with max_features set to the vocabulary size measured above.
# NOTE(review): capping at the full vocabulary size looks like it prunes nothing, so this
# vectorizer should behave like the default one — confirm whether a smaller cap was intended.
tfidf_vectorizer = TfidfVectorizer(max_features=num_unique_tokens)

In [13]:
# Fit the re-initialized vectorizer and rebuild the sparse TF-IDF document-term matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(text_preprocessed)

# Convert the TF-IDF matrix to a CSR (Compressed Sparse Row) matrix for efficient row-wise operations
csr_tfidf_matrix = csr_matrix(tfidf_matrix)

# Find the row index with the maximum number of filled values
max_features_row_index = csr_tfidf_matrix.getnnz(axis=1).argmax()

# Get the number of features in the document with the most filled values;
# this count is used as the dimensionality of the reduced space.
max_features = csr_tfidf_matrix[max_features_row_index].count_nonzero()

# TruncatedSVD uses a randomized solver by default, so the seed is fixed (same value
# as the KFold seed used for cross-validation) to make the reduced features, and
# every score computed from them, repeatable across runs.
svd = TruncatedSVD(n_components=int(max_features), random_state=42)

# From here on `tfidf_matrix` holds the dense SVD projection, not the sparse TF-IDF matrix.
tfidf_matrix = svd.fit_transform(tfidf_matrix)

In [14]:
#dense_tfidf_matrix = tfidf_matrix[:len(train_data)]
#dense_val_tfidf_matrix = tfidf_matrix[len(train_data):len(train_data) + len(val_data)]

# Training and validation rows are used together as one larger training set;
# the slice end is the combined row count of the two splits.
dense_tfidf_matrix = tfidf_matrix[:train_data.shape[0] + val_data.shape[0]]

In [15]:
# Pull the label column of each split out as an array.
train_labels = train_data['label'].values
val_labels = val_data['label'].values

# Labels for the merged set, ordered training rows first and validation rows second.
dense_labels = np.concatenate([train_labels, val_labels])

In [16]:
# Build the (still unfitted) L2-regularized logistic regression classifier.
# NOTE(review): newer scikit-learn releases deprecate the `multi_class` argument —
# confirm against the installed version before upgrading.
log_reg_model = LogisticRegression(
    penalty='l2',
    multi_class='multinomial',
    max_iter=1000,
)

In [17]:
# Optional sanity check (can be removed): 5-fold cross-validation on the merged
# train+validation features.
# NOTE(review): the TF-IDF/SVD features were fitted on all of these rows before the
# folds are split, so the fold scores may be slightly optimistic — confirm if that matters.

# Shuffled 5-fold splitter with a fixed seed so the folds are repeatable.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# One accuracy score per fold.
cv_scores = cross_val_score(
    estimator=log_reg_model,
    X=dense_tfidf_matrix,
    y=dense_labels,
    scoring='accuracy',
    cv=kf,
)

# Report the per-fold scores and their mean.
print("Cross-validation scores:", cv_scores)
print("Average CV accuracy:", cv_scores.mean())

Cross-validation scores: [0.95321637 0.95148026 0.9563231  0.95321637 0.95567943]
Average CV accuracy: 0.9539831081767136


In [18]:
# Fit the final model on the full merged train+validation feature matrix and labels.
log_reg_model.fit(dense_tfidf_matrix, dense_labels)  # Train the model

In [19]:
# Ground-truth labels for the held-out test split.
test_labels = test_data['label'].values

# Push the test text through the same cleaning function and the already-fitted
# TF-IDF vectorizer and SVD (transform only; nothing is re-fitted here).
text_test_preprocessed = text_test.apply(preprocess_text)
test_tfidf_matrix = tfidf_vectorizer.transform(text_test_preprocessed)
dense_test_tfidf_matrix = svd.transform(test_tfidf_matrix)

# Predict and report the fraction of correctly classified test rows.
test_predictions = log_reg_model.predict(dense_test_tfidf_matrix)
test_accuracy = accuracy_score(y_true=test_labels, y_pred=test_predictions)
print(f'Test accuracy: {test_accuracy}')

Test accuracy: 0.9539549416214438


In [20]:
from sklearn.metrics import classification_report

# Text report of per-class precision, recall, F1 and support on the test split.
report = classification_report(y_true=test_labels, y_pred=test_predictions)
print(report)

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      3420
           1       0.95      0.95      0.95      2661

    accuracy                           0.95      6081
   macro avg       0.95      0.95      0.95      6081
weighted avg       0.95      0.95      0.95      6081



In [21]:
#from joblib import dump
#dump(log_reg_model, 'log_reg_tfidf.joblib')

In [22]:
# Accuracy fraction times the number of test rows: the count of correctly
# classified test documents (displayed as a float).
test_accuracy*lengthTestData

5801.0