In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK data packages used below: the stopword lists, the punkt
# tokenizer models and the WordNet corpus (skipped when already up to date).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score

import numpy as np

from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read the three dataset splits from CSV files in the working directory.
train_data, val_data, test_data = map(
    pd.read_csv, ('train.csv', 'val.csv', 'test.csv')
)

# Row count of the test split, reported again at the end of the notebook.
lengthTestData = test_data.shape[0]

In [3]:
# Stack the training and validation text (train rows first) into one series
# with a fresh 0..n-1 index; the test text is kept apart.
text = pd.concat(
    [split['text'] for split in (train_data, val_data)],
    ignore_index=True,
)
text_test = test_data['text']

In [4]:
# Lemmatizer and English stopword set shared by preprocess_text.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

In [5]:
# Function to preprocess text
def preprocess_text(text):
    """Lowercase, tokenize, drop English stopwords and lemmatize one document.

    Args:
        text: Raw document text. Missing values (``None`` or NaN, which
            ``pd.read_csv`` produces for empty cells) are treated as an empty
            document; any other non-string value is converted with ``str``.

    Returns:
        str: The remaining lemmatized tokens joined by single spaces.
        Punctuation tokens are not filtered out.
    """
    if not isinstance(text, str):
        # Series.apply hands missing cells over as float NaN; calling .lower()
        # on those would raise AttributeError and abort the whole apply.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ''
        text = str(text)

    tokens = word_tokenize(text.lower())  # Tokenization and lowercase
    # Stopwords are filtered on the raw token, before lemmatization.
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(kept)

In [6]:
# Clean every train+val document; the result keeps the index of `text`.
text_preprocessed = text.apply(preprocess_text)

In [7]:
# Preview the cleaned text (pandas truncates the printed Series).
print(text_preprocessed)

0        leap reason , several journal news outlet accu...
1        new york ( reuters ) - reuters/ipsos tracking ...
2        washington ( reuters ) - chinese finance minis...
3        ( reuters ) - republican donald trump name ind...
4        harare ( reuters ) - incoming zimbabwe leader ...
                               ...                        
64880    catherine j. frompovich continuation testimony...
64881    shawn helton 21st century wiresince late octob...
64882    “ make peaceful revolution impossible make vio...
64883    first lady melania trump busy praying sick chi...
64884    pin 1 ( antimedia ) come brute force , law enf...
Name: text, Length: 64885, dtype: object


In [8]:
# Preview the raw text for comparison with the cleaned version.
print(text)

0        In a   leap of reason, a several journals and ...
1        NEW YORK (Reuters) - A Reuters/Ipsos tracking ...
2        WASHINGTON (Reuters) - Chinese Finance Ministe...
3        (Reuters) - Republican Donald Trump will name ...
4        HARARE (Reuters) - Incoming Zimbabwe leader Em...
                               ...                        
64880    By Catherine J. Frompovich This is the continu...
64881    Shawn Helton 21st Century WireSince late Octob...
64882    “Those who make peaceful revolution impossible...
64883    First Lady Melania Trump has been busy praying...
64884    Pin 1 \n( ANTIMEDIA ) When it comes to brute f...
Name: text, Length: 64885, dtype: object


In [9]:
# First pass: a default TfidfVectorizer (no max_features cap), used below to
# measure the size of the vocabulary.
tfidf_vectorizer = TfidfVectorizer()

In [10]:
# Fit the vectorizer on the preprocessed train+val text and build the
# document-term TF-IDF matrix in one step.
tfidf_matrix = tfidf_vectorizer.fit_transform(text_preprocessed)

In [11]:
# Vocabulary size learned by the fitted vectorizer (one entry per feature).
num_unique_tokens = len(tfidf_vectorizer.vocabulary_)
print(num_unique_tokens)

223532


In [12]:
# Re-initialize the TF-IDF vectorizer, capping max_features at the vocabulary
# size measured in the first pass. This new instance is unfitted.
# NOTE(review): the cap equals the full vocabulary size, so it should not drop
# any term -- this vectorizer is expected to behave like the uncapped one.
tfidf_vectorizer = TfidfVectorizer(max_features=num_unique_tokens)

In [13]:
# Fit and transform the text data again with the re-initialized vectorizer
tfidf_matrix = tfidf_vectorizer.fit_transform(text_preprocessed)

# Convert the TF-IDF matrix to a CSR (Compressed Sparse Row) matrix for efficient row-wise operations
csr_tfidf_matrix = csr_matrix(tfidf_matrix)

# Row index of the document with the most stored TF-IDF entries
max_features_row_index = csr_tfidf_matrix.getnnz(axis=1).argmax()

# Number of non-zero features in that document
max_features = csr_tfidf_matrix[max_features_row_index].count_nonzero()

# Reduce to 30% of that count, but never fewer than one component:
# int() truncates to 0 when max_features < 4, which TruncatedSVD rejects.
# random_state is fixed because the default randomized solver otherwise yields
# a different projection -- and different downstream scores -- on every run.
svd = TruncatedSVD(n_components=max(1, int(max_features*0.3)), random_state=42)

# From here on tfidf_matrix holds the dense, reduced representation.
tfidf_matrix = svd.fit_transform(tfidf_matrix)

In [14]:
# Alternative kept for reference: separate train / validation feature slices.
#dense_tfidf_matrix = tfidf_matrix[:len(train_data)]
#dense_val_tfidf_matrix = tfidf_matrix[len(train_data):len(train_data) + len(val_data)]

# Merging the Validation and Training Data into one for a larger training dataset.
# Rows are ordered train first, then validation -- the order in which `text`
# was concatenated before vectorizing.
dense_tfidf_matrix = tfidf_matrix[:len(train_data) + len(val_data)]

In [15]:
# Label arrays for each split
train_labels = train_data['label'].values
val_labels = val_data['label'].values

# Labels for the merged train+val set, in the same train-then-val order as
# the rows of the feature matrix
dense_labels = np.concatenate([train_labels, val_labels])

In [16]:
# RBF-kernel SVM classifier (alternative polynomial configuration kept below)
svm_model = SVC(C=5.0, kernel='rbf')
#svm_model = SVC(kernel='poly', C=20.0, degree=2, coef0=0.001)

# Optional 5-fold cross-validation on the merged train+val features; this part
# can be removed without affecting the final fit.
# NOTE(review): the TF-IDF vocabulary and the SVD projection were fitted on all
# of these rows beforehand, so each fold's held-out part has already influenced
# the features it is scored on.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    svm_model,
    X=dense_tfidf_matrix,
    y=dense_labels,
    scoring='accuracy',
    cv=kf,
)

# Per-fold accuracies followed by their mean
print("Cross-validation scores:", cv_scores)
print("Average CV accuracy:", np.mean(cv_scores))

In [17]:
# Train the final model on the merged train+val features and labels.
svm_model.fit(dense_tfidf_matrix, dense_labels)

In [18]:
# Ground-truth labels for the held-out test split
test_labels = test_data['label'].values

# Push the test text through the already-fitted pipeline: same cleaning, then
# the fitted TF-IDF vectorizer and SVD projection (transform only, no refit)
text_test_preprocessed = text_test.apply(preprocess_text)
test_tfidf_matrix = tfidf_vectorizer.transform(text_test_preprocessed)
dense_test_tfidf_matrix = svd.transform(test_tfidf_matrix)

# Predict and score
test_predictions = svm_model.predict(dense_test_tfidf_matrix)
test_accuracy = accuracy_score(y_true=test_labels, y_pred=test_predictions)
print(f'Test accuracy: {test_accuracy}')

Test accuracy: 0.9772538141470181


In [19]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the test split
report = classification_report(y_true=test_labels, y_pred=test_predictions)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3503
           1       0.98      0.98      0.98      3707

    accuracy                           0.98      7210
   macro avg       0.98      0.98      0.98      7210
weighted avg       0.98      0.98      0.98      7210



In [23]:
from joblib import dump
# Persist the fitted SVM to disk. Only the classifier is saved here; the TF-IDF
# vectorizer and SVD objects needed to build its inputs are not written by
# this cell.
dump(svm_model, 'svm_model_tfidf_9772.joblib')

['svm_model_tfidf_9772.joblib']

In [21]:
# Number of rows in the test split (computed right after loading the CSVs).
print(lengthTestData)

7210


In [22]:
# Accuracy fraction times the number of test rows: the count of correctly
# classified test documents (as a float).
test_accuracy*lengthTestData

7046.0