In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

import numpy as np

from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

from scipy.sparse import hstack

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read the pre-split train / validation / test partitions from the working directory
train_data, val_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'val.csv', 'test.csv')
)

# Row count of the test partition; reused later to turn the accuracy into a count
lengthTestData = test_data.shape[0]

In [3]:
# Stack the train and validation articles into one corpus with a fresh 0..n-1 index;
# the test articles are kept separate.
text = pd.concat(
    [split['text'] for split in (train_data, val_data)],
    ignore_index=True,
)
text_test = test_data['text']

In [4]:
# Take the last nine columns of every partition (the linguistic feature columns)
# and stack train + validation in the same row order as `text`.
linguistic_features = pd.concat(
    [split.iloc[:, -9:] for split in (train_data, val_data)],
    ignore_index=True,
)
test_linguistic_features = test_data.iloc[:, -9:]

In [5]:
# Preview the combined train + validation linguistic features
linguistic_features

Unnamed: 0,Words_per_Sentence,Percentage_Questions,Percentage_First_Person_Singular,Percentage_Second_Person,Percentage_Third_Person,Percentage_Negation,Percentage_Exclusive,Percentage_Causation,Percentage_Sense
0,28.777778,0.000000,0.000000,0.000000,1.930502,0.000000,0.000000,2.702703,5.405405
1,32.900000,0.000000,0.000000,0.000000,2.735562,0.607903,1.519757,1.519757,4.863222
2,30.416667,0.000000,0.000000,0.000000,2.191781,0.547945,0.821918,1.095890,6.027397
3,21.254545,3.636364,0.085543,0.000000,4.790419,1.197605,1.710864,2.138580,3.079555
4,24.444444,0.000000,0.000000,0.090909,3.181818,0.545455,1.636364,2.181818,6.181818
...,...,...,...,...,...,...,...,...,...
54714,56.166667,16.666667,0.296736,0.000000,4.154303,1.186944,1.186944,0.593472,5.341246
54715,22.900000,0.000000,0.000000,0.000000,2.183406,0.436681,0.873362,0.000000,5.240175
54716,18.095238,14.285714,1.842105,3.684211,5.263158,1.052632,2.105263,1.315789,5.789474
54717,16.611354,10.043668,1.156677,2.576236,2.970557,1.445846,2.602524,1.393270,2.628812


In [6]:
# Preview the linguistic features of the test partition
test_linguistic_features

Unnamed: 0,Words_per_Sentence,Percentage_Questions,Percentage_First_Person_Singular,Percentage_Second_Person,Percentage_Third_Person,Percentage_Negation,Percentage_Exclusive,Percentage_Causation,Percentage_Sense
0,25.066667,0.000000,0.000000,0.265957,2.393617,0.000000,1.595745,2.393617,7.446809
1,23.333333,0.000000,0.000000,0.000000,2.857143,2.857143,2.857143,2.142857,2.857143
2,24.333333,0.000000,0.000000,0.000000,1.369863,1.369863,1.369863,0.000000,8.219178
3,27.466667,0.000000,0.404531,0.242718,3.559871,0.485437,0.566343,1.779935,3.478964
4,40.666667,0.000000,0.000000,0.000000,3.278689,0.000000,0.819672,3.278689,6.557377
...,...,...,...,...,...,...,...,...,...
6076,26.708333,8.333333,0.468019,0.624025,4.680187,0.936037,2.184087,1.560062,1.872075
6077,32.700000,0.000000,0.000000,0.000000,1.834862,0.000000,0.305810,0.000000,4.587156
6078,25.600000,0.000000,0.000000,0.000000,4.427083,0.520833,0.781250,3.125000,2.083333
6079,63.000000,0.000000,0.000000,0.000000,5.555556,0.000000,2.777778,0.793651,2.380952


In [7]:
# Shared preprocessing resources, created once and reused for every document
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # a set keeps the per-token membership test cheap

In [8]:
# Function to preprocess text
def preprocess_text(text):
    """Lower-case, tokenize, drop stop words and lemmatize a single document.

    Args:
        text: Raw document. Missing values (None / NaN, as produced by
            pd.read_csv for empty cells) yield an empty string; any other
            non-string value is converted with str() first.

    Returns:
        A single string of the surviving tokens joined by spaces. Tokens are
        compared against `stop_words` before lemmatization, and the
        lemmatizer is called without a part-of-speech tag.
    """
    if not isinstance(text, str):
        # Calling .lower() on a float NaN / None would raise AttributeError.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    tokens = word_tokenize(text.lower())  # Tokenization and lowercase
    kept_lemmas = (
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )  # Remove stopwords and lemmatize
    return ' '.join(kept_lemmas)

In [9]:
# Run the preprocessing function element-wise over the train + validation corpus
text_preprocessed = text.map(preprocess_text)

In [10]:
# Sanity check: show the preprocessed corpus
print(text_preprocessed)

0        beirut ( reuters ) - iran military chief met s...
1        hanoi ( reuters ) - top u.s. envoy began two-d...
2        ( reuters ) - four u.s. senator asked senate j...
3        first read morning briefing meet press nbc pol...
4        cairo ( reuters ) - six month egypt election ,...
                               ...                        
54714    lack oversight prof donald trump totally unfit...
54715    tucker carlson responded espn anchor calling p...
54716    getting something nothing rage president profe...
54717    black emanuelle fixed 1976. attila speaking eu...
54718    chaos broke legal american illegal alien clash...
Name: text, Length: 54719, dtype: object


In [11]:
# Show the raw corpus for side-by-side comparison with the preprocessed version
print(text)

0        beirut (reuters) - iran s military chief met w...
1        hanoi (reuters) - a top u.s. envoy began a two...
2        (reuters) - four u.s. senators have asked the ...
3        first read is a morning briefing from meet the...
4        cairo (reuters) - six months before egypt s el...
                               ...                        
54714    this lack of oversight proves that donald trum...
54715    tucker carlson responded to an espn anchor cal...
54716    because getting something for nothing is all t...
54717    black emanuelle fixed all that in 1976. attila...
54718    chaos broke out after legal americans and ille...
Name: text, Length: 54719, dtype: object


In [12]:
# Initialize a bag-of-words CountVectorizer (raw term counts, not TF-IDF) without specifying max_features
count_vectorizer = CountVectorizer()

In [13]:
# Fit and transform the preprocessed text data.
# This first pass is what the next cell uses to measure the vocabulary size.
count_matrix = count_vectorizer.fit_transform(text_preprocessed)

In [14]:
# Vocabulary size: how many distinct tokens the fitted vectorizer learned
num_unique_tokens = count_vectorizer.get_feature_names_out().shape[0]
print(num_unique_tokens)

169079


In [15]:
# Re-initialize the CountVectorizer (term counts, not TF-IDF) with max_features set to the measured vocabulary size.
# NOTE(review): num_unique_tokens is the full vocabulary learned from the same text_preprocessed,
# so this cap does not appear to remove any token — confirm whether a smaller cap was intended.
count_vectorizer = CountVectorizer(max_features=num_unique_tokens)

In [16]:
# Fit and transform the text data again with the re-initialized vectorizer
count_matrix = count_vectorizer.fit_transform(text_preprocessed)

# Ensure the count matrix is in CSR (Compressed Sparse Row) form for efficient row-wise operations
csr_count_matrix = csr_matrix(count_matrix)

# Find the row (document) with the largest number of stored entries
max_features_row_index = csr_count_matrix.getnnz(axis=1).argmax()

# Number of distinct vocabulary terms that occur in that document
max_features = csr_count_matrix[max_features_row_index].count_nonzero()

# Reduce the count features to 30% of that per-document maximum.
# max(1, ...) keeps n_components valid for a tiny corpus, and the fixed
# random_state makes the randomized SVD reproducible across runs.
svd = TruncatedSVD(n_components=max(1, int(max_features * 0.3)), random_state=42)

# From here on `count_matrix` holds the SVD output, not the sparse counts
count_matrix = svd.fit_transform(count_matrix)

In [17]:
#dense_tfidf_matrix = tfidf_matrix[:len(train_data)]
#dense_val_tfidf_matrix = tfidf_matrix[len(train_data):len(train_data) + len(val_data)]

# Merging the Validation and Training Data into one for a larger training dataset.
#dense_count_matrix = count_matrix[:len(train_data) + len(val_data)]

In [18]:
# Concatenate the SVD-reduced count features and the linguistic features horizontally
# (count_matrix is the output of svd.fit_transform at this point, not a TF-IDF matrix).
# NOTE(review): the SVD output is presumably fully populated, so the sparse hstack result
# stores few or no zeros — confirm whether a dense np.hstack would be cheaper.
dense_count_with_linguistic = hstack([count_matrix, csr_matrix(linguistic_features)])

In [19]:
# Pull the label columns out as arrays, one per partition
train_labels = train_data['label'].values
val_labels = val_data['label'].values

# Train labels followed by validation labels, matching the row order of the combined features
dense_labels = np.concatenate([train_labels, val_labels])

In [20]:
# Create the SVM model (RBF kernel, C=5.0); it is not fitted in this cell
svm_model = SVC(kernel='rbf', C=5.0)
# Alternative polynomial-kernel configuration, kept for reference:
#svm_model = SVC(kernel='poly', C=20.0, degree=2, coef0=0.001)

In [21]:
# Define the base estimator (Decision Tree) with max depth 5.
# A fixed random_state makes the tree's tie-breaking reproducible.
base_estimator = DecisionTreeClassifier(max_depth=5, random_state=42)

# Create an AdaBoost classifier with custom settings
adaboost_model = AdaBoostClassifier(
    estimator=base_estimator,  # Boost the depth-5 decision tree defined above
    n_estimators=115,  # Maximum number of boosting rounds
    learning_rate=0.5,  # Weight applied to each tree's contribution
    # NOTE(review): 'SAMME.R' is deprecated in newer scikit-learn releases;
    # confirm against the installed version before upgrading.
    algorithm='SAMME.R',
    random_state=42  # Fixed seed so repeated runs build the same ensemble
)

In [22]:
# Create the GBM model (exponential loss, 200 stages, learning rate 0.5); it is not fitted in this cell.
# A fixed random_state keeps the fitted ensemble reproducible across runs.
gbm_model = GradientBoostingClassifier(learning_rate=0.5, n_estimators=200, loss='exponential', random_state=42)

In [23]:
# Create the LR model (L2 penalty, up to 1000 iterations); it is not fitted in this cell.
# NOTE(review): the `multi_class` argument is deprecated in newer scikit-learn releases — confirm against the installed version.
log_reg_model = LogisticRegression(max_iter=1000, penalty='l2', multi_class='multinomial') # Initialize Logistic Regression model

In [24]:
# Create the MLP model (two hidden layers of 600 and 10 units); it is not fitted in this cell.
# A fixed random_state pins the weight initialisation and batch shuffling so runs are reproducible.
# verbose=True prints the loss after every iteration while fitting.
mlp_model = MLPClassifier(hidden_layer_sizes=(600, 10), max_iter=300, activation='relu', solver='adam', learning_rate='adaptive', verbose=True, random_state=42)

In [25]:
# Define a list of (name, estimator) tuples for the ensemble.
# The estimators are still unfitted here; they are fitted when the VotingClassifier is fitted.
models = [
    ('SVM', svm_model),
    ('Logistic Regression', log_reg_model),
    ('Gradient Boosting', gbm_model),
    ('AdaBoost', adaboost_model),
    ('MLPClassifier', mlp_model)
]

In [26]:
# Create a VotingClassifier instance that combines the five models by majority vote.
# NOTE(review): the SVC is built without probability=True, so 'soft' voting would presumably not work as configured — confirm before switching.
voting_classifier = VotingClassifier(estimators=models, voting='hard', verbose=True)  # 'hard' = majority vote over predicted labels

In [27]:
# Fit all five base models on the combined train + validation features.
# Long-running: in the recorded run Gradient Boosting and AdaBoost each took over 200 minutes.
voting_classifier.fit(dense_count_with_linguistic, dense_labels)

[Voting] ...................... (1 of 5) Processing SVM, total=36.7min
[Voting] ...... (2 of 5) Processing Logistic Regression, total= 4.4min
[Voting] ....... (3 of 5) Processing Gradient Boosting, total=209.6min
[Voting] ................ (4 of 5) Processing AdaBoost, total=205.9min
Iteration 1, loss = 0.19295569
Iteration 2, loss = 0.10158637
Iteration 3, loss = 0.06861961
Iteration 4, loss = 0.04499214
Iteration 5, loss = 0.03048995
Iteration 6, loss = 0.02721402
Iteration 7, loss = 0.01655278
Iteration 8, loss = 0.01521542
Iteration 9, loss = 0.01765217
Iteration 10, loss = 0.01202516
Iteration 11, loss = 0.00764982
Iteration 12, loss = 0.01114141
Iteration 13, loss = 0.00676641
Iteration 14, loss = 0.00868804
Iteration 15, loss = 0.01508351
Iteration 16, loss = 0.01069626
Iteration 17, loss = 0.00612658
Iteration 18, loss = 0.00842267
Iteration 19, loss = 0.00513715
Iteration 20, loss = 0.00289721
Iteration 21, loss = 0.00504551
Iteration 22, loss = 0.00304401
Iteration 23, loss = 

In [29]:
text_test_preprocessed = text_test.apply(preprocess_text)
test_count_matrix = count_vectorizer.transform(text_test_preprocessed)
dense_test_count_matrix = svd.transform(test_count_matrix)
dense_test_count_with_linguistic = hstack([dense_test_count_matrix, csr_matrix(test_linguistic_features)])
test_labels = test_data['label'].values

test_predictions = voting_classifier.predict(dense_test_count_with_linguistic)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Test accuracy: {test_accuracy}')

Test accuracy: 0.9593816806446308


In [30]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the test predictions
report = classification_report(y_true=test_labels, y_pred=test_predictions)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      3420
           1       0.95      0.96      0.95      2661

    accuracy                           0.96      6081
   macro avg       0.96      0.96      0.96      6081
weighted avg       0.96      0.96      0.96      6081



In [31]:
from joblib import dump
# Persist the fitted ensemble to disk.
# NOTE(review): only the VotingClassifier is saved in this cell; the fitted count_vectorizer and svd
# are needed to featurise new text, so confirm they are persisted elsewhere.
dump(voting_classifier, 'votingclassifier_model_cv.joblib')

['votingclassifier_model_cv.joblib']

In [32]:
# Accuracy fraction times the number of test rows = number of correctly classified test rows (as a float)
test_accuracy*lengthTestData

5834.0