In [1]:
import pandas as pd

# Read the three preprocessed splits (train, validation, test) from CSV,
# in that order, binding each resulting DataFrame to its own name.
train_data, val_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "val.csv", "test.csv")
)

In [2]:
# For every split, pull out the 'text' column as the model input (X_*)
# and the 'label' column as the target (y_*).
X_train, y_train = train_data['text'], train_data['label']
X_val, y_val = val_data['text'], val_data['label']
X_test, y_test = test_data['text'], test_data['label']

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert the raw text into TF-IDF features with TfidfVectorizer (not plain
# counts): English stop words are dropped and the vocabulary is capped at
# 5000 terms via max_features.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit the vectorizer on the training split only, then apply that same fitted
# vectorizer to validation and test, so the held-out splits never influence
# what is learned from the text.
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)
X_test_vec = vectorizer.transform(X_test)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a logistic-regression baseline on the vectorized training split.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Score the model on the validation split.
y_val_pred = model.predict(X_val_vec)
accuracy = accuracy_score(y_val, y_val_pred)

# Report the overall accuracy first, then the per-class
# precision / recall / f1 table.
print(f"Validation Accuracy: {accuracy}")
print(classification_report(y_val, y_val_pred))


Validation Accuracy: 0.978264343934793
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2970
           1       0.98      0.98      0.98      3287

    accuracy                           0.98      6257
   macro avg       0.98      0.98      0.98      6257
weighted avg       0.98      0.98      0.98      6257


In [5]:
# Sanity check: display the last five training texts.
X_train.tail(5)


25020    the leftist media was quick to blame donald tr...
25021    u.s. presidential candidate donald trump on we...
25022    new president emmerson mnangagwa laid out a gr...
25023    the afghan taliban on wednesday called on u.s....
25024    the ny daily news is never shy when it comes t...
Name: text, dtype: object

In [6]:
# Sanity check: display the last five training labels.
y_train.tail(5)


25020    1
25021    0
25022    0
25023    0
25024    1
Name: label, dtype: int64