In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
import numpy as np

# Load the datasets
train_df = pd.read_csv('/content/drive/MyDrive/Stats201_FinalProject/Final_Project/train_filtered.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Stats201_FinalProject/Final_Project/test_filtered.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Stats201_FinalProject/Final_Project/val_filtered.csv')

# Extract text and labels
X_train = train_df['text']
y_train = train_df['label']
X_test = test_df['text']
y_test = test_df['label']
X_val = val_df['text']
y_val = val_df['label']

# Introduce noise by randomly shuffling some labels (mislabeling data)
np.random.seed(42)
noisy_indices = np.random.choice(len(y_train), size=int(len(y_train) * 0.1), replace=False)
y_train[noisy_indices] = 1 - y_train[noisy_indices]  # Flip 10% of labels

# Create a pipeline with a very limited TF-IDF vectorizer and a decision tree
model = make_pipeline(
    TfidfVectorizer(stop_words='english', max_features=50, ngram_range=(1, 1)),  # Extremely limited features and unigrams
    DecisionTreeClassifier(max_depth=2, random_state=42)  # Very shallow tree to limit model complexity
)

# Train the model
model.fit(X_train, y_train)

# Predict on the validation set and evaluate the performance
y_pred = model.predict(X_val)
print("Validation Set Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report on Validation Set:\n", classification_report(y_val, y_pred))

# Predict on the test set and evaluate the performance
y_test_pred = model.predict(X_test)
print("Test Set Accuracy:", accuracy_score(y_test, y_test_pred))
print("Classification Report on Test Set:\n", classification_report(y_test, y_test_pred))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train[noisy_indices] = 1 - y_train[noisy_indices]  # Flip 10% of labels


Validation Set Accuracy: 0.8696132596685083
Classification Report on Validation Set:
               precision    recall  f1-score   support

           0       0.85      0.90      0.87      1374
           1       0.89      0.84      0.86      1341

    accuracy                           0.87      2715
   macro avg       0.87      0.87      0.87      2715
weighted avg       0.87      0.87      0.87      2715

Test Set Accuracy: 0.8859259259259259
Classification Report on Test Set:
               precision    recall  f1-score   support

           0       0.85      0.93      0.89      1328
           1       0.92      0.85      0.88      1372

    accuracy                           0.89      2700
   macro avg       0.89      0.89      0.89      2700
weighted avg       0.89      0.89      0.89      2700

