## Library

In [1]:
import pandas as pd
import numpy as np
import re


## Getting Data

In [None]:
import os 

path = os.path.join(os.getcwd(),'Data/training_data.csv')
col_names = ['target','ids','date','flag','user','text']

df = pd.read_csv(path)

In [3]:
df.columns = ['target','ids','date','flag','user','text']

In [4]:
# Data for Detection
data = df[['target','text']]

In [5]:
def clean_text(text):
    text = text.lower().strip()
    text = re.sub(r"http\S+|www\S+", "", text)  # Xóa URL
    text = re.sub(r'[^a-zA-Z\s]', " ", text)   # Chỉ giữ chữ cái và khoảng trắng
    text = " ".join(text.split())  # Chuẩn hóa khoảng trắng
    return text

data['text'] = data['text'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(clean_text)


## Detection

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score 


In [7]:
x = data['text'].values
y = data['target'].values

In [8]:
data['target'].value_counts()

target
4    800000
0    799999
Name: count, dtype: int64

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 19)
vectorizer = TfidfVectorizer(stop_words = 'english')
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [10]:
from sklearn.linear_model import LogisticRegression      # Logistic Regression
from sklearn.svm import SVC                              # SVM
from sklearn.ensemble import RandomForestClassifier      # Random Forest
from xgboost import XGBClassifier                        # type: ignore # XGBoost
from sklearn.naive_bayes import MultinomialNB, GaussianNB # Naive Bayes

In [11]:
models = {
    'lr_model' : LogisticRegression(max_iter = 1000),
    'nv_model' : MultinomialNB(),
    # 'svc_model': SVC(),
    'rd_model' : RandomForestClassifier(),
    'gaus_model': GaussianNB(),
    'xgboost'   : XGBClassifier()
}

def train_and_evaluate_models(models, x_train, y_train, x_test, y_test):
    results = {}  # Lưu kết quả đánh giá
    
    for name, model in models.items():
        print(f"\nTraining model: {name} ...")
        
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        acc = accuracy_score(y_test, y_pred)
        
        print(f"Accuracy for {name}: {acc:.4f}")
        results[name] = {"accuracy": acc}
    
    return results

In [None]:
results = train_and_evaluate_models(models, x_train, y_train, x_test, y_test)


Training model: lr_model ...
Accuracy for lr_model: 0.7801

Training model: nv_model ...
Accuracy for nv_model: 0.7612

Training model: rd_model ...
