In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip


In [2]:
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import zipfile
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [3]:
# Unzip every competition archive into the working directory and load the
# extracted CSV files. The four archives are handled by one helper instead of
# four copy-pasted blocks, so the directory names live in exactly one place.
INPUT_DIR = '../input/jigsaw-toxic-comment-classification-challenge'
EXTRACT_DIR = '../working/jigsaw-toxic-comment-classification-challenge/'


def _load_zipped_csv(csv_name):
    """Extract ``<csv_name>.zip`` from INPUT_DIR and return its CSV as a DataFrame.

    Args:
        csv_name: File name of the CSV inside the archive (e.g. ``'train.csv'``).
            The archive itself is expected at ``INPUT_DIR/<csv_name>.zip``.

    Returns:
        pandas.DataFrame read from ``EXTRACT_DIR/<csv_name>``.

    Side effects:
        Extracts all members of the archive into EXTRACT_DIR.
    """
    archive_path = os.path.join(INPUT_DIR, csv_name + '.zip')
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(EXTRACT_DIR)
    return pd.read_csv(os.path.join(EXTRACT_DIR, csv_name))


# Training data
df_train = _load_zipped_csv('train.csv')

# Test data
df_test = _load_zipped_csv('test.csv')

# Test labels
df_test_label = _load_zipped_csv('test_labels.csv')

# Sample submission
df_sample_submission = _load_zipped_csv('sample_submission.csv')

In [4]:
# Show the first rows of each loaded frame, in the order they were loaded.
for loaded_frame in (df_train, df_test, df_test_label, df_sample_submission):
    print(loaded_frame.head())

                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
                 id                                       comment_text
0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...
1  0000247867823ef7  ==

In [5]:
# Print the describe() summary of each loaded frame, in the order they were loaded.
for loaded_frame in (df_train, df_test, df_test_label, df_sample_submission):
    print(loaded_frame.describe())

               toxic   severe_toxic        obscene         threat  \
count  159571.000000  159571.000000  159571.000000  159571.000000   
mean        0.095844       0.009996       0.052948       0.002996   
std         0.294379       0.099477       0.223931       0.054650   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.000000       0.000000       0.000000   
50%         0.000000       0.000000       0.000000       0.000000   
75%         0.000000       0.000000       0.000000       0.000000   
max         1.000000       1.000000       1.000000       1.000000   

              insult  identity_hate  
count  159571.000000  159571.000000  
mean        0.049364       0.008805  
std         0.216627       0.093420  
min         0.000000       0.000000  
25%         0.000000       0.000000  
50%         0.000000       0.000000  
75%         0.000000       0.000000  
max         1.000000       1.000000  
                      id            

In [6]:
import re

def remove_special_characters(text):
    """Strip URLs, punctuation, digits and CJK ideographs from a comment.

    The substitutions run in this order:

    1. Every token starting with ``http`` (up to the next whitespace) becomes a space.
    2. Every character that is neither a word character nor whitespace becomes a space.
    3. Every digit becomes a space.
    4. Every run of characters in the range U+4E00-U+9FFF becomes a space.
    5. Whitespace runs are collapsed to a single space and both ends are trimmed.

    Whitespace is normalised last; doing it earlier would leave behind the
    spaces that steps 3 and 4 insert (e.g. ``"abc 123 def"`` would come out
    with five spaces between the words).

    Args:
        text: Raw comment string.

    Returns:
        The cleaned string, single-spaced with no leading or trailing whitespace.
    """
    text = re.sub(r'http\S+', ' ', text)
    # A second, stricter URL pattern used to run after this punctuation pass;
    # it could never match because ':', '/' and '.' are already gone by then,
    # and the 'http\S+' pass above has removed the URLs anyway.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d', ' ', text)
    text = re.sub(r'[\u4e00-\u9fff]+', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

# Replace the comment_text column of both frames with its cleaned version.
for comments_frame in (df_train, df_test):
    cleaned_comments = comments_frame['comment_text'].apply(remove_special_characters)
    comments_frame['comment_text'] = cleaned_comments

# Preview the first 100 cleaned training comments.
print(df_train['comment_text'].head(100))


0     Explanation Why the edits made under my userna...
1     D aww He matches this background colour I m se...
2     Hey man I m really not trying to edit war It s...
3     More I can t make any real suggestions on impr...
4     You sir are my hero Any chance you remember wh...
                            ...                        
95    Thanks I can see that violating clearly stated...
96     Hi Thanks for our kind words See you around Talk
97    Collusion in poker This is regarded as most he...
98    Thanks much however if it s been resolved why ...
99    You can do all you re doing right now but if y...
Name: comment_text, Length: 100, dtype: object


In [7]:
!pip install nltk



In [8]:
import nltk

# word_tokenize needs the Punkt tokenizer data. Newer NLTK releases look up
# the 'punkt_tab' resource instead of the pickled 'punkt' models, so fetch
# both; nltk.download reports a failure and returns False rather than raising
# when a resource is unavailable.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
import string
from nltk import word_tokenize

# Tokenization: split the comment_text of df_train and df_test into individual
# word tokens and store the resulting lists in a new 'tokens' column.
for comments_frame in (df_train, df_test):
    comments_frame['tokens'] = comments_frame['comment_text'].apply(word_tokenize)

In [10]:
from sklearn.model_selection import train_test_split
# Split df_train into two subsets: 80% of the rows in `train`, the remaining 20% in `valid`.
train, valid = train_test_split(
    df_train,
    train_size=0.8,
    random_state=42,
)

In [73]:
# Split the comments and their binary labels into training and validation sets (80/20).
# No visible cell creates a 'label' column, so on a fresh kernel df_train['label']
# raises KeyError. Use 'label' when it exists, otherwise fall back to 'toxic'.
# NOTE(review): 'toxic' is inferred from the recorded validation report
# (3056 positives out of 31915 rows, about 0.096), which matches the 'toxic'
# mean printed by describe(); confirm this is the intended target.
label_column = 'label' if 'label' in df_train.columns else 'toxic'
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_train['comment_text'],
    df_train[label_column],
    test_size=0.2,
    random_state=42,
)

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize

# Convert text into TF-IDF features: unigrams and bigrams, English stop words
# removed, accents stripped, sublinear term-frequency scaling, and document
# frequency bounds of min_df=3 and max_df=0.9.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

In [78]:
# Fit the TF-IDF vectorizer on the training texts only, then reuse the fitted
# vectorizer to encode the validation texts.
train_features = vectorizer.fit_transform(train_texts)
val_features = vectorizer.transform(val_texts)

In [79]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Create the Logistic Regression classifier (iteration cap raised to 1000,
# fixed seed) and train it on the TF-IDF training features.
lr_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
)
lr_model.fit(train_features, train_labels)

In [81]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Predicted labels for the validation set.
val_predictions = lr_model.predict(val_features)

# Second column of predict_proba on the validation features (used for ROC-AUC).
val_probs = lr_model.predict_proba(val_features)[:, 1]

# Evaluate the model: label-based metrics use the predictions, ROC-AUC uses the probabilities.
roc_auc = roc_auc_score(val_labels, val_probs)
report = classification_report(val_labels, val_predictions)
accuracy = accuracy_score(val_labels, val_predictions)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)
print(f"ROC-AUC Score: {roc_auc}")

Accuracy: 0.9550994830017233
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     28859
           1       0.93      0.57      0.71      3056

    accuracy                           0.96     31915
   macro avg       0.94      0.78      0.84     31915
weighted avg       0.95      0.96      0.95     31915

ROC-AUC Score: 0.970576883199394


In [88]:
# Build the prediction DataFrame: validation comments next to their predicted labels.
# NOTE(review): this rebinds df_sample_submission, which earlier held the loaded
# sample_submission.csv, and its rows come from the validation split of df_train
# rather than from df_test; confirm this is the intended content.
prediction_columns = {
    'comment_text': val_texts,
    'predicted_label': val_predictions,
}
df_sample_submission = pd.DataFrame(prediction_columns)


In [89]:
# Write the df_sample_submission frame to 'submission.csv' (relative to the
# current working directory), leaving out the index column.
df_sample_submission.to_csv('submission.csv', index=False)
