In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [7]:
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score




In [22]:
import nltk

# ---------------- Load data ----------------
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")


train_df = train_df.rename(columns={"comment_text": "text", "psychotic_depression": "label"})
test_df = test_df.rename(columns={"comment_text": "text"})

# ---------------- Preprocessing ----------------
# nltk.download("stopwords")
# nltk.download("wordnet")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

train_df["cleaned_text"] = train_df["text"].apply(clean_text)
test_df["cleaned_text"] = test_df["text"].apply(clean_text)





In [23]:

print(train_df.sample(10))

                      id                                               text  \
81686   da82184f0c83bb49  ...people displaced people... \n[number of peo...   
123091  926d9f19698cdcbd  "\nAdding:  If you really could not see how yo...   
2973    0803dfe4c5e84b27               No worries I see this has been done.   
158465  ee4fbeb6ceee71bb  "\nOddly enough, I am truly agnostic on this p...   
89270   eec837d78c760cc5  Ask yourself a question: What does a random ph...   
80160   d67c8994f235df5d  Wankel engine\nYour text is sure not documente...   
67544   b4befbd3588e33c0  Don't hide the incriminating photographs you N...   
159486  fed9fcfd8505a0ad                   , spent by private U.S. citizens   
124418  99a111f58d49a730  "\n\n Please do not vandalize pages, as you di...   
147131  37636eb681066661  Yohan cabaye  \n\nThe yohan cabaye info was no...   

        label                                       cleaned_text  
81686       0  people displaced people number people displace..

In [24]:
# ---------------- TF-IDF with bigrams ----------------
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X = vectorizer.fit_transform(train_df["cleaned_text"])
y = train_df["label"]

X_test_final = vectorizer.transform(test_df["cleaned_text"])

In [31]:
print(X.shape)
print(test_df.shape)

(159571, 20000)
(153164, 3)


In [34]:




# ---------------- Train/Validation split ----------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------- Improved Logistic Regression ----------------
model = LogisticRegression(max_iter=300, class_weight="balanced")  
model.fit(X_train, y_train)


print("TRAINING DATA")
y_train_pred = model.predict(X_train)

print("Validation Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

print("TEST DATA")

# ---------------- Validation ----------------
y_val_pred = model.predict(X_val)
print(y_val.shape)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))

# ---------------- Predict on test.csv ----------------
test_predictions = model.predict(X_test_final)




TRAINING DATA
Validation Accuracy: 0.9486432286770696
              precision    recall  f1-score   support

           0       0.99      0.95      0.97    115421
           1       0.66      0.94      0.78     12235

    accuracy                           0.95    127656
   macro avg       0.83      0.95      0.87    127656
weighted avg       0.96      0.95      0.95    127656

TEST DATA
(31915,)
Validation Accuracy: 0.9373962086793044
              precision    recall  f1-score   support

           0       0.99      0.95      0.96     28856
           1       0.63      0.87      0.73      3059

    accuracy                           0.94     31915
   macro avg       0.81      0.91      0.85     31915
weighted avg       0.95      0.94      0.94     31915



In [37]:
from xgboost import XGBClassifier


# --- Step 3: Train XGBoost on sparse data ---
model_xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    tree_method='hist',  # Efficient for large sparse data
    n_jobs=-1,
    verbosity=1
)

model_xgb.fit(X_train, y_train)

# --- Step 4: Evaluate ---
y_pred_xgb_val = model_xgb.predict(X_val)
acc = accuracy_score(y_val, y_pred_xgb_val)

print(f"Accuracy: {acc:.4f}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.9561


In [38]:
print(classification_report(y_val, y_pred_xgb_val))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98     28856
           1       0.90      0.61      0.73      3059

    accuracy                           0.96     31915
   macro avg       0.93      0.80      0.85     31915
weighted avg       0.95      0.96      0.95     31915

