In [1]:
pip install pandas scikit-learn nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pandas scikit-learn nltk xgboost


Note: you may need to restart the kernel to use updated packages.


In [9]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.compose import ColumnTransformer
import warnings

# Download stopwords and wordnet
#   - 'stopwords' backs stopwords.words('english')
#   - 'wordnet' backs WordNetLemmatizer.lemmatize(); without this corpus the
#     first lemmatize() call raises LookupError on a machine that lacks it.
nltk.download('stopwords')
nltk.download('wordnet')
# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text
def clean_text(text):
    """Normalize one raw text string for vectorization.

    Steps, in order: strip anything starting with ``http`` up to the next
    whitespace, drop every character that is not an ASCII letter, digit or
    whitespace, lowercase, split on whitespace, discard tokens found in the
    module-level ``stop_words`` set, and lemmatize the remaining tokens with
    the module-level ``lemmatizer``.

    Args:
        text: The string to clean.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    without_urls = re.sub(r'http\S+', '', text)
    alnum_only = re.sub(r'[^A-Za-z0-9\s]', '', without_urls)

    kept_tokens = []
    for token in alnum_only.lower().split():
        if token in stop_words:
            continue  # stopwords carry no signal for the classifier
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

# Load data: read the training and test CSV files from the working directory
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Handle missing values
# Replace NaN with '' in every text column of both frames. The filled columns are
# assigned back to the frame instead of calling fillna(..., inplace=True) on a
# column selection: that chained in-place form triggers a FutureWarning on
# pandas 2.x and leaves the DataFrame unchanged under Copy-on-Write.
text_columns = ['text', 'keyword', 'location']
for frame in (train_data, test_data):
    frame[text_columns] = frame[text_columns].fillna('')

# Preprocess text data: add a 'clean_text' column holding clean_text() applied to 'text'
for dataset in (train_data, test_data):
    dataset['clean_text'] = dataset['text'].apply(clean_text)

# Define the preprocessor
# Each text column gets its own TF-IDF vectorizer; only the cleaned-text
# vocabulary is capped. Columns not listed here are dropped.
column_vectorizers = [
    ('text', TfidfVectorizer(max_features=10000), 'clean_text'),
    ('keyword', TfidfVectorizer(), 'keyword'),
    ('location', TfidfVectorizer(), 'location'),
]
preprocessor = ColumnTransformer(transformers=column_vectorizers, remainder='drop')

# Train/test split for validation
# Hold out 20% of the labelled data. stratify=y keeps the class ratio of the
# hold-out set equal to that of the training portion, consistent with the
# stratified folds used for cross-validation.
X = train_data[['clean_text', 'keyword', 'location']]
y = train_data['target']
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create a pipeline with preprocessor and XGBoost
# Step names ('preprocessor', 'clf') are the prefixes used to address
# hyperparameters as '<step>__<param>'.
xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('clf', xgb_classifier)])

# Hyperparameter tuning with simplified parameters
# Keys follow the '<step>__<param>' convention: two values for each of four
# parameters, i.e. 2 * 2 * 2 * 2 = 16 candidate combinations.
param_grid = dict(
    preprocessor__text__max_features=[5000, 10000],
    clf__n_estimators=[100, 200],
    clf__max_depth=[4, 6],
    clf__learning_rate=[0.1, 0.2],
)

# Suppress warnings: silence UserWarning messages raised from the xgboost module
warnings.filterwarnings(action="ignore", category=UserWarning, module='xgboost')

# Shuffled, seeded 5-fold stratified splitter so the search is repeatable
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Exhaustive search over param_grid, scored by F1, fitted on the training portion only
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    verbose=2,
)
grid_search.fit(X_train_split, y_train_split)

# Evaluate on validation set: F1 of the fitted search's predictions on the held-out rows
val_predictions = grid_search.predict(X_val)
val_f1 = f1_score(y_true=y_val, y_pred=val_predictions)
print(f'Validation F1 Score: {val_f1}')

# Train the final model on the entire training data
# fit() returns the estimator itself, so best_model is the same object as
# grid_search.best_estimator_, now refitted on all labelled rows.
best_model = grid_search.best_estimator_.fit(X, y)

# Predict on test data, selecting the same three feature columns used for training
feature_columns = ['clean_text', 'keyword', 'location']
X_test = test_data[feature_columns]
test_predictions = best_model.predict(X_test)

# Create the submission file: one row per test id with its predicted target
submission_columns = {'id': test_data['id'], 'target': test_predictions}
submission = pd.DataFrame(submission_columns)

# index=False keeps the DataFrame index out of the CSV
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tusha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END clf__learning_rate=0.1, clf__max_depth=4, clf__n_estimators=100, preprocessor__text__max_features=5000; total time=   0.8s
[CV] END clf__learning_rate=0.1, clf__max_depth=4, clf__n_estimators=100, preprocessor__text__max_features=5000; total time=   0.5s
[CV] END clf__learning_rate=0.1, clf__max_depth=4, clf__n_estimators=100, preprocessor__text__max_features=5000; total time=   0.5s
[CV] END clf__learning_rate=0.1, clf__max_depth=4, clf__n_estimators=100, preprocessor__text__max_features=5000; total time=   0.5s
[CV] END clf__learning_rate=0.1, clf__max_depth=4, clf__n_estimators=100, preprocessor__text__max_features=5000; total time=   0.5s
[CV] END clf__learning_rate=0.1, clf__max_depth=4, clf__n_estimators=100, preprocessor__text__max_features=10000; total time=   0.9s
[CV] END clf__learning_rate=0.1, clf__max_depth=4, clf__n_estimators=100, preprocessor__text__max_features=10000; total time=   0.6s
[CV] END clf_