In [1]:
import random

import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [4]:
# Read the training features, the training labels and the test features
# (in that order) from /content/sample_data into three DataFrames.
train_data, train_labels, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/sample_data/train.csv',
        '/content/sample_data/trainLabels.csv',
        '/content/sample_data/test.csv',
    )
)

In [5]:
# Preview the training frame to see which columns are available
# (pandas elides the middle rows and columns of a large frame in this view).
train_data

Unnamed: 0,id,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145
0,1,NO,NO,dqOiM6yBYgnVSezBRiQXs9bvOFnRqrtIoXRIElxD7g8=,GNjrXXA3SxbgD0dTRblAPO9jFJ7AIaZnu/f48g5XSUk=,0.576561,0.073139,0.481394,0.115697,0.472474,...,0.0,0.810,3306,4676,YES,NO,YES,2,0.375535,0.464610
1,2,,,,,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.510,4678,3306,YES,NO,YES,4,0.741682,0.593630
2,3,NO,NO,ib4VpsEsqJHzDiyL0dZLQ+xQzDPrkxE+9T3mx5fv2wI=,X6dDAI/DZOWvu0Dg6gCgRoNr2vTUz/mc4SdHTNUPS38=,1.341803,0.051422,0.935572,0.041440,0.501710,...,0.0,0.850,4678,3306,NO,NO,NO,1,0.776467,0.493159
3,4,YES,NO,BfrqME7vdLw3suQp6YAT16W2piNUmpKhMzuDrVrFQ4w=,YGCdISifn4fLao/ASKdZFhGIq23oqzfSbUVb6px1pig=,0.653912,0.041471,0.940787,0.090851,0.556564,...,0.0,0.945,3306,4678,NO,NO,YES,3,0.168234,0.546582
4,5,NO,NO,RTjsrrR8DTlJyaIP9Q3Z8s0zseqlVQTrlSe97GCWfbk=,3yK2OPj1uYDsoMgsxsjY1FxXkOllD8Xfh20VYGqT+nU=,1.415919,0.000000,1.000000,0.000000,0.375297,...,0.0,1.000,1263,892,NO,NO,NO,1,0.246637,0.361045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,9995,NO,NO,jComfqYTXYSeH3GvqcOhwPmldb+BCdVJDKKDNkdtw2w=,Kr2CC15nSwDjdpyAeVh4vOuIaHuC/Q7cL9BAK28JoG8=,1.207136,0.082855,0.918960,0.313880,0.495189,...,0.0,0.810,4677,3307,YES,NO,YES,1,0.502268,0.486637
9995,9996,NO,NO,Pr5enXjzVzVjZziZxrDcgnyu6CLUftmEbnp6TctyJbU=,YvZUuCDjLu9VvkCdBWgARWQrvm+FSXgxp0zIrMjcLBc=,1.414798,0.000000,1.000000,0.000000,0.357369,...,0.0,1.000,1262,892,YES,NO,YES,15,0.890135,0.346276
9996,9997,YES,YES,9Avs0tL1zvH7Xx41z2UqrXs11/4IWLmqRAodLt/SKjQ=,WV5vAHFyqkeuyFB5KVNGFOBuwjkUGKYc8wh9QfpVzAA=,1.413677,0.000000,1.000000,0.000000,0.668517,...,0.0,1.000,1261,892,YES,NO,YES,5,0.726457,0.659001
9997,9998,NO,NO,9zkXU3f6YnRPjsWi3lKSCLseIGrleg00tpRI4OplABw=,gOZBAoajyr6i7GgON0N7q5+KE4JTwH3OUM0lZOWMuG8=,1.294118,0.000000,1.000000,0.000000,0.570707,...,1.0,1.000,1188,918,NO,NO,NO,3,0.450980,0.561448


In [None]:
# Build the sparse text features and the label matrix for multilabel training.
#
# The train_data preview shows only 'id' and x1..x145-style columns, so there
# is no 'text' / 'text_column' to read. The free-form string fields visible in
# the preview are the hashed tokens in x3 and x4; they are joined into one
# whitespace-separated document per row (missing values become empty strings).
# NOTE(review): extend TEXT_COLUMNS if the full frame has more hashed columns.
TEXT_COLUMNS = ['x3', 'x4']
texts = train_data[TEXT_COLUMNS].fillna('').astype(str).agg(' '.join, axis=1)

# NOTE(review): the default tokenizer lowercases and splits on non-word
# characters such as '/', '+' and '=', so one hash may yield several tokens.
# Pass token_pattern=r'\S+', lowercase=False to keep each hash intact.

# Bag of Words Vectorizer
bow_vectorizer = CountVectorizer(max_features=10000)
X_bow = bow_vectorizer.fit_transform(texts)

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_tfidf = tfidf_vectorizer.fit_transform(texts)

# Combine both representations (optional). CSR output keeps the stacked
# matrix row-indexable for the train/validation split that follows.
X_combined = hstack([X_bow, X_tfidf], format='csr')

# Prepare labels for multilabel classification.
# MultiLabelBinarizer expects one collection of label names per sample; given
# the rows of a label table it would treat the cell values themselves as the
# labels. The table is therefore used directly as the indicator matrix.
# Assumes train_labels holds one 0/1 column per label plus an optional 'id'
# column, row-aligned with train_data — TODO confirm against trainLabels.csv.
y = train_labels.drop(columns=['id'], errors='ignore').to_numpy()

assert X_combined.shape[0] == y.shape[0], 'features and labels must have the same number of rows'

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_combined,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Initialize and train the model.
# y_train is a 2-D indicator matrix (one column per label), which a bare
# LogisticRegression rejects with "y should be a 1d array". Wrapping it in
# OneVsRestClassifier fits one independent binary logistic regression per
# label column; predict_proba then returns one probability column per label.
# The deprecated multi_class argument is dropped: every sub-problem is binary.
model = OneVsRestClassifier(LogisticRegression(max_iter=1000, solver='lbfgs'))
model.fit(X_train, y_train)

In [None]:
# Make predictions
# Assumes predict_proba returns an (n_samples, n_labels) array whose columns
# line up with the columns of y_val — TODO confirm for the fitted model.
y_pred = model.predict_proba(X_val)

# Compute ROC AUC score for each label.
# ROC AUC is undefined for a label whose validation column contains a single
# class (roc_auc_score raises or returns NaN depending on the scikit-learn
# version), so such labels are reported as NaN instead of aborting the cell.
auc_scores = np.array([
    roc_auc_score(y_val[:, k], y_pred[:, k]) if np.unique(y_val[:, k]).size == 2 else np.nan
    for k in range(y_val.shape[1])
])
print(f'ROC AUC scores: {auc_scores}')


In [None]:
# Preprocess the test data with the vectorizers fitted on the training text.
# The data preview shows only 'id' and x1..x145-style columns (no
# 'text_column'), so the hashed string columns are joined into one document
# per row. These must be the same columns the vectorizers were fitted on.
TEXT_COLUMNS = ['x3', 'x4']
texts_test = test_data[TEXT_COLUMNS].fillna('').astype(str).agg(' '.join, axis=1)
X_test_bow = bow_vectorizer.transform(texts_test)
X_test_tfidf = tfidf_vectorizer.transform(texts_test)
X_test_combined = hstack([X_test_bow, X_test_tfidf], format='csr')

# Make predictions
# Assumes an (n_samples, n_labels) probability array — TODO confirm.
test_pred = model.predict_proba(X_test_combined)

# Prepare the submission file in long format: one row per (sample, label)
# pair keyed '<id>_y<label number>'. Passing every pair name as a column of
# the (n_samples, n_labels) array cannot work, because the name count and the
# column count differ. ravel() walks test_pred row by row, which matches the
# sample-major order of the generated keys.
# NOTE(review): the 'id_label' / 'pred' header names are an assumption about
# the expected submission format — confirm against the sample submission.
n_labels = test_pred.shape[1]
# Prefer the real ids; fall back to 1-based row positions if there is no 'id'.
row_ids = test_data['id'] if 'id' in test_data.columns else range(1, len(test_data) + 1)
submission = pd.DataFrame({
    'id_label': [
        f'{row_id}_y{label_idx + 1}'
        for row_id in row_ids
        for label_idx in range(n_labels)
    ],
    'pred': test_pred.ravel(),
})
# NOTE(review): '/path/to/' is a placeholder directory; point this at a
# writable location before running.
submission.to_csv('/path/to/submission.csv', index=False)