<a href="https://colab.research.google.com/github/Radha19-sriram/AIQOD-ASSIGNMENT-TASK/blob/main/aiqod.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

Load and Inspect the Data

In [None]:
# Read the three CSV inputs: training features, training labels, test features.
train_data = pd.read_csv('/content/train.csv')
y_train = pd.read_csv('/content/trainLabels.csv')
test_data = pd.read_csv('/content/test.csv')

# Report the raw shapes, before any column is moved into the index.
print("Shape of train_data:", train_data.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of test_data:", test_data.shape)


def _index_by_id(frame):
    """Return ``frame`` indexed by its 'id' column, or unchanged if it has none."""
    if 'id' not in frame.columns:
        return frame
    return frame.set_index('id')


# Use the 'id' column (when present) as the row index of each table.
train_data = _index_by_id(train_data)
y_train = _index_by_id(y_train)
test_data = _index_by_id(test_data)


Shape of train_data: (9999, 146)
Shape of y_train: (49999, 34)
Shape of test_data: (1999, 146)


Data Preprocessing

In [None]:
def _combine_columns_as_text(frame):
    """Join every non-null cell of each row into one space-separated string.

    A pre-existing 'combined_text' column is left out of the join, so that
    re-running this cell does not fold the previous result back into the text.

    Parameters
    ----------
    frame : pd.DataFrame
        Table whose cells are stringified and concatenated row by row.

    Returns
    -------
    pd.Series
        One string per row, indexed like ``frame``.
    """
    source_columns = [col for col in frame.columns if col != 'combined_text']
    return frame[source_columns].apply(
        lambda row: ' '.join(row.dropna().astype(str)), axis=1
    )


# Combine text columns for both train and test data
train_data['combined_text'] = _combine_columns_as_text(train_data)
test_data['combined_text'] = _combine_columns_as_text(test_data)

# ' '.join(...) always produces a string (possibly empty), never NaN, so there
# is nothing to filter out; assert the invariant instead of filtering.
assert train_data['combined_text'].notna().all()
assert test_data['combined_text'].notna().all()

# Align X_train and y_train by index
# NOTE(review): .loc selects label rows by index *value*, so this is only a
# real alignment if both tables are indexed by the same ids. The recorded
# output shows test_data going from 146 to 147 columns, which suggests 'id'
# may not have been moved into the index of the feature tables - confirm.
X_train = train_data['combined_text']
y_train = y_train.loc[train_data.index]

# Convert all labels to numeric type
# (values that cannot be parsed as numbers, and missing values, become 0)
y_train = y_train.apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

# Check for label columns with only one unique value; a classifier cannot be
# fitted on a target that has a single class.
single_class_columns = [col for col in y_train.columns if y_train[col].nunique() == 1]
print(f"Columns with a single class: {single_class_columns}")

# Remove these columns from y_train
y_train = y_train.drop(columns=single_class_columns)

# Check the shape after preprocessing
print("Shape of X_train after preprocessing:", X_train.shape)
print("Shape of y_train after preprocessing:", y_train.shape)
print("Shape of test_data after preprocessing:", test_data.shape)


Columns with a single class: ['y14']
Shape of X_train after preprocessing: (9999,)
Shape of y_train after preprocessing: (9999, 32)
Shape of test_data after preprocessing: (1999, 147)


Feature Extraction

In [None]:
# Upper bound on the TF-IDF vocabulary size (TfidfVectorizer's max_features).
MAX_TFIDF_FEATURES = 10000

# Fit the vocabulary and IDF weights on the training text only, then reuse the
# fitted vectorizer to transform the test text into the same feature space.
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(test_data['combined_text'])

# Ensure shapes match
print("Shape of X_train_tfidf:", X_train_tfidf.shape)
print("Shape of y_train:", y_train.shape)

# Enforce (not just print) that every training row has a label row.
assert X_train_tfidf.shape[0] == y_train.shape[0], (
    "X_train_tfidf and y_train must have the same number of rows"
)

Shape of X_train_tfidf: (9999, 10000)
Shape of y_train: (9999, 32)


Model Training

In [None]:
# Logistic regression is the per-label base estimator; MultiOutputClassifier
# wraps it so that the model is fitted against every column of y_train.
base_classifier = LogisticRegression(max_iter=1000)
model = MultiOutputClassifier(base_classifier)
model.fit(X_train_tfidf, y_train)


Prediction

In [None]:
# Class probabilities for the test set. For a MultiOutputClassifier,
# predict_proba returns a list with one (n_samples, n_classes) array per label
# column; the class columns follow each fitted estimator's classes_ order.
y_test_pred = model.predict_proba(X_test_tfidf)

Submission Preparation

In [None]:
import os

# Names of the label columns the model was trained on, in estimator order.
label_names = list(y_train.columns)
assert len(y_test_pred) == len(label_names), (
    "expected one probability array per trained label column"
)

# positive_proba[i, j] = P(label j == 1) for test row i.
# y_test_pred[j] has one column per class, so the column belonging to class 1
# is looked up through the fitted estimator's classes_ rather than assumed.
positive_proba = np.zeros((len(test_data), len(label_names)))
for j, estimator in enumerate(model.estimators_):
    positive_col = np.flatnonzero(estimator.classes_ == 1)
    if positive_col.size:
        positive_proba[:, j] = y_test_pred[j][:, positive_col[0]]

# One row per (test id, label): "<id>_<label name>" and the probability that
# the label is 1. The label name comes from the training label columns, not
# from a position along the class axis.
# NOTE: label columns dropped during preprocessing because they had a single
# class (single_class_columns) were never modelled and therefore have no rows
# here; add them with their constant value if the submission format requires
# every label.
submission = [
    [f"{row_id}_{label}", positive_proba[i, j]]
    for i, row_id in enumerate(test_data.index)
    for j, label in enumerate(label_names)
]

# Convert to DataFrame
submission_df = pd.DataFrame(submission, columns=["id_label", "probability"])

# Create the 'output' directory if it doesn't exist
os.makedirs('output', exist_ok=True)

# Save to CSV in the 'output' directory
submission_df.to_csv('output/submission.csv', index=False)

print("Submission file created successfully!")


Submission file created successfully!
