In [1]:
import urllib.request
import zipfile
import os

# Define the URL and destination path
url = "https://www.dropbox.com/s/5721wcs2guuykzl/stacksample.zip?dl=1"
zip_path = "stacksample.zip"
extract_path = "stacksample"

# Download the zip file
urllib.request.urlretrieve(url, zip_path)

# Extract the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Verify the extracted files
os.listdir(extract_path)

['Questions.csv', 'Answers.csv', 'Tags.csv']

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from scipy.sparse import csr_matrix

# Define file paths
questions_path = os.path.join(extract_path, 'Questions.csv')
tags_path = os.path.join(extract_path, 'Tags.csv')

# Load data
questions_df = pd.read_csv(questions_path, encoding='ISO-8859-1')
tags_df = pd.read_csv(tags_path, encoding='ISO-8859-1')

# Rename columns for merging
questions_df.rename(columns={'Id': 'question_id'}, inplace=True)
tags_df.rename(columns={'Id': 'question_id'}, inplace=True)

# Merge dataframes
data_df = pd.merge(questions_df, tags_df, on='question_id', how='left')

# Combine Title and Body into a single text column
data_df['Text'] = data_df['Title'].fillna('') + ' ' + data_df['Body'].fillna('')
data_df['Tag'] = data_df['Tag'].fillna('')
data_df['Tags'] = data_df['Tag'].str.split()

# Filter to include only top 10 most occurring tags
top_tags = data_df['Tags'].explode().value_counts().nlargest(10).index
data_df['Tags'] = data_df['Tags'].apply(lambda tags: [tag for tag in tags if tag in top_tags])
data_df = data_df[data_df['Tags'].map(len) > 0]

# Split into features and labels
X = data_df['Text']
y = data_df['Tags']

# Binarize tags
mlb = MultiLabelBinarizer()
y_binarized = mlb.fit_transform(y)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y_binarized, test_size=0.2, random_state=42)

# Initialize and fit Hashing Vectorizer
hash_vectorizer = HashingVectorizer(n_features=2000, alternate_sign=False)
X_train_hash = hash_vectorizer.transform(X_train)
X_val_hash = hash_vectorizer.transform(X_val)

# Ensure data is in sparse matrix format
X_train_sparse = csr_matrix(X_train_hash)
X_val_sparse = csr_matrix(X_val_hash)

# Initialize the Logistic Regression model with regularization
log_reg = LogisticRegression(solver='saga', penalty='l2', max_iter=100)

# Wrap it with MultiOutputClassifier to handle multi-label classification
multi_target_model = MultiOutputClassifier(log_reg, n_jobs=-1)

# Train the model
multi_target_model.fit(X_train_sparse, y_train)

# Predict on validation set
y_val_pred = multi_target_model.predict(X_val_sparse)

# Evaluate the performance
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred, target_names=mlb.classes_))


Validation Classification Report:
              precision    recall  f1-score   support

     android       0.82      0.65      0.73     18101
          c#       0.83      0.55      0.66     20459
         c++       0.87      0.60      0.71      9342
        html       0.61      0.23      0.34     11668
         ios       0.89      0.68      0.77      9401
        java       0.79      0.53      0.63     22817
  javascript       0.65      0.37      0.47     24900
      jquery       0.58      0.29      0.39     15796
         php       0.82      0.63      0.71     19778
      python       0.91      0.74      0.82     13086

   micro avg       0.79      0.52      0.63    165348
   macro avg       0.78      0.53      0.62    165348
weighted avg       0.77      0.52      0.62    165348
 samples avg       0.50      0.52      0.51    165348



  _warn_prf(average, modifier, msg_start, len(result))
