In [44]:
# Import pandas library
import pandas as pd

# Read the CSV file
df = pd.read_csv('data.csv')

# Create binary columns directly from the comma-separated strings
# This avoids the explode operation
binary_columns = df['related_topics'].str.get_dummies(sep=',')

# Clean up column names by removing any whitespace
binary_columns.columns = binary_columns.columns.str.strip()

# Join the binary columns back to the original dataframe
df = df.join(binary_columns)

# Display the first few rows of the transformed dataset
print("First few rows of the transformed dataset:")
print(df.head())

# Display all column names to verify the new binary columns
print("\nAll columns in the dataset:")
print(df.columns.tolist())

First few rows of the transformed dataset:
   id                                           title  \
0   1                                         Two Sum   
1   2                                 Add Two Numbers   
2   3  Longest Substring Without Repeating Characters   
3   4                     Median of Two Sorted Arrays   
4   5                   Longest Palindromic Substring   

                                         description  is_premium difficulty  \
0  Given an array of integers `nums` and an integ...           0       Easy   
1  You are given two non-empty linked lists repre...           0     Medium   
2  Given a string `s`, find the length of the lon...           0     Medium   
3  Given two sorted arrays `nums1` and `nums2` of...           0       Hard   
4  Given a string `s`, return the longest palindr...           0     Medium   

                                       solution_link  acceptance_rate  \
0                                  /articles/two-sum             4

In [45]:
# Select specific columns and all binary columns (which start from index 19)
selected_columns = ['description', 'difficulty'] + df.columns[19:].tolist()

# Create new dataframe with only selected columns
df = df[selected_columns]

# Display the first few rows of the filtered dataset
print("First few rows of the filtered dataset:")
print(df.head())

# Display all column names to verify
print("\nAll columns in the dataset:")
print(df.columns.tolist())

print(df.to_csv('final.csv'))

First few rows of the filtered dataset:
                                         description difficulty  Array  \
0  Given an array of integers `nums` and an integ...       Easy      1   
1  You are given two non-empty linked lists repre...     Medium      0   
2  Given a string `s`, find the length of the lon...     Medium      0   
3  Given two sorted arrays `nums1` and `nums2` of...       Hard      1   
4  Given a string `s`, return the longest palindr...     Medium      0   

   Backtracking  Binary Indexed Tree  Binary Search  Binary Search Tree  \
0             0                    0              0                   0   
1             0                    0              0                   0   
2             0                    0              0                   0   
3             0                    0              1                   0   
4             0                    0              0                   0   

   Bit Manipulation  Brainteaser  Breadth-first Search  ...  Sli

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import numpy as np

# Prepare features (X) and labels (y)
# Get all label columns (everything except description, acceptance_rate, and difficulty)
label_columns = df.columns[2:].tolist()  # Assuming they start after the 3rd column
valid_columns = []
print(label_columns)
for col in label_columns:
    if (df[col].sum()) > 10:
        # print(len(df[col].unique()))
        valid_columns.append(col)

print(f"Original number of label columns: {len(label_columns)}")
print(f"Number of valid label columns: {len(valid_columns)}")

X = df[['description', 'difficulty']]
y = df[valid_columns]
    


# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# print(y_train)

# Create feature processing pipeline
# 1. Process text descriptions
text_features = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english'))
])

# 2. Process difficulty (convert to numeric)
le = LabelEncoder()
X_train['difficulty_encoded'] = le.fit_transform(X_train['difficulty'])
X_test['difficulty_encoded'] = le.transform(X_test['difficulty'])

# Create feature matrix
X_train_text = text_features.fit_transform(X_train['description'])
X_test_text = text_features.transform(X_test['description'])

# Combine text features with difficulty
X_train_combined = np.hstack((
    X_train_text.toarray(),
    X_train['difficulty_encoded'].values.reshape(-1, 1)
))
X_test_combined = np.hstack((
    X_test_text.toarray(),
    X_test['difficulty_encoded'].values.reshape(-1, 1)
))
# print(X_train_combined.shape, text_features.get_feature_names_out()[100:200])
# Create and train the multilabel classifier
classifier = MultiOutputClassifier(LogisticRegression(max_iter=1000))
classifier.fit(X_train_combined, y_train)

# Make predictions
y_pred = classifier.predict(X_test_combined)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=valid_columns))



['Array', 'Backtracking', 'Binary Indexed Tree', 'Binary Search', 'Binary Search Tree', 'Bit Manipulation', 'Brainteaser', 'Breadth-first Search', 'Depth-first Search', 'Dequeue', 'Design', 'Divide and Conquer', 'Dynamic Programming', 'Geometry', 'Graph', 'Greedy', 'Hash Table', 'Heap', 'Line Sweep', 'Linked List', 'Math', 'Meet in the Middle', 'Memoization', 'Minimax', 'OOP', 'Ordered Map', 'Queue', 'Random', 'Recursion', 'Rejection Sampling', 'Reservoir Sampling', 'Rolling Hash', 'Segment Tree', 'Sliding Window', 'Sort', 'Stack', 'String', 'Suffix Array', 'Topological Sort', 'Tree', 'Trie', 'Two Pointers', 'Union Find']
Original number of label columns: 43
Number of valid label columns: 27
Classification Report:
                      precision    recall  f1-score   support

               Array       0.57      0.13      0.21        62
        Backtracking       0.00      0.00      0.00        13
       Binary Search       0.00      0.00      0.00        23
    Bit Manipulation       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [86]:
# Function to predict topics for new problems

def predict_topics(description, difficulty, threshold=0.13):
    # Process the text
    text_features_new = text_features.transform([description])
    
    # Process difficulty
    difficulty_encoded = le.transform([difficulty])
    
    # Combine features
    X_new = np.hstack((
        text_features_new.toarray(),
        difficulty_encoded.reshape(-1, 1)
    ))
    
    # Get probability predictions
    pred_proba = classifier.predict_proba(X_new)
    # Convert probabilities to predictions using threshold
    pred = [(p[0][1] > threshold) for p in pred_proba]
    
    # Get predicted topics and their probabilities
    predicted_topics = []
    for label, p, should_include in zip(valid_columns, pred_proba, pred):
        if should_include:
            probability = p[0][1]  # probability of class 1
            predicted_topics.append((label, round(probability, 3)))
    
    # Sort by probability in descending order
    predicted_topics.sort(key=lambda x: x[1], reverse=True)
    return predicted_topics

In [87]:
# Example usage
# example_description = "Given an array of integers, return indices of the two numbers that add up to a specific target."
example_description = """You are given three integers n, m, k. A good array arr of size n is defined as follows:

Each element in arr is in the inclusive range [1, m].
Exactly k indices i (where 1 <= i < n) satisfy the condition arr[i - 1] == arr[i].
Return the number of good arrays that can be formed.

Since the answer may be very large, return it modulo 109 + 7.

 

Example 1:

Input: n = 3, m = 2, k = 1

Output: 4

Explanation:

There are 4 good arrays. They are [1, 1, 2], [1, 2, 2], [2, 1, 1] and [2, 2, 1].
Hence, the answer is 4.
Example 2:

Input: n = 4, m = 2, k = 2

Output: 6

Explanation:

The good arrays are [1, 1, 1, 2], [1, 1, 2, 2], [1, 2, 2, 2], [2, 1, 1, 1], [2, 2, 1, 1] and [2, 2, 2, 1].
Hence, the answer is 6.
Example 3:

Input: n = 5, m = 2, k = 0

Output: 2

Explanation:

The good arrays are [1, 2, 1, 2, 1] and [2, 1, 2, 1, 2]. Hence, the answer is 2."""
example_difficulty = "Hard"
predicted_topics = predict_topics(example_description, example_difficulty)
print("\nPredicted topics for example problem:")
print(predicted_topics)

[array([[0.7074814, 0.2925186]]), array([[0.98170221, 0.01829779]]), array([[0.90111911, 0.09888089]]), array([[0.96185826, 0.03814174]]), array([[0.98171589, 0.01828411]]), array([[0.97487441, 0.02512559]]), array([[0.9809914, 0.0190086]]), array([[0.99212051, 0.00787949]]), array([[0.84772279, 0.15227721]]), array([[0.98527798, 0.01472202]]), array([[0.92085807, 0.07914193]]), array([[0.86870444, 0.13129556]]), array([[0.9798005, 0.0201995]]), array([[0.98956263, 0.01043737]]), array([[0.86649775, 0.13350225]]), array([[0.99345086, 0.00654914]]), array([[0.99547218, 0.00452782]]), array([[0.98768656, 0.01231344]]), array([[0.98807559, 0.01192441]]), array([[0.98580686, 0.01419314]]), array([[0.9426145, 0.0573855]]), array([[0.96987605, 0.03012395]]), array([[0.95190972, 0.04809028]]), array([[0.97953173, 0.02046827]]), array([[0.99245499, 0.00754501]]), array([[0.94890768, 0.05109232]]), array([[0.98668574, 0.01331426]])]
[0.292518599997547, 0.01829779407171199, 0.09888089402104511, 