In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Source CSV of Stack Overflow questions; its first column is used as the row index.
STACKOVERFLOW_CSV_URL = 'https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/stackoverflow.csv'

df = pd.read_csv(
    STACKOVERFLOW_CSV_URL,
    index_col=0,
)
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"['sql', 'asp.net']"
4,adding scripting functionality to net applicat...,"['c#', '.net']"
5,should i use nested classes in this case i am ...,['c++']
6,homegrown consumption of web services i have b...,['.net']
8,automatically update version number i would li...,['c#']


### Data Preprocessing

In [None]:
import ast

def _parse_tags(raw_tags):
  """Turn a stringified tag list from the CSV into a real Python list.

  Values that are not strings (for example lists produced by an earlier run
  of this cell) are returned unchanged.
  """
  return ast.literal_eval(raw_tags) if isinstance(raw_tags, str) else raw_tags


# Guarded conversion: re-running this cell no longer raises, because values
# that were already parsed are passed through instead of being re-evaluated.
df['Tags'] = df['Tags'].apply(_parse_tags)

In [None]:
# Preview the frame after the Tags conversion.
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"[sql, asp.net]"
4,adding scripting functionality to net applicat...,"[c#, .net]"
5,should i use nested classes in this case i am ...,[c++]
6,homegrown consumption of web services i have b...,[.net]
8,automatically update version number i would li...,[c#]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Raw targets: the Tags column, one entry per question.
# NOTE: `y` is rebound to the binarized indicator matrix in a later cell.
y = df['Tags']
y

2          [sql, asp.net]
4              [c#, .net]
5                   [c++]
6                  [.net]
8                    [c#]
                ...      
1262668             [c++]
1262834             [c++]
1262915          [python]
1263065          [python]
1263454             [c++]
Name: Tags, Length: 48976, dtype: object

In [None]:
# Encoder that maps each question's tag collection to a fixed-width 0/1 vector.
multilabel = MultiLabelBinarizer()

In [None]:
# Learn the tag vocabulary and encode every question as a 0/1 indicator row.
y = multilabel.fit_transform(df['Tags'])
y

array([[0, 0, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Tag names learned by the binarizer; the next cell uses them as column labels for `y`.
multilabel.classes_

array(['.net', 'android', 'asp.net', 'c', 'c#', 'c++', 'css', 'html',
       'ios', 'iphone', 'java', 'javascript', 'jquery', 'mysql',
       'objective-c', 'php', 'python', 'ruby', 'ruby-on-rails', 'sql'],
      dtype=object)

In [None]:
 # Show the indicator matrix with one labelled column per tag.
 pd.DataFrame(y, columns=multilabel.classes_)

Unnamed: 0,.net,android,asp.net,c,c#,c++,css,html,ios,iphone,java,javascript,jquery,mysql,objective-c,php,python,ruby,ruby-on-rails,sql
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48971,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48972,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48973,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
48974,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [None]:
# Word-level TF-IDF over unigrams and bigrams, English stop words removed,
# vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    max_features=5000,
)
X = tfidf.fit_transform(df['Text'])

In [None]:
# Display the feature matrix summary.
X

<48976x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1790108 stored elements in Compressed Sparse Row format>

In [None]:
# Hold out 20% of the questions for evaluation; the seed is fixed so the split repeats.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
def j_score(y_true, y_pred, zero_division=0.0):
  """Mean per-sample Jaccard similarity between two indicator matrices, in percent.

  Args:
    y_true: 2-D array-like of 0/1 ground-truth labels (rows are samples).
    y_pred: 2-D array-like of 0/1 predicted labels, same layout as y_true.
    zero_division: Score given to a row in which neither y_true nor y_pred
      has any positive label (intersection and union are both empty).

  Returns:
    The row-wise Jaccard score averaged over all rows and multiplied by 100.
  """
  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)

  intersection = np.minimum(y_true, y_pred).sum(axis=1)
  union = np.maximum(y_true, y_pred).sum(axis=1)

  # Pre-fill with the fallback and divide only where the union is non-empty;
  # a plain division yields NaN for empty rows and turns the whole mean into NaN.
  jaccard = np.full(union.shape, float(zero_division))
  np.divide(intersection, union, out=jaccard, where=union != 0)
  return jaccard.mean() * 100


def print_score(y_pred, clf, y_true=None):
  """Print the estimator's class name and its Jaccard score.

  Args:
    y_pred: Predicted indicator matrix to evaluate.
    clf: Estimator whose class name is printed as the label.
    y_true: Ground-truth indicator matrix. When omitted, the notebook-level
      `y_test` is used, as before. Pass it explicitly whenever `y_test` may
      have been rebound by another split, so predictions are never scored
      against labels from a different split.
  """
  if y_true is None:
    y_true = y_test
  print('CLF:', clf.__class__.__name__)
  print('J Score:', j_score(y_true, y_pred))
  print('----')


In [None]:
# Candidate base estimators, each meant to be wrapped in OneVsRestClassifier.
sgd, lr, svc = (
    SGDClassifier(),
    LogisticRegression(solver='lbfgs'),
    LinearSVC(),
)

In [None]:
# L1-penalised linear SVM used as the per-tag base estimator.
classifier = LinearSVC(penalty='l1', dual=False, C=1.5)

# fit() returns the fitted wrapper, so construction and training can be chained.
clf = OneVsRestClassifier(classifier).fit(X_train, y_train)

y_pred = clf.predict(X_test)
print_score(y_pred, classifier)



CLF: LinearSVC
J Score: 62.22965155845923
----


In [None]:
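# Optional comparison of the three baseline estimators, left disabled.
# Uncomment to run; note that it rebinds `clf` and `y_pred`.
# for classifier in [sgd, lr, svc]:
#   clf = OneVsRestClassifier(classifier)
#   clf.fit(X_train, y_train)
#   y_pred = clf.predict(X_test)
#   print_score(y_pred, classifier)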



In [None]:
# Testing: vectorize one hand-written question with the already-fitted TF-IDF
# vectorizer (transform only, no refit).
x = ['how to group sql table data using 1 column and take mean']

xt = tfidf.transform(x)

In [None]:
# Raw prediction for the sample question.
clf.predict(xt)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [None]:
# Map the predicted indicator row back to tag names.
multilabel.inverse_transform(clf.predict(xt))

[('sql',)]

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.metrics import accuracy_score, hamming_loss, precision_score, recall_score, f1_score, jaccard_score

def complete_metrics_report(y_test, predictions):
  """Print a multi-label evaluation summary.

  Reports Hamming loss, subset accuracy, macro- and micro-averaged
  precision/recall/F1, Jaccard scores (macro, micro, samples) and the
  element-wise binary accuracy. Nothing is returned.

  Args:
    y_test: Ground-truth indicator matrix.
    predictions: Predicted indicator matrix, same layout as y_test.
  """
  prf_scorers = (precision_score, recall_score, f1_score)

  # Fraction of individual label slots that are wrong.
  hamming = hamming_loss(y_test, predictions)
  print(f"Hamming Loss: {hamming:.4f}")

  # Fraction of samples whose full label set is predicted exactly.
  subset_acc = accuracy_score(y_test, predictions)
  print(f"Subset Accuracy: {subset_acc:.4f}")

  # Macro averaging: every tag contributes equally.
  precision, recall, f1 = (
      scorer(y_test, predictions, average='macro') for scorer in prf_scorers
  )
  print(f"Precision (macro): {precision:.4f}")
  print(f"Recall (macro): {recall:.4f}")
  print(f"F1-Score (macro): {f1:.4f}")

  # Micro averaging: counts are pooled over all tags before scoring.
  precision_micro, recall_micro, f1_micro = (
      scorer(y_test, predictions, average='micro') for scorer in prf_scorers
  )
  print(f"Precision (micro): {precision_micro:.4f}")
  print(f"Recall (micro): {recall_micro:.4f}")
  print(f"F1-Score (micro): {f1_micro:.4f}")

  # Jaccard similarity under each of the three averaging schemes.
  jaccard_macro, jaccard_micro, jaccard_samples = (
      jaccard_score(y_test, predictions, average=scheme)
      for scheme in ('macro', 'micro', 'samples')
  )
  print(f"Jaccard Score (macro): {jaccard_macro:.4f}")
  print(f"Jaccard Score (micro): {jaccard_micro:.4f}")
  print(f"Jaccard Score (samples): {jaccard_samples:.4f}")

  # Share of individual label slots predicted correctly.
  slot_matches = np.equal(y_test, predictions).astype(int)
  binary_acc = np.mean(slot_matches)
  print(f"Binary Accuracy: {binary_acc:.4f}")




In [None]:
# Multi-label feed-forward network trained on dense TF-IDF features.

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling repeat from run to run.
tf.keras.utils.set_random_seed(42)

# Use a dedicated vectorizer name: rebinding `tfidf` here would replace the
# fitted vectorizer that the OneVsRest `clf` relies on to transform new text.
# float32 halves the footprint of the dense matrix.
# NOTE(review): the vectorizer is fitted on the full corpus before the split,
# so its IDF statistics include the test documents.
tfidf_nn = TfidfVectorizer(max_features=5000, dtype=np.float32)
X1 = tfidf_nn.fit_transform(df['Text']).toarray()

# Split the data
# NOTE: this rebinds X_train / X_test / y_train / y_test to a different split
# (random_state=42) than the one used for the linear models.
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=42)

# Define the neural network architecture only after the split, so the input
# and output widths come from the matrices the model is actually trained on.
model = Sequential()
model.add(Dense(512, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(y_train.shape[1], activation='sigmoid'))  # Multi-label classification

# Compile the model
# The bare string 'accuracy' can be resolved by Keras to an argmax-style
# (categorical) accuracy for a multi-unit output, which is not meaningful for
# multi-label targets. Request element-wise binary accuracy explicitly and
# keep the name 'accuracy' so the history/log keys do not change.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Make predictions
predictions = model.predict(X_test)
predictions = (predictions > 0.5).astype(int)  # Convert predictions to binary (0 or 1)


Epoch 1/20
[1m980/980[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7ms/step - accuracy: 0.3140 - loss: 0.2211 - val_accuracy: 0.6594 - val_loss: 0.0898
Epoch 2/20
[1m980/980[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7058 - loss: 0.0805 - val_accuracy: 0.6824 - val_loss: 0.0837
Epoch 3/20
[1m980/980[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7624 - loss: 0.0632 - val_accuracy: 0.6835 - val_loss: 0.0848
Epoch 4/20
[1m980/980[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7928 - loss: 0.0530 - val_accuracy: 0.6813 - val_loss: 0.0889
Epoch 5/20
[1m980/980[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8095 - loss: 0.0454 - val_accuracy: 0.6822 - val_loss: 0.0941
Epoch 6/20
[1m980/980[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8366 - loss: 0.0376 - val_accuracy: 0.6801 - val_loss: 0.0998
Epoch 7/20
[1m980/980[0m 

In [None]:
# Print the multi-label metrics for the thresholded network predictions.
complete_metrics_report(y_test, predictions)

Hamming Loss: 0.0326
Subset Accuracy: 0.5594
Precision (macro): 0.7364
Recall (macro): 0.6522
F1-Score (macro): 0.6904
Precision (micro): 0.7610
Recall (micro): 0.6909
F1-Score (micro): 0.7242
Jaccard Score (macro): 0.5431
Jaccard Score (micro): 0.5677
Jaccard Score (samples): 0.6702
Binary Accuracy: 0.9674
