In [107]:
import pandas as pd
pd.options.display.max_columns=200
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.pipeline import make_pipeline
import bs4 as bs
import nltk
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize, wordpunct_tokenize, RegexpTokenizer
import re
import spacy
from nltk.corpus import stopwords
import ast
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [108]:
data = pd.read_csv('data/QueryResults.csv')

In [109]:
#delete balises html with beautiful soup
def clean_html(text):
    soup = bs.BeautifulSoup(text, "html.parser")
    return soup.get_text()

data['Body'] = data['Body'].apply(clean_html)

In [110]:
data = data[['Title', 'Body', 'Tags', 'Id']]

In [111]:
data.head()


Unnamed: 0,Title,Body,Tags,Id
0,how do I calculate a rolling idxmax,consider the pd.Series s\nimport pandas as pd\...,<python><pandas><numpy><dataframe><series>,40101130
1,Better techniques for trimming leading zeros i...,I've been using this for some time:\nSUBSTRING...,<sql><sql-server><sql-server-2005><tsql><string>,662383
2,"""No X11 DISPLAY variable"" - what does it mean?",I am trying to install a Java application on m...,<java><linux><variables><x11><headless>,662421
3,Object-Oriented Callbacks for C++?,Is there some library that allows me to easily...,<c++><oop><callback><pointer-to-member><eiffel>,3520133
4,Why doesn't .NET find the OpenSSL.NET dll?,"EDIT (the whole question, it was too unclear)\...",<c#><.net><dll><dllimport><dllnotfoundexception>,1396164


In [112]:
import re

# Suppression des chevrons et extraction des tags
data['Tags'] = data['Tags'].apply(lambda x: re.findall(r'<([^>]+)>', x))

data.head()

Unnamed: 0,Title,Body,Tags,Id
0,how do I calculate a rolling idxmax,consider the pd.Series s\nimport pandas as pd\...,"[python, pandas, numpy, dataframe, series]",40101130
1,Better techniques for trimming leading zeros i...,I've been using this for some time:\nSUBSTRING...,"[sql, sql-server, sql-server-2005, tsql, string]",662383
2,"""No X11 DISPLAY variable"" - what does it mean?",I am trying to install a Java application on m...,"[java, linux, variables, x11, headless]",662421
3,Object-Oriented Callbacks for C++?,Is there some library that allows me to easily...,"[c++, oop, callback, pointer-to-member, eiffel]",3520133
4,Why doesn't .NET find the OpenSSL.NET dll?,"EDIT (the whole question, it was too unclear)\...","[c#, .net, dll, dllimport, dllnotfoundexception]",1396164


In [113]:
data['Tags'].head()

0          [python, pandas, numpy, dataframe, series]
1    [sql, sql-server, sql-server-2005, tsql, string]
2             [java, linux, variables, x11, headless]
3     [c++, oop, callback, pointer-to-member, eiffel]
4    [c#, .net, dll, dllimport, dllnotfoundexception]
Name: Tags, dtype: object

In [114]:
data.head()

Unnamed: 0,Title,Body,Tags,Id
0,how do I calculate a rolling idxmax,consider the pd.Series s\nimport pandas as pd\...,"[python, pandas, numpy, dataframe, series]",40101130
1,Better techniques for trimming leading zeros i...,I've been using this for some time:\nSUBSTRING...,"[sql, sql-server, sql-server-2005, tsql, string]",662383
2,"""No X11 DISPLAY variable"" - what does it mean?",I am trying to install a Java application on m...,"[java, linux, variables, x11, headless]",662421
3,Object-Oriented Callbacks for C++?,Is there some library that allows me to easily...,"[c++, oop, callback, pointer-to-member, eiffel]",3520133
4,Why doesn't .NET find the OpenSSL.NET dll?,"EDIT (the whole question, it was too unclear)\...","[c#, .net, dll, dllimport, dllnotfoundexception]",1396164


In [115]:
data['Text'] = data['Title'] + ' ' + data['Body']

In [116]:
from collections import Counter

# Compter les tags
tags_counter = Counter()
for tags in data['Tags']:
    tags_counter.update(tags)

# Nombre de tags différents
print("Nombre de tags différents :", len(tags_counter))

# Obtenir les 100 tags les plus fréquents
top_50_tags = tags_counter.most_common(50)
print("\nTop 50 tags les plus fréquents :")
for tag, count in top_50_tags:
    print(f"{tag}: {count}")

Nombre de tags différents : 18315

Top 50 tags les plus fréquents :
c#: 6464
java: 5846
javascript: 4876
python: 4699
c++: 3868
.net: 3488
ios: 3473
android: 3187
html: 2154
php: 2005
objective-c: 1890
jquery: 1785
iphone: 1705
c: 1592
asp.net: 1345
sql: 1337
css: 1303
linux: 1267
node.js: 1142
spring: 1051
performance: 1049
swift: 1026
windows: 1023
ruby-on-rails: 961
xcode: 931
json: 913
mysql: 911
sql-server: 887
multithreading: 843
asp.net-mvc: 805
unit-testing: 787
database: 785
ruby: 781
arrays: 771
django: 770
wpf: 760
macos: 738
visual-studio: 732
c++11: 699
reactjs: 695
algorithm: 692
string: 673
python-3.x: 592
xml: 545
ajax: 535
cocoa-touch: 518
gcc: 508
security: 503
angular: 496
eclipse: 487


In [117]:
top_50_tags_set = set(tag for tag, _ in top_50_tags)

# Fonction pour ne garder que les tags du top 100
def keep_top_50_tags(tags):
    return [tag for tag in tags if tag in top_50_tags_set]

# Appliquer la fonction 
data_filtered = data.copy()
data_filtered['Tags'] = data_filtered['Tags'].apply(keep_top_50_tags)

# Filtrer les données pour ne garder que les questions ayant au moins un tag dans le top 100
data_filtered = data_filtered[data_filtered['Tags'].apply(lambda x: len(x) > 0)].reset_index(drop=True)

# Afficher le nombre de questions restantes
print("Nombre de questions restantes :", len(data_filtered))

Nombre de questions restantes : 43783


In [118]:
import pandas as pd

# Compter le nombre de tokens dans chaque texte
data_filtered['Token_Count'] = data_filtered['Text'].apply(lambda x: len(x.split()))

In [119]:
data_filtered['Token_Count'].describe()

count    43783.000000
mean       189.524176
std        184.684547
min          8.000000
25%         87.000000
50%        140.000000
75%        230.000000
max      10740.000000
Name: Token_Count, dtype: float64

In [120]:
data_filtered.loc[data_filtered['Token_Count'] > 5000, 'Token_Count']

3160    10740
Name: Token_Count, dtype: int64

In [121]:
data_filtered.drop(data_filtered.loc[data_filtered['Token_Count'] > 5000].index, inplace=True)
data_filtered.drop(data_filtered.loc[data_filtered['Body'].str.len() > 10000].index, inplace=True)

In [122]:
#Split to get 5000 questions
data_filtered = data_filtered[:5000]

In [123]:
data_filtered.reset_index(drop=True, inplace=True)

In [124]:
data_filtered

Unnamed: 0,Title,Body,Tags,Id,Text,Token_Count
0,how do I calculate a rolling idxmax,consider the pd.Series s\nimport pandas as pd\...,[python],40101130,how do I calculate a rolling idxmax consider t...,147
1,Better techniques for trimming leading zeros i...,I've been using this for some time:\nSUBSTRING...,"[sql, sql-server, string]",662383,Better techniques for trimming leading zeros i...,103
2,"""No X11 DISPLAY variable"" - what does it mean?",I am trying to install a Java application on m...,"[java, linux]",662421,"""No X11 DISPLAY variable"" - what does it mean?...",166
3,Object-Oriented Callbacks for C++?,Is there some library that allows me to easily...,[c++],3520133,Object-Oriented Callbacks for C++? Is there so...,206
4,Why doesn't .NET find the OpenSSL.NET dll?,"EDIT (the whole question, it was too unclear)\...","[c#, .net]",1396164,Why doesn't .NET find the OpenSSL.NET dll? EDI...,151
...,...,...,...,...,...,...
4995,Visual studio 2013 + .Net 4.5.1 + Edit and con...,Supposedly vs 2013 added support for edit and ...,"[.net, visual-studio]",20324080,Visual studio 2013 + .Net 4.5.1 + Edit and con...,195
4996,How to set a default parameter for a vector <s...,"For example, a class named Table, with its con...","[c++, string]",1854241,How to set a default parameter for a vector <s...,50
4997,Fast sine/cosine for ARMv7+NEON: looking for t...,Could somebody with access to an iPhone 3GS or...,[performance],1854254,Fast sine/cosine for ARMv7+NEON: looking for t...,264
4998,Quicksort superiority over Heap Sort,Heap Sort has a worst case complexity of O(nlo...,[algorithm],1853208,Quicksort superiority over Heap Sort Heap Sort...,28


In [125]:
data_filtered.shape

(5000, 6)

In [126]:
def tokenizer_fct(sentence) :
    # print(sentence)
    sentence_clean = sentence.replace('-', ' ').replace('+', ' ').replace('/', ' ').replace('#', ' ')
    word_tokens = word_tokenize(sentence_clean)
    return word_tokens

# Stop words
stop_w = list(set(stopwords.words('english'))) + ['[', ']', ',', '.', ':', '?', '(', ')', "'", '"', '!', ';', '``', "''", '...', '’', '“', '”']

def stop_word_filter_fct(list_words) :
    filtered_w = [w for w in list_words if not w in stop_w]
    filtered_w2 = [w for w in filtered_w if len(w) > 2]
    return filtered_w2

# lower case et alpha
def lower_start_fct(list_words) :
    lw = [w.lower() for w in list_words if (not w.startswith("@")) 
                                       and (not w.startswith("#"))
                                       and (not w.startswith("http"))]
    return lw

# Lemmatizer (base d'un mot)
def lemma_fct(list_words) :
    lemmatizer = WordNetLemmatizer()
    lem_w = [lemmatizer.lemmatize(w) for w in list_words]
    return lem_w

# Fonction de préparation du texte pour le bag of words avec lemmatization
def transform_bow_lem_fct(desc_text) :
    word_tokens = tokenizer_fct(desc_text)
    sw = stop_word_filter_fct(word_tokens)
    lw = lower_start_fct(sw)
    lem_w = lemma_fct(lw)    
    transf_desc_text = ' '.join(lem_w)
    return transf_desc_text

# Fonction de préparation du texte pour le Deep learning (USE et BERT)
def transform_dl_fct(desc_text) :
    word_tokens = tokenizer_fct(desc_text)
    #sw = stop_word_filter_fct(word_tokens)
    lw = lower_start_fct(word_tokens)
    # lem_w = lemma_fct(lw)    
    transf_desc_text = ' '.join(lw)
    return transf_desc_text

In [127]:
X = data_filtered['Title'].apply(transform_bow_lem_fct)
Y = data_filtered['Title'].apply(transform_dl_fct)

In [128]:
data_bow = pd.DataFrame(X)
data_bow['Body'] = data_filtered['Body'].apply(transform_bow_lem_fct)
data_bow['Text'] = data_filtered['Text'].apply(transform_bow_lem_fct)
data_bow['Tags'] = data_filtered['Tags']

In [129]:
data_se = pd.DataFrame(Y)
data_se['Body'] = data_filtered['Body'].apply(transform_dl_fct)
data_se['Text'] = data_filtered['Text'].apply(transform_dl_fct)
data_se['Tags'] = data_filtered['Tags']
data_se['Token_Count'] = data_filtered['Token_Count']

In [130]:
data_bow.head()

Unnamed: 0,Title,Body,Text,Tags
0,calculate rolling idxmax,consider pd.series import panda import numpy n...,calculate rolling idxmax consider pd.series im...,[python]
1,better technique trimming leading zero sql server,'ve using time substring str_col patindex str_...,better technique trimming leading zero sql ser...,"[sql, sql-server, string]"
2,x11 display variable mean,trying install java application linux machine ...,x11 display variable mean trying install java ...,"[java, linux]"
3,object oriented callback,library allows easily conveniently create obje...,object oriented callback library allows easily...,[c++]
4,why n't .net find openssl.net dll,edit whole question unclear want use openssl.n...,why n't .net find openssl.net dll edit whole q...,"[c#, .net]"


In [131]:
# Split train test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_bow[['Title','Text']], data_bow['Tags'], test_size=0.2, random_state=0)
X_train_se, X_test_se, y_train_se, y_test_se = train_test_split(data_se[['Text','Body']], data_se['Tags'], test_size=0.2, random_state=0)

In [132]:
X_train_se

Unnamed: 0,Text,Body
2913,using visual studio code tasks to automate c m...,i have a project written in c that has two mak...
3275,how to automatically append text to text copie...,"in javascript , how can you select text on a w..."
775,get list of apps of all users if i want to ret...,if i want to retrieve an applicationinfo list ...
217,"if two languages follow ieee 754 , will calcul...",i 'm in the process of converting a program fr...
1245,sqlalchemy : how to make an integer column aut...,i am using flask extension for sqlalchemy to d...
...,...,...
4931,"wpf : how to detect key repetition , in key * ...",note : e.isrepeat is confirmed to work . the p...
3264,what is the most compatible way to install pyt...,i 'm starting to learn python and loving it . ...
1653,junit5 : how to assert several properties of a...,i want to assert several properties of an obje...
2607,browser caching in asp.net application any sug...,any suggestions on how to do browser caching w...


In [133]:
X_train

Unnamed: 0,Title,Text
2913,using visual studio code task automate makefil...,using visual studio code task automate makefil...
3275,how automatically append text text copied java...,how automatically append text text copied java...
775,get list apps user,get list apps user want retrieve applicationin...
217,two language follow ieee 754 calculation langu...,two language follow ieee 754 calculation langu...
1245,sqlalchemy how make integer column auto_increm...,sqlalchemy how make integer column auto_increm...
...,...,...
4931,wpf how detect key repetition key event,wpf how detect key repetition key event note e...
3264,what compatible way install python module mac,what compatible way install python module mac ...
1653,junit5 how assert several property object sing...,junit5 how assert several property object sing...
2607,browser caching asp.net application,browser caching asp.net application any sugges...


In [134]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y_train_encoded = mlb.fit_transform(y_train)
y_test_encoded = mlb.transform(y_test)

In [135]:
mlb.classes_.shape

(50,)

In [136]:
from sklearn.feature_extraction.text import TfidfVectorizer

ctf = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=2)
ctf_fit = ctf.fit(X_train['Title'])
ctf_text_train = ctf.transform(X_train['Text'])
ctf_text_test = ctf.transform(X_test['Text'])

In [137]:
len(ctf_fit.vocabulary_)

2267

In [138]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [139]:
ovr = OneVsRestClassifier(LogisticRegression(), n_jobs=-3)
ovr.fit(ctf_text_train, y_train_encoded)
y_pred_test_ctf = ovr.predict(ctf_text_test)

In [140]:
from sklearn.metrics import jaccard_score

In [141]:
score_ctf = jaccard_score(y_test_encoded, y_pred_test_ctf, average='weighted')

print("Jaccard Score ctf:", score_ctf)

Jaccard Score ctf: 0.14732754533162903


In [142]:
"""from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline

# Créer un classificateur OvR avec une régression logistique
ovr = OneVsRestClassifier(LogisticRegression())

# Créer un pipeline avec TfidfVectorizer et OneVsRestClassifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', ovr)
])

# Définir les paramètres de la grille à optimiser
param_grid = [
    {
        'tfidf__max_df': [0.9, 1.0],
        'tfidf__min_df': [1, 2],
        'tfidf__stop_words': [None, 'english'],
        'clf__estimator__C': [0.1, 1, 10],
        'clf__estimator__penalty': ['l1'],
        'clf__estimator__solver': ['liblinear', 'saga'],
    },
    {
        'tfidf__max_df': [0.9, 1.0],
        'tfidf__min_df': [1, 2],
        'tfidf__stop_words': [None, 'english'],
        'clf__estimator__C': [0.1, 1, 10],
        'clf__estimator__penalty': ['l2'],
        'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag', 'saga', 'liblinear'],
    },
]

# Utiliser le score de Jaccard pondéré pour l'évaluation
jaccard_scorer = make_scorer(jaccard_score, average='weighted')

# Créer l'objet GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, scoring=jaccard_scorer, cv=5, n_jobs=-3)

# Ajuster la recherche par grille aux données
grid_search.fit(X_train['Text'], y_train_encoded)

# Afficher les meilleurs paramètres
print("Meilleurs paramètres trouvés:")
print(grid_search.best_params_)"""

'from sklearn.model_selection import GridSearchCV\nfrom sklearn.metrics import make_scorer\nfrom sklearn.pipeline import Pipeline\n\n# Créer un classificateur OvR avec une régression logistique\novr = OneVsRestClassifier(LogisticRegression())\n\n# Créer un pipeline avec TfidfVectorizer et OneVsRestClassifier\npipeline = Pipeline([\n    (\'tfidf\', TfidfVectorizer()),\n    (\'clf\', ovr)\n])\n\n# Définir les paramètres de la grille à optimiser\nparam_grid = [\n    {\n        \'tfidf__max_df\': [0.9, 1.0],\n        \'tfidf__min_df\': [1, 2],\n        \'tfidf__stop_words\': [None, \'english\'],\n        \'clf__estimator__C\': [0.1, 1, 10],\n        \'clf__estimator__penalty\': [\'l1\'],\n        \'clf__estimator__solver\': [\'liblinear\', \'saga\'],\n    },\n    {\n        \'tfidf__max_df\': [0.9, 1.0],\n        \'tfidf__min_df\': [1, 2],\n        \'tfidf__stop_words\': [None, \'english\'],\n        \'clf__estimator__C\': [0.1, 1, 10],\n        \'clf__estimator__penalty\': [\'l2\'],\n    

best_params = {'clf__estimator__C': 10, 'clf__estimator__penalty': 'l1', 'clf__estimator__solver': 'liblinear', 'tfidf__max_df': 0.9, 'tfidf__min_df': 1, 'tfidf__stop_words': None}

In [143]:
# Appliquer les meilleurs paramètres pour TfidfVectorizer
tfidf = TfidfVectorizer(max_df=1.0, min_df=1, stop_words=None)

# Transformer les données d'entraînement et de test
X_train_tfidf = tfidf.fit_transform(X_train['Text'])
X_test_tfidf = tfidf.transform(X_test['Text'])

In [144]:
# Créer le classificateur OvR en utilisant les meilleurs paramètres
ovr_best = OneVsRestClassifier(LogisticRegression(C=10, penalty='l1', solver='liblinear'))

# Entraîner le classificateur avec les données d'entraînement transformées
ovr_best.fit(X_train_tfidf, y_train_encoded)

OneVsRestClassifier(estimator=LogisticRegression(C=10, penalty='l1',
                                                 solver='liblinear'))

In [145]:
jaccard_score(y_test_encoded, ovr_best.predict(X_test_tfidf), average='weighted')

0.422577021901656

In [146]:
import random
from termcolor import colored


# Utiliser predict_proba pour obtenir les probabilités des tags
y_pred_proba = ovr_best.predict_proba(X_test_tfidf)

# Obtenir les vrais tags pour les données de test
y_test_tags = mlb.inverse_transform(y_test_encoded)

# Appliquer un seuil pour les probabilités
threshold = 0.1

# Afficher les prédictions et les vrais tags pour 10 exemples pris au hasard
num_examples = 10
random_examples = random.sample(range(len(X_test)), num_examples)

for i in random_examples:
    print(f"Exemple {i + 1}:")
    print(f"Texte  : {X_test.iloc[i]['Text']}")
    print(f"Vrais tags  : {' | '.join(y_test_tags[i])}")

    # Extraire les tags prédits au-dessus du seuil avec leurs probabilités
    predicted_tags_with_probs = [(mlb.classes_[j], prob) for j, prob in enumerate(y_pred_proba[i]) if prob > threshold]
    
    # Ajouter les tags réels qui ne sont pas déjà dans les tags prédits
    for tag in y_test_tags[i]:
        if tag not in [t[0] for t in predicted_tags_with_probs]:
            prob = y_pred_proba[i][mlb.classes_.tolist().index(tag)]
            predicted_tags_with_probs.append((tag, prob))

    # Trier les tags prédits et réels par popularité et les afficher avec leurs probabilités
    sorted_tags_with_probs = sorted(predicted_tags_with_probs, key=lambda x: x[1], reverse=True)
    
    colored_tags = []
    for tag, prob in sorted_tags_with_probs:
        if tag in y_test_tags[i] and prob > threshold:
            colored_tags.append(colored(f"{tag} ({prob:.3f})", 'green'))
        elif tag in y_test_tags[i]:
            colored_tags.append(colored(f"{tag} ({prob:.3f})", 'red'))
        else:
            colored_tags.append(colored(f"{tag} ({prob:.3f})", 'yellow'))
    
    sorted_tags_with_probs_str = ' | '.join(colored_tags)
    
    print(f"Tags prédits et réels  : {sorted_tags_with_probs_str}")
    print("-" * 50)

Exemple 469:
Texte  : button inside anchor link work firefox internet explorer everything else site seems compatible browser except link they appear page work code link follows bgcolor= ffffff height= 370 valign= top width= 165 href= sc3.html button style= width:120 height:25 super chem button href= 91hollywood.html button style= width:120 height:25 hollywood button href= sbubba.html button style= width:120 height:25 super bubba button href= afgoohash.html button style= width:120 height:25 afgoo hash button href= superjack.html button style= width:120 height:25 super jack button href= sog.html button style= width:120 height:25 sugar button href= 91pk91.html button style= width:120 height:25 button href= jedi1.html button style= width:120 height:25 jedi button nbsp href= indynile99.blogspot.com button style= width:120 height:25 blog button nbsp
Vrais tags  : html
Tags prédits et réels  : [32mhtml (0.969)[0m | [33mjavascript (0.717)[0m | [33mcss (0.384)[0m | [33mjquery (0.103)[0m

Partie 2 : GPT-4

In [147]:
import openai
from openai.embeddings_utils import get_embedding

In [148]:
openai.api_key="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"
openai.organization = "org-NDvcRsf1UQXO6s4mUTH3miiA"

In [149]:
texte = "Exemple de texte pour lequel on souhaite obtenir un embedding"
len(get_embedding(texte))

12288

In [150]:
data_se.loc[data_se['Body'].str.len() > 10000]

Unnamed: 0,Title,Body,Text,Tags,Token_Count


In [151]:
data_se["Token_Count"].sum()

860951

In [152]:
data_se.loc[data_se["Token_Count"] > 5000]

Unnamed: 0,Title,Body,Text,Tags,Token_Count


In [153]:
X_train_se

Unnamed: 0,Text,Body
2913,using visual studio code tasks to automate c m...,i have a project written in c that has two mak...
3275,how to automatically append text to text copie...,"in javascript , how can you select text on a w..."
775,get list of apps of all users if i want to ret...,if i want to retrieve an applicationinfo list ...
217,"if two languages follow ieee 754 , will calcul...",i 'm in the process of converting a program fr...
1245,sqlalchemy : how to make an integer column aut...,i am using flask extension for sqlalchemy to d...
...,...,...
4931,"wpf : how to detect key repetition , in key * ...",note : e.isrepeat is confirmed to work . the p...
3264,what is the most compatible way to install pyt...,i 'm starting to learn python and loving it . ...
1653,junit5 : how to assert several properties of a...,i want to assert several properties of an obje...
2607,browser caching in asp.net application any sug...,any suggestions on how to do browser caching w...


In [154]:
data_se.iloc[0]["Text"]

"how do i calculate a rolling idxmax consider the pd.series s import pandas as pd import numpy as np np.random.seed ( [ 3,1415 ] ) s = pd.series ( np.random.randint ( 0 , 10 , 10 ) , list ( 'abcdefghij ' ) ) s a 0 b 2 c 7 d 3 e 8 f 7 g 0 h 6 i 8 j 6 dtype : int64 i want to get the index for the max value for the rolling window of 3 s.rolling ( 3 ) .max ( ) a nan b nan c 7.0 d 7.0 e 8.0 f 8.0 g 8.0 h 7.0 i 8.0 j 8.0 dtype : float64 what i want is a none b none c c d c e e f e g e h f i i j i dtype : object what i 've done s.rolling ( 3 ) .apply ( np.argmax ) a nan b nan c 2.0 d 1.0 e 2.0 f 1.0 g 0.0 h 0.0 i 2.0 j 1.0 dtype : float64 which is obviously not what i want"

In [155]:
"""X_train_se['Embedding_body'] = None
embedding_model = "text-embedding-ada-002"
for i in range(len(X_train_se)):
    embedding = get_embedding(X_train_se.iloc[i]["Body"], engine=embedding_model)
    X_train_se.iat[i, X_train_se.columns.get_loc("Embedding_body")] = embedding"""

'X_train_se[\'Embedding_body\'] = None\nembedding_model = "text-embedding-ada-002"\nfor i in range(len(X_train_se)):\n    embedding = get_embedding(X_train_se.iloc[i]["Body"], engine=embedding_model)\n    X_train_se.iat[i, X_train_se.columns.get_loc("Embedding_body")] = embedding'

In [156]:
"""X_test_se['Embedding_body'] = None
embedding_model = "text-embedding-ada-002"
for i in range(len(X_test_se)):
    embedding = get_embedding(X_test_se.iloc[i]["Body"], engine=embedding_model)
    X_test_se.iat[i, X_test_se.columns.get_loc("Embedding_body")] = embedding"""

'X_test_se[\'Embedding_body\'] = None\nembedding_model = "text-embedding-ada-002"\nfor i in range(len(X_test_se)):\n    embedding = get_embedding(X_test_se.iloc[i]["Body"], engine=embedding_model)\n    X_test_se.iat[i, X_test_se.columns.get_loc("Embedding_body")] = embedding'

In [157]:
"""#Save X_test_se and X_train_se
X_test_se.to_csv("data/X_test_se.csv", index=False)
X_train_se.to_csv("data/X_train_se.csv", index=False)"""

'#Save X_test_se and X_train_se\nX_test_se.to_csv("data/X_test_se.csv", index=False)\nX_train_se.to_csv("data/X_train_se.csv", index=False)'

In [158]:
#Load X_test_se and X_train_se
X_test_se = pd.read_csv("data/X_test_se.csv")
X_train_se = pd.read_csv("data/X_train_se.csv")

In [159]:
X_train_se

Unnamed: 0,Text,Body,Embedding_body
0,using visual studio code tasks to automate c m...,i have a project written in c that has two mak...,"[-0.031667113304138184, 0.006912615150213242, ..."
1,how to automatically append text to text copie...,"in javascript , how can you select text on a w...","[-0.007474603597074747, 0.010974351316690445, ..."
2,get list of apps of all users if i want to ret...,if i want to retrieve an applicationinfo list ...,"[-0.01088230311870575, 0.016354195773601532, -..."
3,"if two languages follow ieee 754 , will calcul...",i 'm in the process of converting a program fr...,"[-0.008218037895858288, -0.0018978244625031948..."
4,sqlalchemy : how to make an integer column aut...,i am using flask extension for sqlalchemy to d...,"[-0.005141154397279024, 0.024073658511042595, ..."
...,...,...,...
3995,"wpf : how to detect key repetition , in key * ...",note : e.isrepeat is confirmed to work . the p...,"[-0.026899250224232674, -0.009185451082885265,..."
3996,what is the most compatible way to install pyt...,i 'm starting to learn python and loving it . ...,"[0.006229415535926819, -0.017867524176836014, ..."
3997,junit5 : how to assert several properties of a...,i want to assert several properties of an obje...,"[-0.015963532030582428, 0.026817653328180313, ..."
3998,browser caching in asp.net application any sug...,any suggestions on how to do browser caching w...,"[-0.01717955991625786, 0.03082687221467495, 0...."


In [160]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y_train_se = mlb.fit_transform(y_train_se)
y_test_se = mlb.transform(y_test_se)

In [161]:
import numpy as np
import ast

def convert_embedding_string_to_array(embedding_string):
    # Convertir la chaîne en liste
    embedding_list = ast.literal_eval(embedding_string)
    # Convertir la liste en tableau numpy
    return np.array(embedding_list)

In [162]:
# Convertir les embeddings de chaînes de caractères en tableaux numpy
X_train_se['Embedding_body'] = X_train_se['Embedding_body'].apply(convert_embedding_string_to_array)
X_test_se['Embedding_body'] = X_test_se['Embedding_body'].apply(convert_embedding_string_to_array)

# Empilez les embeddings dans des tableaux numpy pour l'entraînement
X_train_embeddings = np.stack(X_train_se['Embedding_body'].values)
X_test_embeddings = np.stack(X_test_se['Embedding_body'].values)

In [163]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np


# Sélectionner un modèle
model = OneVsRestClassifier(LogisticRegression(C=10, penalty='l1', solver='liblinear'))

# Entraîner le modèle
model.fit(X_train_embeddings, y_train_se)

# Faire des prédictions sur l'ensemble de test
y_pred = model.predict(X_test_embeddings)

0.4660558971908695

In [165]:
# Évaluer le modèle
jaccard_score(y_test_se, y_pred, average='weighted')

0.4660558971908695

In [166]:
import random
from termcolor import colored


# Utiliser predict_proba pour obtenir les probabilités des tags
y_pred_proba = model.predict_proba(X_test_embeddings)

# Obtenir les vrais tags pour les données de test
y_test_tags = mlb.inverse_transform(y_test_se)

# Appliquer un seuil pour les probabilités
threshold = 0.1

for i in random_examples:
    print(f"Exemple {i + 1}:")
    print(f"Texte  : {X_test_se.iloc[i]['Text']}")
    print(f"Vrais tags  : {' | '.join(y_test_tags[i])}")

    # Extraire les tags prédits au-dessus du seuil avec leurs probabilités
    predicted_tags_with_probs = [(mlb.classes_[j], prob) for j, prob in enumerate(y_pred_proba[i]) if prob > threshold]
    
    # Ajouter les tags réels qui ne sont pas déjà dans les tags prédits
    for tag in y_test_tags[i]:
        if tag not in [t[0] for t in predicted_tags_with_probs]:
            prob = y_pred_proba[i][mlb.classes_.tolist().index(tag)]
            predicted_tags_with_probs.append((tag, prob))

    # Trier les tags prédits et réels par popularité et les afficher avec leurs probabilités
    sorted_tags_with_probs = sorted(predicted_tags_with_probs, key=lambda x: x[1], reverse=True)
    
    colored_tags = []
    for tag, prob in sorted_tags_with_probs:
        if tag in y_test_tags[i] and prob > threshold:
            colored_tags.append(colored(f"{tag} ({prob:.3f})", 'green'))
        elif tag in y_test_tags[i]:
            colored_tags.append(colored(f"{tag} ({prob:.3f})", 'red'))
        else:
            colored_tags.append(colored(f"{tag} ({prob:.3f})", 'yellow'))
    
    sorted_tags_with_probs_str = ' | '.join(colored_tags)
    
    print(f"Tags prédits et réels  : {sorted_tags_with_probs_str}")
    print("-" * 50)

Exemple 469:
Texte  : button inside of anchor link works in firefox but not in internet explorer ? everything else in my site seems to be compatible with all browsers except for my links . they appear on the page , but they do not work . my code for the links are as follows < td bgcolor= '' ffffff '' height= '' 370 '' valign= '' top '' width= '' 165 '' > < p > < a href= '' sc3.html '' > < button style= '' width:120 ; height:25 '' > super chem 3 < button > < a > < a href= '' 91hollywood.html '' > < button style= '' width:120 ; height:25 '' > 91 hollywood < button > < a > < a href= '' sbubba.html '' > < button style= '' width:120 ; height:25 '' > super bubba < button > < a > < a href= '' afgoohash.html '' > < button style= '' width:120 ; height:25 '' > afgoo hash < button > < a > < a href= '' superjack.html '' > < button style= '' width:120 ; height:25 '' > super jack < button > < a > < a href= '' sog.html '' > < button style= '' width:120 ; height:25 '' > sugar og < button > < a > < a h