In [None]:
import pandas as pd
import numpy as np
import requests
from io import BytesIO
import re
from bs4 import BeautifulSoup
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

nltk.download('stopwords')
from nltk.corpus import stopwords

# Download the Stack Overflow posts dataset and load it into a DataFrame.
url = 'https://gitlab.com/rajacsp/datasets/raw/master/stack-overflow-data.csv'
# A timeout keeps the script from hanging indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as CSV.
response.raise_for_status()
df = pd.read_csv(BytesIO(response.content))

# Keep only rows that have both a tag and a post body: the tag is the training
# target, and clean_text() hands each post to BeautifulSoup, which cannot
# process a missing (NaN) value.
df = df.dropna(subset=['post', 'tags'])

# Separator punctuation: replaced by a space so neighbouring words stay apart.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase ASCII letters, the space character,
# '#', '+' and '_' is deleted outright (so tokens such as "c#" and "c++"
# survive cleaning).
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# NLTK's English stopword list, held in a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))


def clean_text(text):
    """Return *text* as a lowercase, space-separated string of cleaned tokens.

    Steps, in order: strip HTML markup, lowercase, turn separator punctuation
    into spaces, collapse every whitespace run (newlines, tabs, ...) into a
    single space, delete the remaining disallowed characters, and drop
    English stopwords.

    Args:
        text: Raw post body, possibly containing HTML.

    Returns:
        The cleaned text with the surviving tokens joined by single spaces.
    """
    text = BeautifulSoup(text, "html.parser").text
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    # BAD_SYMBOLS_RE spares only the literal space, so newlines and tabs must
    # become spaces first; otherwise words on adjacent lines get glued
    # together (e.g. "foo\nbar" -> "foobar").
    text = ' '.join(text.split())
    text = BAD_SYMBOLS_RE.sub('', text)
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

# Clean every post before it is vectorised.
df['post'] = df['post'].apply(clean_text)

# Features are the cleaned posts; targets are their tags.
X, y = df['post'], df['tags']
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Token counts -> TF-IDF weighting -> multinomial naive Bayes classifier.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# fit() returns the fitted pipeline, so training and prediction can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

# Tags to include in the report, in the order the rows should be displayed.
my_tags = [
    'java','html','asp.net','c#','ruby-on-rails','jquery','mysql','php',
    'ios','javascript','python','c','css','android','iphone','sql',
    'objective-c','c++','angularjs','.net'
]

print("\nClassification Report:\n")
# Pass the tags as `labels`, not `target_names`: target_names are applied
# positionally to the *sorted* label set, so an unsorted list such as my_tags
# would attach the wrong tag name to every row. With `labels`, each row is
# selected by its actual label value and shown in my_tags order.
# NOTE(review): any tag present in the data but missing from my_tags is left
# out of the report -- confirm the list covers every class.
print(classification_report(y_test, y_pred, labels=my_tags))
print("Accuracy:", accuracy_score(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\stefi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
