In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('stackoverflow-reduced.csv')


In [3]:
data.head()

Unnamed: 0,post,tags
0,php domnode nodevalue &gt; &lt issue i am try...,php
1,problems running a php script from command lin...,php
2,how to know or ensure that a php page is only ...,php
3,posting child objects to a database with php ...,php
4,how to fix this 2 issue by php (edit user from...,php


In [4]:
print("Input data has {} rows and {} columns".format(len(data), len(data.columns)))

Input data has 6433 rows and 2 columns


In [5]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string
from bs4 import BeautifulSoup

stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')

In [6]:
def clean_text(text):
    text = BeautifulSoup(text, "lxml").text
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    text = REPLACE_BY_SPACE_RE.sub(' ', text) 
    text = BAD_SYMBOLS_RE.sub('', text) 
    tokens = re.split('\W+', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    return text

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['post'], data['tags'], test_size=0.2)

In [8]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text, min_df=2, encoding='latin-1', ngram_range=(1, 2))
tfidf_vect_fit = tfidf_vect.fit(X_train)

tfidf_train = tfidf_vect_fit.transform(X_train)
tfidf_test = tfidf_vect_fit.transform(X_test)

X_train_vect = pd.DataFrame(tfidf_train.toarray())
X_test_vect = pd.DataFrame(tfidf_test.toarray())
X_train_vect.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10529,10530,10531,10532,10533,10534,10535,10536,10537,10538
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.055488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.048495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.072397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [10]:
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

gb_model = gb.fit(X_train_vect, y_train)
y_pred = gb_model.predict(X_test_vect)

precision, recall, fscore, train_support = score(y_test, y_pred)
print('Precision: {}'.format(precision))
print('Recall: {}'.format(recall))
accuracy = ((y_pred==y_test).sum()/len(y_pred))
print('Accuracy: {}'.format(accuracy))
    

Precision: [0.88823529 0.98832685 0.96610169 0.99324324 0.97826087]
Recall: [0.9869281  0.94074074 0.9047619  0.94533762 0.98540146]
Accuracy: 0.9588189588189588
