In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import train_test_split
import sklearn.metrics

In [2]:
# Used dataset is proposed by Kaggle it can be found at https://www.kaggle.com/danofer/sample-blog-authors/data 
# Loading dataset

dataset_file_name = 'blogtext.csv'

raw_df = pd.read_csv(dataset_file_name)

In [3]:
raw_df.topic.unique()

array(['Student', 'InvestmentBanking', 'indUnk', 'Non-Profit', 'Banking',
       'Education', 'Engineering', 'Science', 'Communications-Media',
       'BusinessServices', 'Sports-Recreation', 'Arts', 'Internet',
       'Museums-Libraries', 'Accounting', 'Technology', 'Law',
       'Consulting', 'Automotive', 'Religion', 'Fashion', 'Publishing',
       'Marketing', 'LawEnforcement-Security', 'HumanResources',
       'Telecommunications', 'Military', 'Government', 'Transportation',
       'Architecture', 'Advertising', 'Agriculture', 'Biotech',
       'RealEstate', 'Manufacturing', 'Construction', 'Chemicals',
       'Maritime', 'Tourism', 'Environment'], dtype=object)

In [4]:
topics = ['Engineering', 'Arts', 'Fashion', 'Tourism']


cleaned_df = raw_df[raw_df['topic'].isin(topics)]
cleaned_df = cleaned_df[['topic','text']]


cleaned_df.head(10)

Unnamed: 0,topic,text
573,Engineering,If I had a band I'd call it '...
574,Engineering,"Well... I hate to say it, bu..."
575,Engineering,How do we move on now? How d...
576,Engineering,Humor is my primary tool for ...
577,Engineering,Warning: If you know me and a...
578,Engineering,...Our Program already in Pro...
579,Engineering,So here's two for ya... and t...
580,Engineering,If you could live anywhere in...
581,Engineering,I had a lot of fun this weeke...
582,Engineering,Well I eat too much Chinese f...


In [5]:
train, test = train_test_split(cleaned_df, test_size=0.2)

In [28]:
# Logistic regression approach

reg_text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english', lowercase=False)),
    ('tfidf', TfidfTransformer()),
    ('reg', SGDClassifier(learning_rate='optimal', loss='hinge', max_iter=10000, penalty='l2'))
])

reg_text_clf.fit(train.text, train.topic)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))])

In [29]:
# Prediction on the whole test dataset

reg_predicted = reg_text_clf.predict(test.text)
sklearn.metrics.f1_score(test.topic, reg_predicted, average='micro')

0.719029374201788

In [30]:
text_to_predict = reg_text_clf.predict([
    'Engineering is the creative application of science, mathematical methods, and empirical evidence to the innovation, design, construction, operation and maintenance of structures, machines, materials, devices, systems, processes, and organizations. The discipline of engineering encompasses a broad range of more specialized fields of engineering, each with a more specific emphasis on particular areas of applied mathematics, applied science, and types of application. See glossary of engineering',
    'Art is a creative activity that expresses imaginative or technical skill. It produces an artifact, also called a work of art, for others to experience. Those who do this are called artists. They hope to affect the emotions of people who experience it. Some people find art relaxing, or exciting, or informative. Many people disagree on how to define art. Some say people are driven to make art due to their inner creativity.'
])

for i in range(0, len(text_to_predict)):
    print(text_to_predict[i])

Engineering
Arts
