In [34]:
from sklearn.datasets import fetch_20newsgroups

# Newsgroup identifier -> short, human-readable label used when reporting
# predictions.
category_map = {
    'misc.forsale': 'Sales',
    'rec.motorcycles': 'Motorcycles',
    'rec.sport.baseball': 'Baseball',
    'sci.crypt': 'Cryptography',
    'sci.space': 'Space',
}

# Load the training split restricted to the newsgroups listed above.
# The fixed random_state makes the shuffled order repeatable between runs.
training_data = fetch_20newsgroups(
    subset='train',
    categories=category_map.keys(),
    shuffle=True,
    random_state=7,
)

In [35]:
# Distinct integer class labels present in the loaded data. The labels are
# converted to plain Python ints first so the displayed set reads the same
# no matter how the installed NumPy version prints its scalar types.
set(training_data.target.tolist())

{0, 1, 2, 3, 4}

In [36]:
# Target categories to predict. The list is indexed by integer label, i.e.
# entry i is the newsgroup name for label i. Every entry is shown instead of
# a hard-coded count, so the cell stays correct if the selected categories
# change.

list(training_data.target_names)

['misc.forsale',
 'rec.motorcycles',
 'rec.sport.baseball',
 'sci.crypt',
 'sci.space']

In [37]:
# Feature extraction using 'Count vectorizer' and 'TfIdf'
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Both feature-extraction stages are created up front; each one is fitted on
# the training data below and kept so later cells can reuse them on new text.
vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()

# Stage 1: raw term counts, one row per training document.
X_train_termcounts = vectorizer.fit_transform(training_data.data)
print("\nDimensions of training data:", X_train_termcounts.shape)

# Stage 2: re-weight the term counts with tf-idf.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_termcounts)


Dimensions of training data: (2968, 40605)


In [38]:
from sklearn.naive_bayes import MultinomialNB

# Train a Multinomial Naive Bayes classifier on the tf-idf features.
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, training_data.target)

# Sentences to classify
input_data = [
    "The curveballs of right handed pitchers tend to curve to the left", 
    "Caesar cipher is an ancient form of encryption",
    "This two-wheeler is really good on slippery roads"
]

# Reuse the already-fitted vectorizer and tf-idf transformer (transform only,
# no re-fitting) so the new sentences are encoded like the training data.
X_input_termcounts = vectorizer.transform(input_data)
X_input_tfidf = tfidf_transformer.transform(X_input_termcounts)

# Predicted integer label for each sentence
predicted_categories = classifier.predict(X_input_tfidf)

# Report each sentence with its label: integer label -> newsgroup name ->
# readable label from category_map.
for sentence, category in zip(input_data, predicted_categories):
    newsgroup = training_data.target_names[category]
    print('Input:', sentence, '\nPredicted category:', category_map[newsgroup])

Input: The curveballs of right handed pitchers tend to curve to the left 
Predicted category: Baseball
Input: Caesar cipher is an ancient form of encryption 
Predicted category: Cryptography
Input: This two-wheeler is really good on slippery roads 
Predicted category: Motorcycles
