# Text classification with sklearn

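This notebook follows the scikit-learn tutorial [Working With Text Data](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html): it trains logistic-regression classifiers on the 20 Newsgroups dataset and compares bag-of-words (count) features with TF-IDF features.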

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix

In [2]:
# Load the training split of the 20 Newsgroups dataset. No category filter and
# no `remove` option are passed here, so the posts keep their raw text.
all_train = fetch_20newsgroups(subset='train')

In [3]:
# Display the newsgroup (class) names carried by the loaded dataset.
all_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
 # Show the first ten lines of the first raw post (header lines included).
 print(*all_train.data[0].split("\n")[:10], sep="\n")

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 


In [5]:
# Restrict the experiments to three newsgroups.
categories = [
    "rec.autos",
    "rec.sport.baseball",
    "sci.space",
]

In [6]:
# Fetch the train and test splits with identical settings: only the selected
# categories, with headers, footers and quoted replies removed from each post.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(
        subset=split,
        remove=("headers", "footers", "quotes"),
        categories=categories,
    )
    for split in ("train", "test")
)

In [7]:
# Peek at the first 500 characters of the first filtered training post.
newsgroups_train.data[0][:500]

"\nThe Centaur that is being built for T4 would be a better bet to integrate \nonto the Proton as the T4/Centaur is designed for the Extremely Harsh \nenvorinment of the T4 launch. It is also closer to 4 m in diameter. \n\nYou've hit on the real kicker, however. The Centaur is pressure stabilized. \nIt cannot hold up its own weight without pressure in the tanks. Additionally, \nthe pressure difference between the two tanks must be maintained to ~+/- 5 psi. \nThat is rather tight to be rocking and rolling"

## Vectorization with CountVectorizer

In [8]:
# example 1

# Bag-of-words counts over a deliberately small vocabulary: English stop words
# are dropped, a term must occur in at least 5% and at most 95% of the training
# documents, and at most `n_features` terms are kept.
n_features = 1000
count_vectorizer = CountVectorizer(
    stop_words="english",
    max_features=n_features,
    min_df=0.05,
    max_df=0.95,
)

# Learn the vocabulary from the training posts only, then reuse it for the test posts.
train_count_vectorizer = count_vectorizer.fit_transform(newsgroups_train.data)
test_count_vectorizer = count_vectorizer.transform(newsgroups_test.data)

# Fit a seeded logistic-regression classifier on the count features and
# evaluate it against the test labels.
clf = LogisticRegression(random_state=0)
clf.fit(train_count_vectorizer, newsgroups_train.target)
predicted = clf.predict(test_count_vectorizer)

print(classification_report(newsgroups_test.target, predicted))

              precision    recall  f1-score   support

           0       0.69      0.60      0.64       396
           1       0.60      0.79      0.68       397
           2       0.70      0.57      0.63       394

    accuracy                           0.65      1187
   macro avg       0.66      0.65      0.65      1187
weighted avg       0.66      0.65      0.65      1187



In [9]:
# example 2

# Same pipeline as example 1 but with no vocabulary limits: every unigram and
# bigram left after English stop-word removal becomes a feature.
count_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
)

# Learn the vocabulary from the training posts only, then reuse it for the test posts.
train_count_vectorizer = count_vectorizer.fit_transform(newsgroups_train.data)
test_count_vectorizer = count_vectorizer.transform(newsgroups_test.data)

# Fit a seeded logistic-regression classifier and evaluate it against the test labels.
clf = LogisticRegression(random_state=0)
clf.fit(train_count_vectorizer, newsgroups_train.target)
predicted = clf.predict(test_count_vectorizer)

print(classification_report(newsgroups_test.target, predicted))

              precision    recall  f1-score   support

           0       0.79      0.93      0.85       396
           1       0.88      0.87      0.87       397
           2       0.93      0.77      0.84       394

    accuracy                           0.86      1187
   macro avg       0.87      0.86      0.86      1187
weighted avg       0.87      0.86      0.86      1187



## Vectorization with TFIDF

In [10]:
# TF-IDF features. The integer thresholds are absolute document counts, so only
# terms found in at least 10 and at most 500 training documents are kept.
tfidf_vectorizer = TfidfVectorizer(max_df=500, min_df=10)

# Learn the vocabulary and IDF weights from the training posts only, then
# apply them unchanged to the test posts.
tfidf_train = tfidf_vectorizer.fit_transform(newsgroups_train.data)
tfidf_test = tfidf_vectorizer.transform(newsgroups_test.data)

# Seed the classifier exactly like the CountVectorizer experiments so all three
# runs are configured the same way and stay reproducible whichever solver is used.
clf = LogisticRegression(random_state=0).fit(tfidf_train, newsgroups_train.target)

predicted = clf.predict(tfidf_test)
print(classification_report(newsgroups_test.target, predicted))

              precision    recall  f1-score   support

           0       0.87      0.84      0.85       396
           1       0.84      0.90      0.87       397
           2       0.88      0.85      0.86       394

    accuracy                           0.86      1187
   macro avg       0.86      0.86      0.86      1187
weighted avg       0.86      0.86      0.86      1187

