# Text Classification
## This notebook outlines the usage of NLP Feature extraction (CountVectorizer, TfidfVectorizer) in classification of text documents

### Import all the necessary libraries

In [1]:
from pprint import pprint
from time import time
import logging
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

### Choose a few categories fro the entire 20 categories

In [2]:
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]

In [3]:
print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']


### Fetch documents for these 2 categories

In [4]:
data = fetch_20newsgroups(subset='train', categories=categories)
print(f"{len(data.filenames)} documents")
print(f"{len(data.target_names)} categories")
print()

857 documents
2 categories



### Define a pipeline combining a text feature extractor with a simple classifier

In [5]:
algorithms = {
    'SGDClassifier': SGDClassifier(tol=1e-3),
    'Multinomial Naïve Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machines': SVC(),
    'Decision Trees': DecisionTreeClassifier()
}

In [6]:
for algo_name, algorithm in algorithms.items():
    print(f"Using algorithm: {algo_name}")

Using algorithm: SGDClassifier
Using algorithm: Multinomial Naïve Bayes
Using algorithm: Logistic Regression
Using algorithm: Support Vector Machines
Using algorithm: Decision Trees


In [7]:
# # Voting Classifier
# ensemble_clf = VotingClassifier(
#     estimators=[
#         ('sgd', SGDClassifier(tol=1e-3)),
#         ('nb', MultinomialNB()),
#         ('logreg', LogisticRegression()),  # Add Logistic Regression
#         ('svm', SVC(probability=True)),
#         ('tree', DecisionTreeClassifier())
#     ],
#     voting='hard'
# )

In [8]:
pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', algorithm),
    ])

### Specify parameter grid
- 'vect__max_df': (0.5, 0.75, 1.0)
- 'vect__max_features': (None, 5000, 10000, 50000)
- 'vect__ngram_range': ((1, 1), (1, 2))
- 'tfidf__use_idf': (True, False)
- 'tfidf__norm': ('l1', 'l2')
- 'clf__max_iter': (20,)
- 'clf__alpha': (0.00001, 0.000001)
- 'clf__penalty': ('l2', 'elasticnet')
- 'clf__max_iter': (10, 50, 80)

In [9]:
parameters = {}

if algo_name == 'SGDClassifier':
    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'tfidf__use_idf': (True, False),
        'tfidf__norm': ('l1', 'l2'),
        'clf__max_iter': (20,),
        'clf__alpha': (0.00001, 0.000001),
        'clf__penalty': ('l2', 'elasticnet'),
    }
elif algo_name == 'Multinomial Naïve Bayes':
    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'tfidf__use_idf': (True, False),
        'tfidf__norm': ('l1', 'l2'),
    }
elif algo_name == 'Logistic Regression':
    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'tfidf__use_idf': (True, False),
        'tfidf__norm': ('l1', 'l2'),
        'clf__max_iter': (20,),
        'clf__C': (1, 0.1, 0.01),
    }
elif algo_name == 'Decision Trees':
    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'tfidf__use_idf': (True, False),
        'tfidf__norm': ('l1', 'l2'),
        'clf__criterion': ('gini', 'entropy'),
        'clf__splitter': ('best', 'random'),
        'clf__max_depth': (None, 10, 20, 30),
        'clf__min_samples_split': (2, 5, 10),
        'clf__min_samples_leaf': (1, 2, 4),
        'clf__max_features': (None, 'sqrt', 'log2'),
    }
elif algo_name == 'Support Vector Machines':
    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'tfidf__use_idf': (True, False),
        'tfidf__norm': ('l1', 'l2'),
        'clf__C': (1, 0.1, 0.01),
    }

### Find the best parameters for both the feature extraction and the classifier

### Build a GridSearch with the pipeline and parameter grid

In [10]:
grid_search = GridSearchCV(pipeline, parameters, cv=5,
                           n_jobs=-1, verbose=1)

### Start the grid search

In [11]:
grid_search.fit(data.data, data.target)

Fitting 5 folds for each of 41472 candidates, totalling 207360 fits


### Best Score

In [12]:
print("Best score: %0.3f" % grid_search.best_score_)

Best score: 0.907


In [19]:
all_scores = {}
all_scores[algo_name] = grid_search.cv_results_['mean_test_score']
print(f"Scores for {algo_name}: {all_scores[algo_name]}")
print("All Scores:")
for algo_name, scores in all_scores.items():
    print(f"{algo_name}: {scores}")

Scores for Decision Trees: [0.84598123 0.84133007 0.850612   ... 0.63246294 0.57180063 0.56825785]
All Scores:
Decision Trees: [0.84598123 0.84133007 0.850612   ... 0.63246294 0.57180063 0.56825785]


### Best Parameter

In [13]:
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set:
	clf__criterion: 'entropy'
	clf__max_depth: 20
	clf__max_features: None
	clf__min_samples_leaf: 1
	clf__min_samples_split: 2
	clf__splitter: 'random'
	tfidf__norm: 'l1'
	tfidf__use_idf: False
	vect__max_df: 0.75
	vect__max_features: 10000
	vect__ngram_range: (1, 2)


In [14]:
print("Best algorithm: ", grid_search.best_estimator_.named_steps['clf'])
print("Best parameters: ", grid_search.best_params_)

Best algorithm:  DecisionTreeClassifier(criterion='entropy', max_depth=20, splitter='random')
Best parameters:  {'clf__criterion': 'entropy', 'clf__max_depth': 20, 'clf__max_features': None, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__splitter': 'random', 'tfidf__norm': 'l1', 'tfidf__use_idf': False, 'vect__max_df': 0.75, 'vect__max_features': 10000, 'vect__ngram_range': (1, 2)}


### Choose the best model

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=42)

In [23]:
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

In [24]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.3f}")
print("Classification Report:\n", report)

Accuracy: 0.919
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.90      0.92        93
           1       0.89      0.94      0.91        79

    accuracy                           0.92       172
   macro avg       0.92      0.92      0.92       172
weighted avg       0.92      0.92      0.92       172



In [20]:
# # Train the ensemble model
# ensemble_clf.fit(X_train, y_train)

# # Evaluate the ensemble model
# y_pred_ensemble = ensemble_clf.predict(X_test)
# accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
# report_ensemble = classification_report(y_test, y_pred_ensemble)

# print(f"Accuracy of the Ensemble Model: {accuracy_ensemble:.3f}")
# print("Classification Report for the Ensemble Model:\n", report_ensemble)

### Use the model to classify a piece of text

In [25]:
# Predict categories for all documents in the dataset
predicted_categories = grid_search.best_estimator_.predict(data.data)

# Print the predicted categories for the first few documents
for i in range(5):
    print(f"Document {i + 1} - Predicted category: {data.target_names[predicted_categories[i]]}")

Document 1 - Predicted category: alt.atheism
Document 2 - Predicted category: talk.religion.misc
Document 3 - Predicted category: alt.atheism
Document 4 - Predicted category: talk.religion.misc
Document 5 - Predicted category: talk.religion.misc


In [26]:
# Access the first document in the training set
first_document = data.data[1]

# Print the content of the first document
print("Content of the first document:")
print(first_document)

Content of the first document:
Subject: Re: There must be a creator! (Maybe)
From: halat@pooh.bears (Jim Halat)
Reply-To: halat@pooh.bears (Jim Halat)
Lines: 24

In article <16BA1E927.DRPORTER@SUVM.SYR.EDU>, DRPORTER@SUVM.SYR.EDU (Brad Porter) writes:
>
>   Science is wonderful at answering most of our questions.  I'm not the type
>to question scientific findings very often, but...  Personally, I find the
>theory of evolution to be unfathomable.  Could humans, a highly evolved,
>complex organism that thinks, learns, and develops truly be an organism
>that resulted from random genetic mutations and natural selection?

[...stuff deleted...]

Computers are an excellent example...of evolution without "a" creator.
We did not "create" computers.  We did not create the sand that goes
into the silicon that goes into the integrated circuits that go into
processor board.  We took these things and put them together in an
interesting way. Just like plants "create" oxygen using light through 
photo

In [27]:
# Set the number of documents to randomly select
num_documents_to_select = 5  # You can adjust this number

# Randomly select documents based on their file names
random_documents_indices = random.sample(range(len(data.filenames)), num_documents_to_select)

# Predict categories for the randomly selected documents
for i in random_documents_indices:
    predicted_category = grid_search.best_estimator_.predict([data.data[i]])
    print(f"Document {i + 1} - Predicted category: {data.target_names[predicted_category[0]]}")

Document 815 - Predicted category: talk.religion.misc
Document 651 - Predicted category: alt.atheism
Document 382 - Predicted category: talk.religion.misc
Document 45 - Predicted category: talk.religion.misc
Document 591 - Predicted category: talk.religion.misc


In [28]:
# Set the number of documents to randomly select
num_documents_to_select = 5  # You can adjust this number

# Initialize a list to store document numbers
DocPrint = []

# Randomly select documents
random_documents_indices = random.sample(range(len(data.data)), num_documents_to_select)

# Predict categories for the randomly selected documents
for i in random_documents_indices:
    document_text = data.data[i]
    predicted_category = grid_search.best_estimator_.predict([data.data[i]])
    print(f"Document {i + 1} - Predicted category: {data.target_names[predicted_category[0]]}")
    
    # Save document number, content, and predicted category to the list
    DocPrint.append((i + 1, document_text, data.target_names[predicted_category[0]]))    

Document 146 - Predicted category: talk.religion.misc
Document 461 - Predicted category: alt.atheism
Document 6 - Predicted category: alt.atheism
Document 42 - Predicted category: alt.atheism
Document 9 - Predicted category: alt.atheism


In [29]:
# Print saved documents on another line
print("Selected Documents:")
for doc_number, doc_content, predicted_category in DocPrint:
    print(f"Document {doc_number} - Predicted category: {predicted_category}\nContent:\n{doc_content}\n")

Selected Documents:
Document 146 - Predicted category: talk.religion.misc
Content:
Subject: Re: Feminism and Islam, again
From: kmagnacca@eagle.wesleyan.edu
Organization: Wesleyan University
Nntp-Posting-Host: wesleyan.edu
Lines: 30

In article <1993Apr14.030334.8650@ultb.isc.rit.edu>, snm6394@ultb.isc.rit.edu (S.N. Mozumder ) writes:
> In article <1993Apr11.145519.1@eagle.wesleyan.edu> kmagnacca@eagle.wesleyan.edu writes:
>>
>>There's a way around that via the hadith, which state that silence is
>>taken to mean "yes" and that women may not speak before a judge, who
>>must conduct the marriage.
> 
> Actaully, that's a false hadith, because it contradicts verses in the
> Quran, that says women may testify- speak before a judge.
> 
> Hadiths are declared false when they contradict the Quran.  Hadiths
> weren't written during the revelation or during the life of the prophet,
> and so may contain errors.

So the only way you can tell a false hadith from a true one is
if it contradicts the 

In [82]:
# Create a list to store the results
results = []

# Print the predicted categories for the first few documents
for i in range(5):
    result = {
        "Document": i + 1,
        "Predicted category": data.target_names[predicted_categories[i]],
        "Content": None  # Placeholder for content
    }
    results.append(result)

In [83]:
# Append the content of the first document to the results
results.append({"Content of the First Document": first_document})

In [84]:
# Set the number of documents to randomly select
num_documents_to_select = 5  # You can adjust this number

# Randomly select documents based on their file names
random_documents_indices = random.sample(range(len(data.filenames)), num_documents_to_select)

# Predict categories for the randomly selected documents
for i in random_documents_indices:
    predicted_category = grid_search.best_estimator_.predict([data.data[i]])
    result = {
        "Document": i + 1,
        "Predicted category": data.target_names[predicted_category[0]],
        "Content": None  # Placeholder for content
    }
    results.append(result)

In [85]:
# Set the number of documents to randomly select
num_documents_to_select = 5  # You can adjust this number

# Initialize a list to store document numbers
DocPrint = []

# Randomly select documents
random_documents_indices = random.sample(range(len(data.data)), num_documents_to_select)

# Predict categories for the randomly selected documents
for i in random_documents_indices:
    document_text = data.data[i]
    predicted_category = grid_search.best_estimator_.predict([data.data[i]])
    result = {
        "Document": i + 1,
        "Predicted category": data.target_names[predicted_category[0]],
        "Content": document_text
    }
    DocPrint.append(result)

In [86]:
import pandas as pd

# Convert the results to a DataFrame
df_results = pd.DataFrame(results + DocPrint)

# Save the DataFrame to a CSV file
df_results.to_csv('output.csv', index=False)

In [89]:
# Open a text file to save the results with explicit encoding
with open('output.txt', 'w', encoding='utf-8') as txt_file:
    # ... (your existing code)

    # Print the results to the text file
    for result in results + DocPrint:
        print(result, file=txt_file)

In [87]:
# # Predict categories for all documents in the dataset using the ensemble model
# predicted_categories_ensemble = ensemble_clf.predict(data.data)

# # Print the predicted categories for the first few documents using the ensemble model
# print("Ensemble Predictions for All Documents:")
# for i in range(5):
#     print(f"Document {i + 1} - Predicted category: {data.target_names[predicted_categories_ensemble[i]]}")

In [None]:
# # Predict categories for the randomly selected documents using the ensemble model
# print("Ensemble Predictions for Selected Documents:")
# for i in random_documents_indices:
#     predicted_category_ensemble = ensemble_clf.predict([data.data[i]])
#     print(f"Document {i + 1} - Predicted category: {data.target_names[predicted_category_ensemble[0]]}")

In [None]:
# # Set the number of documents to randomly select
# num_documents_to_select = 5  # You can adjust this number

# # Initialize a list to store document numbers
# DocPrint_ensemble = []

# # Randomly select documents
# random_documents_indices = random.sample(range(len(data.data)), num_documents_to_select)

# # Predict categories for the randomly selected documents using the ensemble model
# for i in random_documents_indices:
#     document_text = data.data[i]
#     predicted_category_ensemble = ensemble_clf.predict([data.data[i]])
#     print(f"Document {i + 1} - Predicted category (Ensemble): {data.target_names[predicted_category_ensemble[0]]}")
    
#     # Save document number, content, and predicted category to the list
#     DocPrint_ensemble.append((i + 1, document_text, data.target_names[predicted_category_ensemble[0]]))

In [None]:
# # Print saved documents on another line using the ensemble model
# print("Selected Documents (Ensemble):")
# for doc_number, doc_content, predicted_category_ensemble in DocPrint_ensemble:
#     print(f"Document {doc_number} - Predicted category (Ensemble): {predicted_category_ensemble}\nContent:\n{doc_content}\n")