In [1]:
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
import pickle
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Read the corpus from disk with scikit-learn's directory loader.
# NOTE(review): this is an absolute, machine-specific path -- point it at the
# local copy of the dataset before running.
bbc = load_files(r"D:\BigData\notes\13_Apr_21\bbc")

# Raw documents and their integer class labels.
X = bbc.data
y = bbc.target

In [3]:
# Show the category names of the loaded corpus.
print(bbc.target_names)

['business', 'entertainment', 'politics', 'sport', 'tech']


In [5]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def _to_text(raw):
    """Return `raw` as a str, decoding bytes instead of taking their repr.

    Calling str() on a bytes object yields the literal "b'...'" representation,
    in which every newline is spelled as the two characters backslash + "n".
    Once punctuation is stripped, that leaves a stray "n" glued onto the first
    word of each paragraph and turns non-ASCII bytes into "xNN" junk tokens.
    """
    if isinstance(raw, bytes):
        # errors="replace" tolerates stray non-UTF-8 bytes; the replacement
        # character is a non-word character and is removed by the \W pass.
        return raw.decode("utf-8", errors="replace")
    return str(raw)


def _preprocess(raw):
    """Normalise one raw document into a single string of space-joined stems.

    Steps: strip non-word characters, drop isolated single letters, collapse
    whitespace, lowercase, then Porter-stem every remaining token.
    """
    # Remove all the special characters
    text = re.sub(r'\W', ' ', _to_text(raw))

    # Remove single letters that stand alone between whitespace
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Remove a single letter at the very start of the document.
    # '^' must be an unescaped anchor here; an escaped caret would look for a
    # literal '^', which can no longer occur after the \W substitution.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)

    # Substitute runs of whitespace with a single space
    text = re.sub(r'\s+', ' ', text)

    # Lowercase, tokenise on whitespace, stem, and re-join
    return ' '.join(stemmer.stem(word) for word in text.lower().split())


# One cleaned, stemmed string per raw document, in the same order as X.
documents = [_preprocess(raw) for raw in X]

In [6]:
# Turn the cleaned documents into a TF-IDF feature matrix (one-hot encoding or
# another vectorisation scheme could be used instead).
#   max_features=1500 : keep only the 1500 terms with the highest term
#                       frequency across the corpus, bounding the matrix width
#   min_df=5          : a term must appear in at least 5 documents
#   max_df=0.7        : a term appearing in more than 70% of the documents is
#                       treated as noise and dropped
from sklearn.feature_extraction.text import TfidfVectorizer

# The documents were Porter-stemmed before reaching this cell, so the stop-word
# list has to be stemmed the same way; otherwise stop words whose stem differs
# from the word itself (e.g. "this" -> "thi") are never filtered out.
# `stemmer` is the PorterStemmer instance created in the preprocessing cell.
english_stop_words = stopwords.words('english')
stemmed_stop_words = sorted(
    set(english_stop_words) | {stemmer.stem(word) for word in english_stop_words}
)

# NOTE(review): the vectorizer is fitted on the whole corpus, i.e. before the
# train/test split further down, so document frequencies include test data.
vectorizer = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stemmed_stop_words)

# Dense array: X is rebound from the raw documents to the feature matrix.
X = vectorizer.fit_transform(documents).toarray()

In [None]:
# Hold out 40% of the samples for evaluation; the fixed seed keeps the
# partition identical from run to run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.40,
    random_state=42,
)

In [None]:
# Gaussian Naive Bayes: fit on the training partition, then predict labels
# for the held-out partition.
from sklearn.naive_bayes import GaussianNB
NBclassifier = GaussianNB().fit(X_train, y_train)
y_pred = NBclassifier.predict(X_test)

In [None]:
# Score the current `y_pred` (the Naive Bayes predictions when the cells are
# run in order) against the held-out labels: class names, a blank line, then
# confusion matrix, per-class report and overall accuracy.
print(bbc.target_names, end="\n\n")
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))


In [None]:
# Decision tree: fit on the training partition, then predict labels for the
# held-out partition.
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fitted tree (and therefore the reported
# scores) reproducible across runs, matching the seeded split and the seeded
# random forest elsewhere in this notebook.
DTclassifier = DecisionTreeClassifier(random_state=0)
DTclassifier.fit(X_train, y_train)
y_pred = DTclassifier.predict(X_test)

In [None]:
# Score the current `y_pred` (the decision-tree predictions when the cells are
# run in order) against the held-out labels: class names, a blank line, then
# confusion matrix, per-class report and overall accuracy.
print(bbc.target_names, end="\n\n")
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

In [None]:
# Random forest of 1000 trees with a fixed seed: fit on the training
# partition, then predict labels for the held-out partition.
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(
    n_estimators=1000,
    random_state=0,
).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [None]:
# Score the current `y_pred` (the random-forest predictions when the cells are
# run in order) against the held-out labels: class names, a blank line, then
# confusion matrix, per-class report and overall accuracy.
print(bbc.target_names, end="\n\n")
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))