In [None]:
'''
What is Text Classification ?
A fairly popular text classification task is to identify a body of text as either spam or not spam,
for things like email filters. In our case, we're going to try to create a sentiment analysis algorithm.
'''

In [1]:
import nltk
import random
from nltk.corpus import movie_reviews

In [2]:
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

In [3]:
random.shuffle(documents)

In [4]:
all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())
    
all_words = nltk.FreqDist(all_words)

In [5]:
print(all_words.most_common(15))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [6]:
print(all_words["stupid"])

253


In [None]:
#Additional Example SMS Classification

In [7]:
import pandas as pd
dataset = pd.read_csv('data.csv', encoding='ISO-8859-1');

In [8]:
import re
import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize as wt 

[nltk_data] Downloading package punkt to C:\Users\susan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
!pip install autocorrect

In [11]:
nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

#spell correction
from autocorrect import Speller
eng_speller = Speller(lang='en')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\susan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
data = []

for i in range(dataset.shape[0]):
    sms = dataset.iloc[i, 1]

    # remove non alphabatic characters
    sms = re.sub('[^A-Za-z]', ' ', sms)

    # make words lowercase, because Go and go will be considered as two words
    sms = sms.lower()

    # tokenising
    tokenized_sms = wt(sms)

    # remove stop words and stemming
 
    sms_processed = []
    for word in tokenized_sms:
        if word not in set(stopwords.words('english')):
            sms_processed.append(eng_speller(stemmer.stem(word)))

    sms_text = " ".join(sms_processed)
    data.append(sms_text)

In [13]:
# creating the feature matrix 
from sklearn.feature_extraction.text import CountVectorizer
matrix = CountVectorizer(max_features=1000)
X = matrix.fit_transform(data).toarray()
y = dataset.iloc[:, 0]

In [14]:
# split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [15]:
# Naive Bayes 
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [16]:
# predict class
y_pred = classifier.predict(X_test)

# Confusion matrix
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)

In [17]:
print("Accuracy --->",accuracy)

Accuracy ---> 0.7753050969131371
