# Text classification 


__Data__: Text samples belonging to various categories<br>
__Data source:__ Collected by the team by scraping text samples from various articles, blogs and websites.

With the aim of classifying future queries based on their content, we trained several machine learning algorithms and compared them to find the one that makes the most accurate predictions (i.e., classifies each Query into one of the predefined categories).

# Table of Contents

* [Goal](#obj)
* [Importing packages and loading data](#imp)
* [Exploratory Data Analysis (EDA) and Feature Engineering](#eda)
* [Text Preprocessing](#pre)
* [Multi-Classification models](#ml)
    * [Splitting the data: train and test](#sp)
    * [Models](#m)
* [Comparison of model performance](#sum)
* [Model Evaluation](#ev)
    * [Precision, Recall, F1-score](#f1)
    * [Confusion Matrix](#cm)
* [Predictions](#pred)

<a id='obj'></a>
## Goal:<br>
Classify Farmer queries into predefined categories.<br><br>
Classification algorithms: Linear Support Vector Machine (LinearSVM), Random Forest, Multinomial Naive Bayes and Logistic Regression.<br><br>

<a id='imp'></a>
## Importing packages and loading data

In [None]:
# Input data files are available in the "../input/" directory.
import os
import nltk
import pandas as pd
import numpy as np
from scipy.stats import randint
import seaborn as sns # used for plot interactive graph. 
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
#import warnings
#warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Load the raw dataset.
# 'ANSI' is only registered as a codec alias on Windows (where it maps to the
# locale-dependent 'mbcs' codec), so it raises LookupError on Linux/macOS.
# 'cp1252' is the Windows Western-European code page that "ANSI" usually
# refers to, and it is available on every platform.
# NOTE(review): assumes the file was saved under a Western-European Windows
# locale - confirm if the text contains characters from another code page.
df = pd.read_csv('bbc.csv', encoding='cp1252')
df.shape

<a id='eda'></a>
## Exploratory Data Analysis (EDA) and Feature Engineering

In [None]:
# Preview the first two records, transposed so each column reads as a row
df.head(2).transpose()

The dataset contains features that are not necessary to solve our multi-classification problem. For this text classification problem, we are going to build another dataframe that contains ‘Query’ and ‘Type’.

In [None]:
# Keep only the label ('Type') and the text ('Query'); copying leaves the raw
# frame `df` untouched by the later in-place edits.
df1 = df.loc[:, ['Type', 'Query']].copy()

# Discard rows whose query text is missing (NaN)
df1 = df1[df1['Query'].notna()]

df1.shape

In [None]:
# Share (in %) of the original rows that carry a non-missing query text
total = df1['Query'].notna().sum()
round(total / len(df) * 100, 1)

In [None]:
# Distinct categories in the raw data, as a single-column NumPy array
pd.DataFrame(df['Type'].unique()).values

In [None]:
# Distinct categories that remain after dropping rows without query text
pd.DataFrame(df1['Type'].unique())

 <br><br>Now we need to represent each class as a number, so that our predictive model can better understand the different categories.

In [None]:
# Encode every category name as an integer code (numbered in order of first
# appearance) and store it in a new 'category_id' column.
df1['category_id'] = pd.factorize(df1['Type'])[0]

# One row per (category name, code) pair
category_id_df = df1[['Type', 'category_id']].drop_duplicates()

# Lookup tables in both directions, for future use
category_to_id = {name: code for name, code in category_id_df.values}
id_to_category = {
    code: name
    for code, name in category_id_df[['category_id', 'Type']].values
}

# Show the frame with the new column
df1.head()

In [None]:
# import the necessary libraries
import nltk
import string
import re

In [None]:
# Normalise case so that e.g. "Rain" and "rain" end up as the same token
df1["Query"] = df1.Query.str.lower()
df1["Query"]

In [None]:
# Strip every run of digits from each query
query1 = [re.sub(r'\d+', '', text) for text in df1["Query"]]
df1["Query"] = query1
df1["Query"]

In [None]:
# Remove punctuation.
# str.translate returns a NEW string (strings are immutable), so its result -
# not the untouched input line - is what must be collected. The translation
# table does not depend on the line, so it is built once before the loop.
translator = str.maketrans('', '', string.punctuation)
query2 = [line.translate(translator) for line in df1["Query"]]
df1["Query"] = query2
df1["Query"]

In [None]:
# Collapse whitespace: split() with no argument cuts on any run of whitespace
# and drops leading/trailing blanks; re-joining with one space normalises the
# text. The joined string is a new object, so it (not the original line) is
# what gets stored.
query3 = [" ".join(line.split()) for line in df1["Query"]]
df1["Query"] = query3
df1["Query"]

In [None]:
# import nltk
# nltk.download('punkt')

In [None]:
# import nltk
# nltk.download('wordnet')

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Remove English stop words from every query.
# The stop-word set is the same for all rows, so it is loaded once up front
# instead of being rebuilt for each query.
stop_words = set(stopwords.words("english"))

# Each query becomes the list of its tokens that are not stop words
query_ = [
    [word for word in word_tokenize(line) if word not in stop_words]
    for line in df1["Query"]
]
df1["Query"] = query_
df1["Query"]

In [None]:
# Turn each token list back into one string; every token is followed by a
# single space (so the result keeps a trailing space).
query_ = ["".join(token + " " for token in tokens) for tokens in df1["Query"]]
df1["Query"] = query_
df1["Query"]

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

# Lemmatize every token of every query, treating each word as a verb
# (pos='v'); each query becomes a list of lemmas.
query5 = [
    [lemmatizer.lemmatize(token, pos='v') for token in word_tokenize(text)]
    for text in df1["Query"]
]
df1["Query"] = query5
df1["Query"]

In [None]:
# Turn each lemma list back into one string; every token is followed by a
# single space (so the result keeps a trailing space).
query_ = ["".join(token + " " for token in tokens) for tokens in df1["Query"]]
df1["Query"] = query_
df1["Query"]

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()

# Reduce every token of every query to its Porter stem; each query becomes a
# list of stems.
query4 = [
    [stemmer.stem(token) for token in word_tokenize(text)]
    for text in df1["Query"]
]
df1["Query"] = query4
df1["Query"]

In [None]:
# Turn each stem list back into one string; every token is followed by a
# single space (so the result keeps a trailing space).
query_ = ["".join(token + " " for token in tokens) for tokens in df1["Query"]]
df1["Query"] = query_
df1["Query"]

The bar chart below shows the number of text samples per category, which makes it easy to see whether the categories are balanced.

In [None]:
# Horizontal bar chart of how many queries each category holds, smallest first
fig = plt.figure(figsize=(8, 6))
colors = ['grey', 'green', 'red', 'orange', 'darkblue', 'blue']

query_counts = df1.groupby('Type')['Query'].count().sort_values()
query_counts.plot.barh(
    ylim=0,
    color=colors,
    title='NUMBER OF QUERY IN EACH CATEGORY\n',
    fontsize=12,
)
plt.xlabel('Number of ocurrences', fontsize=20);

<a id='pre'></a>
## Text Preprocessing

The text needs to be transformed into vectors so that the algorithms are able to make predictions. In this case, the Term Frequency – Inverse Document Frequency (TF-IDF) weight is used to evaluate __how important a word is to a document in a collection of documents__.

After removing __punctuation__ and __lower casing__ the words, importance of a word is determined in terms of its frequency.

### Term Frequency – Inverse Document Frequency (TF-IDF)

__TF-IDF__ is the product of the __TF__ and __IDF__ scores of the term.<br><br> $$\text{TF-IDF}=\text{TF} \times \text{IDF}$$<br>

__Term Frequency :__ This summarizes how often a given word appears within a document.

$$\text{TF} = \frac{\text{Number of times the term appears in the doc}}{\text{Total number of words in the doc}}$$<br><br>
__Inverse Document Frequency:__ This downscales words that appear a lot across documents. A term has a high IDF score if it appears in a few documents. Conversely, if the term is very common among documents (i.e., “the”, “a”, “is”), the term would have a low IDF score.<br>

$$\text{IDF} = \ln\left(\frac{\text{Number of docs}}{\text{Number docs the term appears in}} \right)$$<br>

TF-IDF scores are word frequency scores that try to highlight words that are more interesting, e.g. frequent in a document but not across documents. The higher the TF-IDF score, the more characteristic the term is of that document: it occurs often there while being rare in the rest of the collection. For instance, in a Weather query the word _rainfall_ would be mentioned fairly often. However, if we look at other queries, _rainfall_ probably would not show up in many of them. We can infer that _rainfall_ is most probably an important word in Weather queries as compared to the other categories. Therefore, _rainfall_ would have a high TF-IDF score for Weather queries.

TfidfVectorizer class can be initialized with the following parameters:
* __min_df__: remove the words from the vocabulary which have occurred in less than ‘min_df’ number of files.
* __max_df__: remove the words from the vocabulary which have occurred in more than _‘max_df’ * total number of files in corpus_.
* __sublinear_tf__: set to True to scale the term frequency in logarithmic scale.
* __stop_words__: remove the predefined stop words in 'english'.
* __use_idf__: weight factor must use inverse document frequency.
* __ngram_range__: (1, 2) to indicate that unigrams and bigrams will be considered.

In [None]:
# TF-IDF vectoriser over unigrams and bigrams.
# ngram_range must start at 1: a lower bound of 0 also generates 0-grams,
# i.e. an empty-string "term" added for every document, which then shows up
# as a meaningless feature.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2))

# We transform each query into a dense TF-IDF vector
features = tfidf.fit_transform(df1.Query).toarray()

# Integer-encoded category of each query
labels = df1.category_id

print("Each of the %d Text samples is represented by %d features (TF-IDF score of unigrams and bigrams)" %(features.shape))

In [None]:
# Finding the three terms most correlated with each category (chi-squared
# test of every feature against a one-vs-rest indicator of the category).
N = 3

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when present and fall back to
# the old one on older releases. The vocabulary is the same for every
# category, so it is fetched once, outside the loop.
get_names = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
all_feature_names = np.array(get_names())

for type_name, category_id in sorted(category_to_id.items()):
  features_chi2 = chi2(features, labels == category_id)
  # argsort is ascending, so the strongest terms are at the end
  indices = np.argsort(features_chi2[0])
  feature_names = all_feature_names[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("\n==> %s:" %(type_name))
  print("  * Most Correlated Unigrams are: %s" %(', '.join(unigrams[-N:])))
  print("  * Most Correlated Bigrams are: %s" %(', '.join(bigrams[-N:])))

<a id='ml'></a>
## Multi-Classification models

The classification models evaluated are: 
* Random Forest
* Linear Support Vector Machine
* Multinomial Naive Bayes 
* Logistic Regression.

<a id='sp'></a>
### Splitting the data into train and test sets

The original data was divided into features (X) and target (y), which were then split into train (80%) and test (20%) sets. Thus, the algorithms are trained on one set of data and tested on a completely different set of data (not seen before by the algorithm).

In [None]:
# Documents (preprocessed query text) and the category name to predict
X = df1['Query']
y = df1['Type']

# Reproducible split that holds out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

<a id='m'></a>
### Models

In [None]:
# Candidate classifiers to compare
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# 5-fold cross-validation
CV = 5

# One (model name, fold index, accuracy) record per model and fold.
# The frame is built once from these records; the earlier empty
# pre-allocation of cv_df was dead code because it was overwritten here.
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
  entries.extend(
      (model_name, fold_idx, accuracy)
      for fold_idx, accuracy in enumerate(accuracies)
  )

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

<a id='sum'></a>
## Comparison of model performance

The best mean accuracy was obtained with LinearSVC.

In [None]:
# Per-model mean and standard deviation of the fold accuracies
accuracy_by_model = cv_df.groupby('model_name')['accuracy']
mean_accuracy = accuracy_by_model.mean()
std_accuracy = accuracy_by_model.std()

# Side-by-side summary table, one row per model
acc = pd.DataFrame({
    'Mean Accuracy': mean_accuracy,
    'Standard deviation': std_accuracy,
})
acc

In [None]:
# Distribution of the fold accuracies per model; showmeans marks each mean
plt.figure(figsize=(8, 5))
sns.boxplot(
    data=cv_df,
    x='model_name',
    y='accuracy',
    color='lightblue',
    showmeans=True,
)
plt.title("MEAN ACCURACY (cv = 5)\n", size=14);

<a id='ev'></a>
## Model Evaluation

In [None]:
# Hold out 20% of the TF-IDF rows; df1.index is split alongside so each
# train/test row can be traced back to its row in df1.
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    features, labels, df1.index, test_size=0.20, random_state=1)

# Fit logistic regression on the training part and predict the held-out part
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

<a id='f1'></a>
### Precision, Recall, F1-score

In [None]:
# Per-class precision / recall / F1, with rows labelled by category name
class_names = df1['Type'].unique()
print('\t\t\t\tCLASSIFICATIION METRICS\n')
print(metrics.classification_report(y_test, y_pred, target_names=class_names))

It is possible to observe that the classes with more support (number of occurrences) tend to have a better f1-score. This is because the algorithm was trained with more data.<br><br>

<a id='cm'></a>
### Confusion Matrix

A Confusion Matrix is a table whose rows represent the actual class and whose columns represent the predicted class.<br><br>
If we had a perfect model that always classifies correctly a new Query, then the confusion matrix would have values in the diagonal only (where predicted label = actual label).

In [None]:
# Confusion matrix of the held-out predictions, drawn as an annotated heatmap
conf_mat = confusion_matrix(y_test, y_pred)
tick_labels = category_id_df['Type'].values

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title("CONFUSION MATRIX - LogisticRegression\n", size=16);

In general, the confusion matrix looks good (a clear diagonal that represents correct classifications). Nevertheless, there are cases where a query was classified into the wrong class.

#### Most correlated terms with each category

In [None]:
# Refit on the full feature matrix so the coefficients reflect all the data
model.fit(features, labels)

N = 4

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when present and fall back to
# the old one on older releases. The vocabulary does not change between
# categories, so it is fetched once, outside the loop.
get_names = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
all_feature_names = np.array(get_names())

for type_name, category_id in sorted(category_to_id.items()):
  # Ascending sort of this category's coefficients; reversed() below walks
  # them from the largest weight down.
  indices = np.argsort(model.coef_[category_id])
  feature_names = all_feature_names[indices]
  unigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:N]
  bigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 2][:N]
  print("\n==> '{}':".format(type_name))
  print("  * Top unigrams: %s" %(', '.join(unigrams)))
  print("  * Top bigrams: %s" %(', '.join(bigrams)))

<a id='pred'></a>
## Predictions

Now let's make a few predictions on unseen data.<br>

In [None]:
# Split the text (X) and category names (y) so the model predicts readable
# labels rather than integer codes.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    random_state = 0)

# ngram_range must start at 1 (unigrams + bigrams); a lower bound of 0 would
# also emit empty-string 0-gram features.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2),
                        stop_words='english')

# Learn the vocabulary and IDF weights from the training text only, then
# vectorise that same training text.
fitted_vectorizer = tfidf.fit(X_train)
tfidf_vectorizer_vectors = fitted_vectorizer.transform(X_train)

model = LogisticRegression(random_state=0).fit(tfidf_vectorizer_vectors, y_train)

In [None]:
# Text to classify (fill in). It is vectorised exactly as typed: the
# number/stop-word removal, lemmatisation and stemming applied to
# df1["Query"] in the earlier cells are not repeated here.
new_Query = ""
query_vector = fitted_vectorizer.transform([new_Query])
print(model.predict(query_vector))

## LinearSVC

In [None]:
# Hold out 20% of the TF-IDF rows; df1.index is split alongside so each
# train/test row can be traced back to its row in df1.
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    features, labels, df1.index, test_size=0.20, random_state=1)

# Fit a linear SVM on the training part and predict the held-out part
model2 = LinearSVC()
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)

In [None]:
# Per-class precision / recall / F1, with rows labelled by category name
class_names = df1['Type'].unique()
print('\t\t\t\tCLASSIFICATIION METRICS\n')
print(metrics.classification_report(y_test, y_pred, target_names=class_names))

In [None]:
# Confusion matrix of the held-out predictions, drawn as an annotated heatmap
conf_mat = confusion_matrix(y_test, y_pred)
tick_labels = category_id_df['Type'].values

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title("CONFUSION MATRIX - LinearSVC\n", size=16);

In [None]:
# Split the text (X) and category names (y) so the model predicts readable
# labels rather than integer codes.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    random_state = 0)

# ngram_range must start at 1 (unigrams + bigrams); a lower bound of 0 would
# also emit empty-string 0-gram features.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2),
                        stop_words='english')

# Learn the vocabulary and IDF weights from the training text only, then
# vectorise that same training text.
fitted_vectorizer = tfidf.fit(X_train)
tfidf_vectorizer_vectors = fitted_vectorizer.transform(X_train)

model2 = LinearSVC().fit(tfidf_vectorizer_vectors, y_train)

In [None]:
# Text to classify (fill in). It is vectorised exactly as typed: the
# number/stop-word removal, lemmatisation and stemming applied to
# df1["Query"] in the earlier cells are not repeated here.
new_Query2 = ""
query_vector = fitted_vectorizer.transform([new_Query2])
print(model2.predict(query_vector))

## MultinomialNB

In [None]:
# Hold out 20% of the TF-IDF rows; df1.index is split alongside so each
# train/test row can be traced back to its row in df1.
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    features, labels, df1.index, test_size=0.20, random_state=1)

# Fit multinomial naive Bayes on the training part and predict the held-out part
model3 = MultinomialNB()
model3.fit(X_train, y_train)
y_pred = model3.predict(X_test)

In [None]:
# Per-class precision / recall / F1, with rows labelled by category name
class_names = df1['Type'].unique()
print('\t\t\t\tCLASSIFICATIION METRICS\n')
print(metrics.classification_report(y_test, y_pred, target_names=class_names))

In [None]:
# Confusion matrix of the held-out predictions, drawn as an annotated heatmap
conf_mat = confusion_matrix(y_test, y_pred)
tick_labels = category_id_df['Type'].values

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title("CONFUSION MATRIX - MultinomialNB\n", size=16);

In [None]:
# Split the text (X) and category names (y) so the model predicts readable
# labels rather than integer codes.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    random_state = 0)

# ngram_range must start at 1 (unigrams + bigrams); a lower bound of 0 would
# also emit empty-string 0-gram features.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2),
                        stop_words='english')

# Learn the vocabulary and IDF weights from the training text only, then
# vectorise that same training text.
fitted_vectorizer = tfidf.fit(X_train)
tfidf_vectorizer_vectors = fitted_vectorizer.transform(X_train)

model3 = MultinomialNB().fit(tfidf_vectorizer_vectors, y_train)

In [None]:
# Text to classify (fill in). It is vectorised exactly as typed: the
# number/stop-word removal, lemmatisation and stemming applied to
# df1["Query"] in the earlier cells are not repeated here.
new_Query3 = ""
query_vector = fitted_vectorizer.transform([new_Query3])
print(model3.predict(query_vector))

## RandomForest

In [None]:
# Hold out 20% of the TF-IDF rows; df1.index is split alongside so each
# train/test row can be traced back to its row in df1.
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    features, labels, df1.index, test_size=0.20, random_state=1)

# Fit a seeded random forest (100 trees, depth capped at 5) on the training
# part and predict the held-out part.
model4 = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
model4.fit(X_train, y_train)
y_pred = model4.predict(X_test)

In [None]:
# Per-class precision / recall / F1, with rows labelled by category name
class_names = df1['Type'].unique()
print('\t\t\t\tCLASSIFICATIION METRICS\n')
print(metrics.classification_report(y_test, y_pred, target_names=class_names))

In [None]:
# Confusion matrix of the held-out predictions, drawn as an annotated heatmap
conf_mat = confusion_matrix(y_test, y_pred)
tick_labels = category_id_df['Type'].values

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title("CONFUSION MATRIX - RandomForest\n", size=16);

In [None]:
# Split the text (X) and category names (y) so the model predicts readable
# labels rather than integer codes.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    random_state = 0)

# ngram_range must start at 1 (unigrams + bigrams); a lower bound of 0 would
# also emit empty-string 0-gram features.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2),
                        stop_words='english')

# Learn the vocabulary and IDF weights from the training text only, then
# vectorise that same training text.
fitted_vectorizer = tfidf.fit(X_train)
tfidf_vectorizer_vectors = fitted_vectorizer.transform(X_train)

# Use the same seeded configuration as the forest evaluated above, so that
# predictions are reproducible and come from the model whose metrics were
# reported (an unseeded default forest differs from run to run).
model4 = RandomForestClassifier(
    n_estimators=100, max_depth=5, random_state=0
).fit(tfidf_vectorizer_vectors, y_train)

In [None]:
# Text to classify (fill in). It is vectorised exactly as typed: the
# number/stop-word removal, lemmatisation and stemming applied to
# df1["Query"] in the earlier cells are not repeated here.
new_Query4 = ""
query_vector = fitted_vectorizer.transform([new_Query4])
print(model4.predict(query_vector))

In [None]:
#df1[df1['Query'] == new_Query]