## TFIDF and text classification for Standford Movie data

### Outline
 - Compute TFIDF score for each reviews
 
     1. Load data
     2. Tokenize text
     3. Fit to compute TFIDF
     
 
 - calssify the review(Positive/negative) by TFIDF score
     1. logistic regression
     2. SVM (slow for large dataset)
     3. RandomForest
     4. Naive Bayes
 
 About the dataset: positive - grade >= 7.0; negative - grade <= 4.0;

In [165]:
from pathlib import Path
import re

from collections import Counter
import numpy as np
import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string
from sklearn.feature_extraction.text import TfidfVectorizer

### Compute TFIDF

#### 1. Load data

In [4]:
def  getfilelist(root):
    """Return a fully-qualified list of filenames under root directory"""
    path = Path(root)
    textfiles = path.glob('**/*.txt')
    return [str(line) for line in textfiles]

In [5]:
def gettext(filename):
    """Return a string text from given txt file"""
    with open(filename) as f:
        text = f.read().replace("<br />", " ")
    return text

In [6]:
def gettexttodf(rootpath):
    """Extract text from all the txt files under given directory
       and return a dataframe with string reviews and labels"""
    filename_list = getfilelist(rootpath)
    # identify data from positive or negative dataset
    if "neg" in rootpath:
        label = np.zeros(len(filename_list), dtype=int)
    else:
        label = np.ones(len(filename_list), dtype=int)
        
    corpus_dict = {'labels':label}
    text_list = []
    # read text from txt list
    for filename in filename_list:
        text_list.append(gettext(filename))
    corpus_dict['reviews'] = text_list
    # convert to dataframe['review', 'labels']    
    train_data = pd.DataFrame.from_dict(corpus_dict)
    
    return train_data

In [166]:
# neg_path = "aclImdb/small_data/neg/"
# pos_path = "aclImdb/small_data/pos/"
neg_path = "aclImdb/train/neg/"
pos_path = "aclImdb/train/pos/"
# create dataframe from labeled txt
neg = gettexttodf(neg_path)
pos = gettexttodf(pos_path)

In [167]:
data = pd.concat([neg, pos],ignore_index=True)

In [168]:
data.labels.value_counts()

1    12500
0    12500
Name: labels, dtype: int64

#### 2. Tokenize text

In [18]:
def tokenize(text):
    """
    Tokenize text and return a non-unique list of tokenized words
    found in the text. Normalize to lowercase, strip punctuation,
    remove stop words, drop words of length < 3, strip digits.
    """
    text = text.lower()
    text = re.sub('[' + string.punctuation + '0-9\\r\\t\\n]', ' ', text)
    tokens = nltk.word_tokenize(text)
    tokens = [w for w in tokens if (len(w) > 2) and (w not in ENGLISH_STOP_WORDS)]  # ignore a, an, to, at, be, ...
    return tokens

In [19]:
def normalizewords(words):
    """
    Given a list of tokens/words, return a new list of normalize words
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    normal = []
    for word, tag in nltk.pos_tag(words):
        wtag = tag[0].lower()
        wtag = wtag if wtag in ['a', 'r', 'n', 'v'] else None
        lemma = lemmatizer.lemmatize(word, wtag) if wtag else word
        normal.append(lemma)
    return ' '.join(normal)

In [169]:
# update df wtth normalized and tokenized words string
for i in range(len(data)):
    data.loc[i,'reviews'] = normalizewords(tokenize(data.loc[i,'reviews']))

#### 3. TFIDF-vectorize

In [170]:
tfidf = TfidfVectorizer(analyzer = "word", max_features=20000)
tfidf.fit(data["reviews"])

def transform_data(tfidf, dataset):
    features = tfidf.transform(dataset["reviews"])
    return pd.DataFrame(features.todense(), columns = tfidf.get_feature_names())

In [171]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.3)

In [172]:
X_tfidf_train = transform_data(tfidf, train)
X_tfidf_test = transform_data(tfidf, test)

In [173]:
y_train = train['labels']
y_test = test['labels']

In [174]:
y_test.value_counts()

0    3764
1    3736
Name: labels, dtype: int64

### Model trainning

#### 1. model fitting - logistic regression

In [175]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [176]:
from sklearn.linear_model import LogisticRegression

In [177]:
lr = LogisticRegression(C=10000, penalty='l2', multi_class='ovr')
lr.fit(X_tfidf_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=10000, multi_class='ovr')

In [178]:
print(classification_report(y_test, lr.predict(X_tfidf_test)))

              precision    recall  f1-score   support

           0       0.87      0.85      0.86      3764
           1       0.86      0.87      0.86      3736

    accuracy                           0.86      7500
   macro avg       0.86      0.86      0.86      7500
weighted avg       0.86      0.86      0.86      7500



In [179]:
print('train:',lr.score(X_tfidf_train,y_train), 'test:', round(lr.score(X_tfidf_test, y_test),3))

train: 1.0 test: 0.862


#### 2. model fitting - SVM

In [31]:
from sklearn import svm

In [148]:
svm_model = svm.SVC(kernel = 'rbf')
svm_model.fit(X_tfidf_train, y_train)

SVC()

In [149]:
print(classification_report(y_test, svm_model.predict(X_tfidf_test)))

              precision    recall  f1-score   support

           0       0.94      0.83      0.88       116
           1       0.86      0.95      0.90       124

    accuracy                           0.89       240
   macro avg       0.90      0.89      0.89       240
weighted avg       0.90      0.89      0.89       240



In [150]:
print('train:',svm_model.score(X_tfidf_train,y_train), 'test:', round(svm_model.score(X_tfidf_test, y_test),3))

train: 1.0 test: 0.892


#### 3. model fitting -  Randomforest

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [180]:
params_to_test = {
    'n_estimators':[200,250,300],
    'max_depth':[7,9,10]
}

In [181]:
rf=RandomForestClassifier(random_state=1)

In [None]:
# cv forto get the best paramters
grid_search = GridSearchCV(rf, param_grid=params_to_test, scoring='accuracy', n_jobs=4)

grid_search.fit(X_tfidf_train, y_train)

best_params = grid_search.best_params_

In [193]:
best_params

{'max_depth': 9, 'n_estimators': 300}

In [194]:
rf=RandomForestClassifier(**best_params)
rf.fit(X_tfidf_train, y_train)

RandomForestClassifier(max_depth=9, n_estimators=300)

In [195]:
print(classification_report(y_test, rf.predict(X_tfidf_test)))

              precision    recall  f1-score   support

           0       0.88      0.77      0.82      3764
           1       0.80      0.89      0.84      3736

    accuracy                           0.83      7500
   macro avg       0.84      0.83      0.83      7500
weighted avg       0.84      0.83      0.83      7500



In [196]:
print('train:',round(rf.score(X_tfidf_train,y_train),3), 'test:', round(rf.score(X_tfidf_test, y_test),3))

train: 0.875 test: 0.832


#### 4. model fitting - Naive Bayes

In [197]:
from sklearn.naive_bayes import MultinomialNB

In [204]:
nb = MultinomialNB()
nb.fit(X_tfidf_train, y_train)

MultinomialNB()

In [205]:
print(classification_report(y_test, nb.predict(X_tfidf_test)))

              precision    recall  f1-score   support

           0       0.86      0.87      0.86      3764
           1       0.86      0.86      0.86      3736

    accuracy                           0.86      7500
   macro avg       0.86      0.86      0.86      7500
weighted avg       0.86      0.86      0.86      7500



In [206]:
print('train:',round(nb.score(X_tfidf_train,y_train),3), 'test:', round(nb.score(X_tfidf_test, y_test),3))

train: 0.905 test: 0.86
