## TFIDF and text classification for Standford Movie data

### Outline
 - Compute TFIDF score for each reviews
 
     1. Load data
     2. Tokenize text
     3. Fit to compute TFIDF
     
 
 - calssify the review(Positive/negative) by TFIDF score
     1. logistic regression
     2. SVM (slow for large dataset)
     3. RandomForest
     4. xgboosting
 
 About the dataset: positive - grade >= 7.0; negative - grade <= 4.0;

In [1]:
from pathlib import Path
import re

from collections import Counter
import numpy as np
import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string
from sklearn.feature_extraction.text import TfidfVectorizer

### Compute TFIDF

#### 1. Load data

In [2]:
def  getfilelist(root):
    """Return a fully-qualified list of filenames under root directory"""
    path = Path(root)
    textfiles = path.glob('**/*.txt')
    return [str(line) for line in textfiles]

In [3]:
def gettext(filename):
    """Return a string text from given txt file"""
    with open(filename) as f:
        text = f.read().replace("<br />", " ")
    return text

In [4]:
def gettexttodf(rootpath):
    """Extract text from all the txt files under given directory
       and return a dataframe with string reviews and labels"""
    filename_list = getfilelist(rootpath)
    # identify data from positive or negative dataset
    if "neg" in rootpath:
        label = np.zeros(len(filename_list), dtype=int)
    else:
        label = np.ones(len(filename_list), dtype=int)
        
    corpus_dict = {'labels':label}
    text_list = []
    # read text from txt list
    for filename in filename_list:
        text_list.append(gettext(filename))
    corpus_dict['reviews'] = text_list
    # convert to dataframe['review', 'labels']    
    train_data = pd.DataFrame.from_dict(corpus_dict)
    
    return train_data

In [5]:
neg_path = "aclImdb/small_data/neg/"
pos_path = "aclImdb/small_data/pos/"
# create dataframe from labeled txt
neg = gettexttodf(neg_path)
pos = gettexttodf(pos_path)

In [6]:
data = pd.concat([neg, pos],ignore_index=True)

In [7]:
data.labels.value_counts()

1    400
0    400
Name: labels, dtype: int64

#### 2. Tokenize text

In [8]:
def tokenize(text):
    """
    Tokenize text and return a non-unique list of tokenized words
    found in the text. Normalize to lowercase, strip punctuation,
    remove stop words, drop words of length < 3, strip digits.
    """
    text = text.lower()
    text = re.sub('[' + string.punctuation + '0-9\\r\\t\\n]', ' ', text)
    tokens = nltk.word_tokenize(text)
    tokens = [w for w in tokens if (len(w) > 2) and (w not in ENGLISH_STOP_WORDS)]  # ignore a, an, to, at, be, ...
    return tokens

In [9]:
def normalizewords(words):
    """
    Given a list of tokens/words, return a new list of normalize words
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    normal = []
    for word, tag in nltk.pos_tag(words):
        wtag = tag[0].lower()
        wtag = wtag if wtag in ['a', 'r', 'n', 'v'] else None
        lemma = lemmatizer.lemmatize(word, wtag) if wtag else word
        normal.append(lemma)
    return ' '.join(normal)

In [10]:
# update df wtth normalized and tokenized words string
for i in range(len(data)):
    data.loc[i,'reviews'] = normalizewords(tokenize(data.loc[i,'reviews']))

#### 3. TFIDF-vectorize

In [11]:
tfidf = TfidfVectorizer(analyzer = "word", max_features=20000)
tfidf.fit(data["reviews"])

def transform_data(tfidf, dataset):
    features = tfidf.transform(dataset["reviews"])
    return pd.DataFrame(features.todense(), columns = tfidf.get_feature_names())

In [12]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.3)

In [13]:
X_tfidf_train = transform_data(tfidf, train)
X_tfidf_test = transform_data(tfidf, test)

In [14]:
y_train = train['labels']
y_test = test['labels']

In [15]:
y_test.value_counts()

0    121
1    119
Name: labels, dtype: int64

### Model trainning

#### 1. model fitting - logistic regression

In [16]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
lr = LogisticRegression(C=10000, penalty='l2', multi_class='ovr')
lr.fit(X_tfidf_train, y_train)

LogisticRegression(C=10000, multi_class='ovr')

In [19]:
print(classification_report(y_test, lr.predict(X_tfidf_test)))

              precision    recall  f1-score   support

           0       0.95      0.86      0.90       121
           1       0.87      0.96      0.91       119

    accuracy                           0.91       240
   macro avg       0.91      0.91      0.91       240
weighted avg       0.91      0.91      0.91       240



In [20]:
print('train:',lr.score(X_tfidf_train,y_train), 'test:', round(lr.score(X_tfidf_test, y_test),3))

train: 1.0 test: 0.908


#### 2. model fitting - SVM

In [21]:
from sklearn import svm

In [22]:
svm_model = svm.SVC(kernel = 'rbf')
svm_model.fit(X_tfidf_train, y_train)

SVC()

In [23]:
print(classification_report(y_test, svm_model.predict(X_tfidf_test)))

              precision    recall  f1-score   support

           0       0.91      0.79      0.85       121
           1       0.81      0.92      0.87       119

    accuracy                           0.86       240
   macro avg       0.86      0.86      0.86       240
weighted avg       0.86      0.86      0.86       240



In [24]:
print('train:',svm_model.score(X_tfidf_train,y_train), 'test:', round(svm_model.score(X_tfidf_test, y_test),3))

train: 1.0 test: 0.858


#### 3. model fitting -  Randomforest

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [26]:
params_to_test = {
    'n_estimators':[100,150,200,250],
    'max_depth':[5,6,7,9]
}

In [27]:
rf=RandomForestClassifier(random_state=1)

In [28]:
# cv forto get the best paramters
grid_search = GridSearchCV(rf, param_grid=params_to_test, cv=10, scoring='accuracy', n_jobs=4)

grid_search.fit(X_tfidf_train, y_train)

best_params = grid_search.best_params_

In [29]:
best_params

{'max_depth': 7, 'n_estimators': 100}

In [37]:
rf=RandomForestClassifier(**best_params)
rf.fit(X_tfidf_train, y_train)

RandomForestClassifier(max_depth=7)

In [38]:
print(classification_report(y_test, rf.predict(X_tfidf_test)))

              precision    recall  f1-score   support

           0       0.85      0.76      0.80       121
           1       0.78      0.87      0.82       119

    accuracy                           0.81       240
   macro avg       0.82      0.81      0.81       240
weighted avg       0.82      0.81      0.81       240



In [39]:
print('train:',round(rf.score(X_tfidf_train,y_train),3), 'test:', round(rf.score(X_tfidf_test, y_test),3))

train: 0.964 test: 0.812


#### 4. model fitting - Naive Bayes

In [33]:
from sklearn.naive_bayes import MultinomialNB

In [34]:
nb = MultinomialNB()
nb.fit(X_tfidf_train, y_train)

MultinomialNB()

In [35]:
print(classification_report(y_test, nb.predict(X_tfidf_test)))

              precision    recall  f1-score   support

           0       0.90      0.94      0.92       121
           1       0.94      0.90      0.92       119

    accuracy                           0.92       240
   macro avg       0.92      0.92      0.92       240
weighted avg       0.92      0.92      0.92       240



In [36]:
print('train:',round(nb.score(X_tfidf_train,y_train),3), 'test:', round(nb.score(X_tfidf_test, y_test),3))

train: 0.989 test: 0.921
